|
{ |
|
"best_metric": 0.08026164770126343, |
|
"best_model_checkpoint": "./vit-base-cifar10/checkpoint-4300", |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 4376, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004570383912248629, |
|
"grad_norm": 7.92866325378418, |
|
"learning_rate": 0.00019954296160877515, |
|
"loss": 0.2503, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009140767824497258, |
|
"grad_norm": 0.6853145956993103, |
|
"learning_rate": 0.00019908592321755028, |
|
"loss": 0.2887, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013711151736745886, |
|
"grad_norm": 8.078825950622559, |
|
"learning_rate": 0.00019862888482632542, |
|
"loss": 0.3522, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018281535648994516, |
|
"grad_norm": 0.5457130670547485, |
|
"learning_rate": 0.00019817184643510056, |
|
"loss": 0.2236, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.022851919561243144, |
|
"grad_norm": 7.070144176483154, |
|
"learning_rate": 0.0001977148080438757, |
|
"loss": 0.3022, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.027422303473491772, |
|
"grad_norm": 6.712225437164307, |
|
"learning_rate": 0.00019725776965265083, |
|
"loss": 0.2947, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.031992687385740404, |
|
"grad_norm": 0.18951156735420227, |
|
"learning_rate": 0.00019680073126142596, |
|
"loss": 0.2885, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03656307129798903, |
|
"grad_norm": 5.366819858551025, |
|
"learning_rate": 0.0001963436928702011, |
|
"loss": 0.2822, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04113345521023766, |
|
"grad_norm": 1.0978411436080933, |
|
"learning_rate": 0.00019588665447897624, |
|
"loss": 0.2124, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04570383912248629, |
|
"grad_norm": 0.3122899532318115, |
|
"learning_rate": 0.0001954296160877514, |
|
"loss": 0.1043, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04570383912248629, |
|
"eval_accuracy": 0.919, |
|
"eval_loss": 0.285547137260437, |
|
"eval_runtime": 71.7238, |
|
"eval_samples_per_second": 209.136, |
|
"eval_steps_per_second": 13.078, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.050274223034734916, |
|
"grad_norm": 0.872364342212677, |
|
"learning_rate": 0.00019497257769652654, |
|
"loss": 0.1525, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.054844606946983544, |
|
"grad_norm": 0.4638870358467102, |
|
"learning_rate": 0.00019451553930530167, |
|
"loss": 0.269, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05941499085923217, |
|
"grad_norm": 4.21981143951416, |
|
"learning_rate": 0.0001940585009140768, |
|
"loss": 0.3743, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06398537477148081, |
|
"grad_norm": 0.4475237727165222, |
|
"learning_rate": 0.00019360146252285195, |
|
"loss": 0.2608, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06855575868372943, |
|
"grad_norm": 6.892650127410889, |
|
"learning_rate": 0.00019314442413162706, |
|
"loss": 0.271, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07312614259597806, |
|
"grad_norm": 5.913930416107178, |
|
"learning_rate": 0.0001926873857404022, |
|
"loss": 0.1474, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07769652650822668, |
|
"grad_norm": 2.620058298110962, |
|
"learning_rate": 0.00019223034734917733, |
|
"loss": 0.1543, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08226691042047532, |
|
"grad_norm": 8.168417930603027, |
|
"learning_rate": 0.00019177330895795246, |
|
"loss": 0.4235, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08683729433272395, |
|
"grad_norm": 6.655405044555664, |
|
"learning_rate": 0.0001913162705667276, |
|
"loss": 0.2081, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09140767824497258, |
|
"grad_norm": 0.3367229104042053, |
|
"learning_rate": 0.00019085923217550274, |
|
"loss": 0.2671, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09140767824497258, |
|
"eval_accuracy": 0.9015333333333333, |
|
"eval_loss": 0.3649848401546478, |
|
"eval_runtime": 70.014, |
|
"eval_samples_per_second": 214.243, |
|
"eval_steps_per_second": 13.397, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09597806215722121, |
|
"grad_norm": 7.15602970123291, |
|
"learning_rate": 0.00019040219378427787, |
|
"loss": 0.5093, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10054844606946983, |
|
"grad_norm": 3.229771137237549, |
|
"learning_rate": 0.000189945155393053, |
|
"loss": 0.4777, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10511882998171847, |
|
"grad_norm": 7.438943862915039, |
|
"learning_rate": 0.00018948811700182815, |
|
"loss": 0.407, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10968921389396709, |
|
"grad_norm": 8.4913969039917, |
|
"learning_rate": 0.00018903107861060328, |
|
"loss": 0.338, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11425959780621572, |
|
"grad_norm": 1.3278025388717651, |
|
"learning_rate": 0.00018857404021937845, |
|
"loss": 0.3518, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11882998171846434, |
|
"grad_norm": 9.502877235412598, |
|
"learning_rate": 0.00018811700182815358, |
|
"loss": 0.4069, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12340036563071298, |
|
"grad_norm": 3.407360553741455, |
|
"learning_rate": 0.00018765996343692872, |
|
"loss": 0.2093, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12797074954296161, |
|
"grad_norm": 9.035057067871094, |
|
"learning_rate": 0.00018720292504570386, |
|
"loss": 0.3598, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13254113345521024, |
|
"grad_norm": 0.40073174238204956, |
|
"learning_rate": 0.000186745886654479, |
|
"loss": 0.2389, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13711151736745886, |
|
"grad_norm": 0.32648196816444397, |
|
"learning_rate": 0.00018628884826325413, |
|
"loss": 0.2935, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13711151736745886, |
|
"eval_accuracy": 0.9066666666666666, |
|
"eval_loss": 0.31670910120010376, |
|
"eval_runtime": 69.8802, |
|
"eval_samples_per_second": 214.653, |
|
"eval_steps_per_second": 13.423, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1416819012797075, |
|
"grad_norm": 5.271216869354248, |
|
"learning_rate": 0.00018583180987202927, |
|
"loss": 0.3934, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.14625228519195613, |
|
"grad_norm": 6.032111167907715, |
|
"learning_rate": 0.0001853747714808044, |
|
"loss": 0.2739, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15082266910420475, |
|
"grad_norm": 1.9436883926391602, |
|
"learning_rate": 0.00018491773308957954, |
|
"loss": 0.2362, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15539305301645337, |
|
"grad_norm": 7.990470886230469, |
|
"learning_rate": 0.00018446069469835467, |
|
"loss": 0.422, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.15996343692870202, |
|
"grad_norm": 3.6017348766326904, |
|
"learning_rate": 0.0001840036563071298, |
|
"loss": 0.3151, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16453382084095064, |
|
"grad_norm": 4.97476053237915, |
|
"learning_rate": 0.00018354661791590495, |
|
"loss": 0.486, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.16910420475319926, |
|
"grad_norm": 5.418762683868408, |
|
"learning_rate": 0.00018308957952468008, |
|
"loss": 0.5847, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1736745886654479, |
|
"grad_norm": 2.143413782119751, |
|
"learning_rate": 0.00018263254113345522, |
|
"loss": 0.3188, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.17824497257769653, |
|
"grad_norm": 5.15682315826416, |
|
"learning_rate": 0.00018217550274223036, |
|
"loss": 0.27, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18281535648994515, |
|
"grad_norm": 4.638512134552002, |
|
"learning_rate": 0.0001817184643510055, |
|
"loss": 0.27, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18281535648994515, |
|
"eval_accuracy": 0.8922, |
|
"eval_loss": 0.35180962085723877, |
|
"eval_runtime": 69.6193, |
|
"eval_samples_per_second": 215.458, |
|
"eval_steps_per_second": 13.473, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18738574040219377, |
|
"grad_norm": 7.933419704437256, |
|
"learning_rate": 0.00018126142595978063, |
|
"loss": 0.2467, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19195612431444242, |
|
"grad_norm": 2.749178886413574, |
|
"learning_rate": 0.00018080438756855577, |
|
"loss": 0.283, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19652650822669104, |
|
"grad_norm": 4.662679672241211, |
|
"learning_rate": 0.0001803473491773309, |
|
"loss": 0.3666, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.20109689213893966, |
|
"grad_norm": 8.538412094116211, |
|
"learning_rate": 0.00017989031078610604, |
|
"loss": 0.3315, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2056672760511883, |
|
"grad_norm": 7.226283550262451, |
|
"learning_rate": 0.00017943327239488118, |
|
"loss": 0.2998, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21023765996343693, |
|
"grad_norm": 7.400086402893066, |
|
"learning_rate": 0.0001789762340036563, |
|
"loss": 0.3298, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21480804387568556, |
|
"grad_norm": 2.4320435523986816, |
|
"learning_rate": 0.00017851919561243145, |
|
"loss": 0.3493, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.21937842778793418, |
|
"grad_norm": 2.891914129257202, |
|
"learning_rate": 0.00017806215722120658, |
|
"loss": 0.2929, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22394881170018283, |
|
"grad_norm": 4.258346080780029, |
|
"learning_rate": 0.00017760511882998172, |
|
"loss": 0.3847, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.22851919561243145, |
|
"grad_norm": 3.8576903343200684, |
|
"learning_rate": 0.00017714808043875686, |
|
"loss": 0.3634, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22851919561243145, |
|
"eval_accuracy": 0.8953333333333333, |
|
"eval_loss": 0.36601880192756653, |
|
"eval_runtime": 70.2316, |
|
"eval_samples_per_second": 213.579, |
|
"eval_steps_per_second": 13.356, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23308957952468007, |
|
"grad_norm": 6.1643853187561035, |
|
"learning_rate": 0.000176691042047532, |
|
"loss": 0.3764, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2376599634369287, |
|
"grad_norm": 0.7884778380393982, |
|
"learning_rate": 0.00017623400365630713, |
|
"loss": 0.2016, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.24223034734917734, |
|
"grad_norm": 3.4569339752197266, |
|
"learning_rate": 0.00017577696526508227, |
|
"loss": 0.3292, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.24680073126142596, |
|
"grad_norm": 0.9117717146873474, |
|
"learning_rate": 0.0001753199268738574, |
|
"loss": 0.1951, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2513711151736746, |
|
"grad_norm": 11.449209213256836, |
|
"learning_rate": 0.00017486288848263254, |
|
"loss": 0.3511, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.25594149908592323, |
|
"grad_norm": 3.824899435043335, |
|
"learning_rate": 0.00017440585009140768, |
|
"loss": 0.2262, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.26051188299817185, |
|
"grad_norm": 7.360519886016846, |
|
"learning_rate": 0.0001739488117001828, |
|
"loss": 0.2491, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.26508226691042047, |
|
"grad_norm": 6.038581848144531, |
|
"learning_rate": 0.00017349177330895795, |
|
"loss": 0.2548, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2696526508226691, |
|
"grad_norm": 3.3812975883483887, |
|
"learning_rate": 0.0001730347349177331, |
|
"loss": 0.4146, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2742230347349177, |
|
"grad_norm": 0.8644607067108154, |
|
"learning_rate": 0.00017257769652650825, |
|
"loss": 0.2559, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2742230347349177, |
|
"eval_accuracy": 0.8901333333333333, |
|
"eval_loss": 0.3964242935180664, |
|
"eval_runtime": 70.6896, |
|
"eval_samples_per_second": 212.195, |
|
"eval_steps_per_second": 13.269, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.27879341864716634, |
|
"grad_norm": 0.19820252060890198, |
|
"learning_rate": 0.00017212065813528338, |
|
"loss": 0.2866, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.283363802559415, |
|
"grad_norm": 4.149306774139404, |
|
"learning_rate": 0.00017166361974405852, |
|
"loss": 0.2711, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.28793418647166363, |
|
"grad_norm": 0.7687764167785645, |
|
"learning_rate": 0.00017120658135283366, |
|
"loss": 0.2035, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.29250457038391225, |
|
"grad_norm": 3.1452839374542236, |
|
"learning_rate": 0.0001707495429616088, |
|
"loss": 0.3511, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2970749542961609, |
|
"grad_norm": 4.324541091918945, |
|
"learning_rate": 0.00017029250457038393, |
|
"loss": 0.2869, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3016453382084095, |
|
"grad_norm": 0.06479712575674057, |
|
"learning_rate": 0.00016983546617915907, |
|
"loss": 0.1225, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3062157221206581, |
|
"grad_norm": 1.430450677871704, |
|
"learning_rate": 0.0001693784277879342, |
|
"loss": 0.2164, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.31078610603290674, |
|
"grad_norm": 3.9292774200439453, |
|
"learning_rate": 0.00016892138939670934, |
|
"loss": 0.3039, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3153564899451554, |
|
"grad_norm": 5.680319309234619, |
|
"learning_rate": 0.00016846435100548448, |
|
"loss": 0.2702, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.31992687385740404, |
|
"grad_norm": 0.42744606733322144, |
|
"learning_rate": 0.0001680073126142596, |
|
"loss": 0.197, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.31992687385740404, |
|
"eval_accuracy": 0.9252666666666667, |
|
"eval_loss": 0.24806976318359375, |
|
"eval_runtime": 70.2676, |
|
"eval_samples_per_second": 213.47, |
|
"eval_steps_per_second": 13.349, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32449725776965266, |
|
"grad_norm": 5.1783447265625, |
|
"learning_rate": 0.00016755027422303475, |
|
"loss": 0.4019, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3290676416819013, |
|
"grad_norm": 4.345694541931152, |
|
"learning_rate": 0.00016709323583180986, |
|
"loss": 0.191, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3336380255941499, |
|
"grad_norm": 5.100950241088867, |
|
"learning_rate": 0.000166636197440585, |
|
"loss": 0.2976, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3382084095063985, |
|
"grad_norm": 6.052485942840576, |
|
"learning_rate": 0.00016617915904936016, |
|
"loss": 0.3266, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.34277879341864714, |
|
"grad_norm": 2.7631659507751465, |
|
"learning_rate": 0.0001657221206581353, |
|
"loss": 0.3042, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3473491773308958, |
|
"grad_norm": 3.2634143829345703, |
|
"learning_rate": 0.00016526508226691043, |
|
"loss": 0.1501, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.35191956124314444, |
|
"grad_norm": 5.064187049865723, |
|
"learning_rate": 0.00016480804387568557, |
|
"loss": 0.3669, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.35648994515539306, |
|
"grad_norm": 6.092987060546875, |
|
"learning_rate": 0.0001643510054844607, |
|
"loss": 0.2448, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3610603290676417, |
|
"grad_norm": 4.66456413269043, |
|
"learning_rate": 0.00016389396709323584, |
|
"loss": 0.3027, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3656307129798903, |
|
"grad_norm": 6.245533466339111, |
|
"learning_rate": 0.00016343692870201098, |
|
"loss": 0.2594, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3656307129798903, |
|
"eval_accuracy": 0.923, |
|
"eval_loss": 0.24855366349220276, |
|
"eval_runtime": 70.4374, |
|
"eval_samples_per_second": 212.955, |
|
"eval_steps_per_second": 13.317, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3702010968921389, |
|
"grad_norm": 0.3337598741054535, |
|
"learning_rate": 0.0001629798903107861, |
|
"loss": 0.2141, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.37477148080438755, |
|
"grad_norm": 3.9450998306274414, |
|
"learning_rate": 0.00016252285191956125, |
|
"loss": 0.2284, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3793418647166362, |
|
"grad_norm": 4.09628438949585, |
|
"learning_rate": 0.00016206581352833639, |
|
"loss": 0.2501, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.38391224862888484, |
|
"grad_norm": 3.9277567863464355, |
|
"learning_rate": 0.00016160877513711152, |
|
"loss": 0.1436, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.38848263254113347, |
|
"grad_norm": 1.291695237159729, |
|
"learning_rate": 0.00016115173674588666, |
|
"loss": 0.3047, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3930530164533821, |
|
"grad_norm": 3.195793867111206, |
|
"learning_rate": 0.0001606946983546618, |
|
"loss": 0.2684, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3976234003656307, |
|
"grad_norm": 9.900090217590332, |
|
"learning_rate": 0.00016023765996343693, |
|
"loss": 0.2006, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.40219378427787933, |
|
"grad_norm": 7.173875331878662, |
|
"learning_rate": 0.00015978062157221207, |
|
"loss": 0.3154, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.40676416819012795, |
|
"grad_norm": 6.029903411865234, |
|
"learning_rate": 0.0001593235831809872, |
|
"loss": 0.3175, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.4113345521023766, |
|
"grad_norm": 6.628474235534668, |
|
"learning_rate": 0.00015886654478976234, |
|
"loss": 0.4545, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4113345521023766, |
|
"eval_accuracy": 0.9, |
|
"eval_loss": 0.327102929353714, |
|
"eval_runtime": 70.3538, |
|
"eval_samples_per_second": 213.208, |
|
"eval_steps_per_second": 13.333, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.41590493601462525, |
|
"grad_norm": 4.372049808502197, |
|
"learning_rate": 0.00015840950639853748, |
|
"loss": 0.2577, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.42047531992687387, |
|
"grad_norm": 5.5920562744140625, |
|
"learning_rate": 0.0001579524680073126, |
|
"loss": 0.2078, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4250457038391225, |
|
"grad_norm": 0.5012129545211792, |
|
"learning_rate": 0.00015749542961608778, |
|
"loss": 0.4423, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4296160877513711, |
|
"grad_norm": 4.091048717498779, |
|
"learning_rate": 0.0001570383912248629, |
|
"loss": 0.2831, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.43418647166361973, |
|
"grad_norm": 3.677157163619995, |
|
"learning_rate": 0.00015658135283363805, |
|
"loss": 0.3525, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.43875685557586835, |
|
"grad_norm": 2.8373067378997803, |
|
"learning_rate": 0.00015612431444241319, |
|
"loss": 0.1623, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.443327239488117, |
|
"grad_norm": 7.7640156745910645, |
|
"learning_rate": 0.00015566727605118832, |
|
"loss": 0.2469, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.44789762340036565, |
|
"grad_norm": 0.34190094470977783, |
|
"learning_rate": 0.00015521023765996346, |
|
"loss": 0.2173, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4524680073126143, |
|
"grad_norm": 0.42838719487190247, |
|
"learning_rate": 0.0001547531992687386, |
|
"loss": 0.1671, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4570383912248629, |
|
"grad_norm": 1.7106258869171143, |
|
"learning_rate": 0.00015429616087751373, |
|
"loss": 0.1243, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4570383912248629, |
|
"eval_accuracy": 0.9268666666666666, |
|
"eval_loss": 0.24482281506061554, |
|
"eval_runtime": 70.0663, |
|
"eval_samples_per_second": 214.083, |
|
"eval_steps_per_second": 13.387, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4616087751371115, |
|
"grad_norm": 4.664043426513672, |
|
"learning_rate": 0.00015383912248628884, |
|
"loss": 0.2915, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.46617915904936014, |
|
"grad_norm": 6.955049991607666, |
|
"learning_rate": 0.00015338208409506398, |
|
"loss": 0.2288, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.47074954296160876, |
|
"grad_norm": 6.842510223388672, |
|
"learning_rate": 0.00015292504570383911, |
|
"loss": 0.4217, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.4753199268738574, |
|
"grad_norm": 2.6210012435913086, |
|
"learning_rate": 0.00015246800731261425, |
|
"loss": 0.3455, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.47989031078610606, |
|
"grad_norm": 0.2048071324825287, |
|
"learning_rate": 0.0001520109689213894, |
|
"loss": 0.1599, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4844606946983547, |
|
"grad_norm": 2.204970121383667, |
|
"learning_rate": 0.00015155393053016452, |
|
"loss": 0.2539, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4890310786106033, |
|
"grad_norm": 4.208988189697266, |
|
"learning_rate": 0.00015109689213893966, |
|
"loss": 0.1719, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.4936014625228519, |
|
"grad_norm": 1.6201022863388062, |
|
"learning_rate": 0.00015063985374771482, |
|
"loss": 0.103, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.49817184643510054, |
|
"grad_norm": 2.211272954940796, |
|
"learning_rate": 0.00015018281535648996, |
|
"loss": 0.1956, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5027422303473492, |
|
"grad_norm": 7.531051158905029, |
|
"learning_rate": 0.0001497257769652651, |
|
"loss": 0.3593, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5027422303473492, |
|
"eval_accuracy": 0.9354, |
|
"eval_loss": 0.2118164449930191, |
|
"eval_runtime": 70.235, |
|
"eval_samples_per_second": 213.569, |
|
"eval_steps_per_second": 13.355, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5073126142595978, |
|
"grad_norm": 3.182407855987549, |
|
"learning_rate": 0.00014926873857404023, |
|
"loss": 0.3023, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5118829981718465, |
|
"grad_norm": 3.975743293762207, |
|
"learning_rate": 0.00014881170018281537, |
|
"loss": 0.2358, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5164533820840951, |
|
"grad_norm": 4.823325157165527, |
|
"learning_rate": 0.0001483546617915905, |
|
"loss": 0.2031, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5210237659963437, |
|
"grad_norm": 0.8731510043144226, |
|
"learning_rate": 0.00014789762340036564, |
|
"loss": 0.2254, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5255941499085923, |
|
"grad_norm": 1.2150533199310303, |
|
"learning_rate": 0.00014744058500914078, |
|
"loss": 0.312, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5301645338208409, |
|
"grad_norm": 1.9706271886825562, |
|
"learning_rate": 0.00014698354661791591, |
|
"loss": 0.1619, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5347349177330896, |
|
"grad_norm": 0.36833953857421875, |
|
"learning_rate": 0.00014652650822669105, |
|
"loss": 0.1976, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5393053016453382, |
|
"grad_norm": 3.9230244159698486, |
|
"learning_rate": 0.0001460694698354662, |
|
"loss": 0.2928, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5438756855575868, |
|
"grad_norm": 4.378619194030762, |
|
"learning_rate": 0.00014561243144424132, |
|
"loss": 0.2008, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5484460694698354, |
|
"grad_norm": 0.8093172907829285, |
|
"learning_rate": 0.00014515539305301646, |
|
"loss": 0.1375, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5484460694698354, |
|
"eval_accuracy": 0.9348666666666666, |
|
"eval_loss": 0.22052045166492462, |
|
"eval_runtime": 70.7854, |
|
"eval_samples_per_second": 211.908, |
|
"eval_steps_per_second": 13.251, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.553016453382084, |
|
"grad_norm": 8.147080421447754, |
|
"learning_rate": 0.0001446983546617916, |
|
"loss": 0.1968, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5575868372943327, |
|
"grad_norm": 4.382139682769775, |
|
"learning_rate": 0.00014424131627056673, |
|
"loss": 0.2315, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5621572212065814, |
|
"grad_norm": 2.122556209564209, |
|
"learning_rate": 0.00014378427787934187, |
|
"loss": 0.1646, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.56672760511883, |
|
"grad_norm": 4.841265678405762, |
|
"learning_rate": 0.000143327239488117, |
|
"loss": 0.2223, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5712979890310786, |
|
"grad_norm": 0.27766382694244385, |
|
"learning_rate": 0.00014287020109689214, |
|
"loss": 0.2387, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5758683729433273, |
|
"grad_norm": 1.7641736268997192, |
|
"learning_rate": 0.00014241316270566728, |
|
"loss": 0.1726, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5804387568555759, |
|
"grad_norm": 6.29493522644043, |
|
"learning_rate": 0.00014195612431444244, |
|
"loss": 0.4514, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.5850091407678245, |
|
"grad_norm": 0.2854464054107666, |
|
"learning_rate": 0.00014149908592321758, |
|
"loss": 0.1004, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5895795246800731, |
|
"grad_norm": 5.928961753845215, |
|
"learning_rate": 0.00014104204753199271, |
|
"loss": 0.2575, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5941499085923218, |
|
"grad_norm": 0.3155369758605957, |
|
"learning_rate": 0.00014058500914076782, |
|
"loss": 0.1521, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5941499085923218, |
|
"eval_accuracy": 0.9376, |
|
"eval_loss": 0.200880229473114, |
|
"eval_runtime": 70.0018, |
|
"eval_samples_per_second": 214.28, |
|
"eval_steps_per_second": 13.4, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5987202925045704, |
|
"grad_norm": 5.347257614135742, |
|
"learning_rate": 0.00014012797074954296, |
|
"loss": 0.178, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.603290676416819, |
|
"grad_norm": 8.616045951843262, |
|
"learning_rate": 0.0001396709323583181, |
|
"loss": 0.2389, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6078610603290676, |
|
"grad_norm": 3.013582229614258, |
|
"learning_rate": 0.00013921389396709323, |
|
"loss": 0.1769, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6124314442413162, |
|
"grad_norm": 5.738295078277588, |
|
"learning_rate": 0.00013875685557586837, |
|
"loss": 0.1847, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6170018281535649, |
|
"grad_norm": 0.47330769896507263, |
|
"learning_rate": 0.0001382998171846435, |
|
"loss": 0.1451, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6215722120658135, |
|
"grad_norm": 4.576214790344238, |
|
"learning_rate": 0.00013784277879341864, |
|
"loss": 0.2656, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6261425959780622, |
|
"grad_norm": 2.6387856006622314, |
|
"learning_rate": 0.00013738574040219378, |
|
"loss": 0.1181, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6307129798903108, |
|
"grad_norm": 0.5952144265174866, |
|
"learning_rate": 0.00013692870201096892, |
|
"loss": 0.1286, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6352833638025595, |
|
"grad_norm": 5.721958160400391, |
|
"learning_rate": 0.00013647166361974405, |
|
"loss": 0.2129, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6398537477148081, |
|
"grad_norm": 1.397820234298706, |
|
"learning_rate": 0.0001360146252285192, |
|
"loss": 0.1237, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6398537477148081, |
|
"eval_accuracy": 0.9444666666666667, |
|
"eval_loss": 0.18025201559066772, |
|
"eval_runtime": 69.8721, |
|
"eval_samples_per_second": 214.678, |
|
"eval_steps_per_second": 13.425, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6444241316270567, |
|
"grad_norm": 2.166053533554077, |
|
"learning_rate": 0.00013555758683729432, |
|
"loss": 0.2256, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6489945155393053, |
|
"grad_norm": 1.5744013786315918, |
|
"learning_rate": 0.0001351005484460695, |
|
"loss": 0.085, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6535648994515539, |
|
"grad_norm": 5.0064167976379395, |
|
"learning_rate": 0.00013464351005484462, |
|
"loss": 0.1947, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6581352833638026, |
|
"grad_norm": 6.537513732910156, |
|
"learning_rate": 0.00013418647166361976, |
|
"loss": 0.095, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6627056672760512, |
|
"grad_norm": 10.165826797485352, |
|
"learning_rate": 0.0001337294332723949, |
|
"loss": 0.2335, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6672760511882998, |
|
"grad_norm": 2.5049922466278076, |
|
"learning_rate": 0.00013327239488117003, |
|
"loss": 0.1958, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6718464351005484, |
|
"grad_norm": 6.945699214935303, |
|
"learning_rate": 0.00013281535648994517, |
|
"loss": 0.141, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.676416819012797, |
|
"grad_norm": 0.31921499967575073, |
|
"learning_rate": 0.0001323583180987203, |
|
"loss": 0.1275, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6809872029250457, |
|
"grad_norm": 6.356455326080322, |
|
"learning_rate": 0.00013190127970749544, |
|
"loss": 0.2715, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6855575868372943, |
|
"grad_norm": 0.11626709252595901, |
|
"learning_rate": 0.00013144424131627058, |
|
"loss": 0.2214, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6855575868372943, |
|
"eval_accuracy": 0.9394666666666667, |
|
"eval_loss": 0.20262379944324493, |
|
"eval_runtime": 70.7416, |
|
"eval_samples_per_second": 212.039, |
|
"eval_steps_per_second": 13.26, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6901279707495429, |
|
"grad_norm": 10.25921630859375, |
|
"learning_rate": 0.00013098720292504572, |
|
"loss": 0.3417, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6946983546617916, |
|
"grad_norm": 0.09503093361854553, |
|
"learning_rate": 0.00013053016453382085, |
|
"loss": 0.1734, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6992687385740403, |
|
"grad_norm": 4.171358108520508, |
|
"learning_rate": 0.000130073126142596, |
|
"loss": 0.1565, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.7038391224862889, |
|
"grad_norm": 1.6032589673995972, |
|
"learning_rate": 0.00012961608775137112, |
|
"loss": 0.279, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7084095063985375, |
|
"grad_norm": 4.150899410247803, |
|
"learning_rate": 0.00012915904936014626, |
|
"loss": 0.261, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7129798903107861, |
|
"grad_norm": 3.3375895023345947, |
|
"learning_rate": 0.0001287020109689214, |
|
"loss": 0.2531, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7175502742230347, |
|
"grad_norm": 6.167548179626465, |
|
"learning_rate": 0.00012824497257769653, |
|
"loss": 0.2602, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7221206581352834, |
|
"grad_norm": 2.9471616744995117, |
|
"learning_rate": 0.00012778793418647167, |
|
"loss": 0.2944, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.726691042047532, |
|
"grad_norm": 1.4548298120498657, |
|
"learning_rate": 0.0001273308957952468, |
|
"loss": 0.0746, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7312614259597806, |
|
"grad_norm": 6.375178337097168, |
|
"learning_rate": 0.00012687385740402194, |
|
"loss": 0.1324, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7312614259597806, |
|
"eval_accuracy": 0.9493333333333334, |
|
"eval_loss": 0.16354645788669586, |
|
"eval_runtime": 70.3467, |
|
"eval_samples_per_second": 213.23, |
|
"eval_steps_per_second": 13.334, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7358318098720292, |
|
"grad_norm": 6.716129302978516, |
|
"learning_rate": 0.00012641681901279708, |
|
"loss": 0.2021, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.7404021937842779, |
|
"grad_norm": 0.047931790351867676, |
|
"learning_rate": 0.00012595978062157222, |
|
"loss": 0.1456, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.7449725776965265, |
|
"grad_norm": 0.08262607455253601, |
|
"learning_rate": 0.00012550274223034735, |
|
"loss": 0.1167, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.7495429616087751, |
|
"grad_norm": 0.21918249130249023, |
|
"learning_rate": 0.0001250457038391225, |
|
"loss": 0.1558, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.7541133455210237, |
|
"grad_norm": 3.03836989402771, |
|
"learning_rate": 0.00012458866544789763, |
|
"loss": 0.1506, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7586837294332724, |
|
"grad_norm": 9.57582950592041, |
|
"learning_rate": 0.00012413162705667276, |
|
"loss": 0.2193, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7632541133455211, |
|
"grad_norm": 0.2596251964569092, |
|
"learning_rate": 0.0001236745886654479, |
|
"loss": 0.1041, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.7678244972577697, |
|
"grad_norm": 11.114243507385254, |
|
"learning_rate": 0.00012321755027422303, |
|
"loss": 0.1528, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.7723948811700183, |
|
"grad_norm": 5.171649932861328, |
|
"learning_rate": 0.00012276051188299817, |
|
"loss": 0.1856, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7769652650822669, |
|
"grad_norm": 4.471177577972412, |
|
"learning_rate": 0.0001223034734917733, |
|
"loss": 0.1864, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7769652650822669, |
|
"eval_accuracy": 0.9493333333333334, |
|
"eval_loss": 0.16721826791763306, |
|
"eval_runtime": 70.5466, |
|
"eval_samples_per_second": 212.625, |
|
"eval_steps_per_second": 13.296, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7815356489945156, |
|
"grad_norm": 4.236210346221924, |
|
"learning_rate": 0.00012184643510054844, |
|
"loss": 0.1898, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7861060329067642, |
|
"grad_norm": 3.0980677604675293, |
|
"learning_rate": 0.0001213893967093236, |
|
"loss": 0.1925, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7906764168190128, |
|
"grad_norm": 2.8537585735321045, |
|
"learning_rate": 0.00012093235831809873, |
|
"loss": 0.1141, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.7952468007312614, |
|
"grad_norm": 3.942676305770874, |
|
"learning_rate": 0.00012047531992687387, |
|
"loss": 0.2004, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.79981718464351, |
|
"grad_norm": 2.8048367500305176, |
|
"learning_rate": 0.000120018281535649, |
|
"loss": 0.2308, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8043875685557587, |
|
"grad_norm": 2.2184743881225586, |
|
"learning_rate": 0.00011956124314442414, |
|
"loss": 0.1032, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8089579524680073, |
|
"grad_norm": 12.887242317199707, |
|
"learning_rate": 0.00011910420475319928, |
|
"loss": 0.2534, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.8135283363802559, |
|
"grad_norm": 0.6864253878593445, |
|
"learning_rate": 0.00011864716636197441, |
|
"loss": 0.1879, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8180987202925045, |
|
"grad_norm": 3.042908191680908, |
|
"learning_rate": 0.00011819012797074955, |
|
"loss": 0.2072, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8226691042047533, |
|
"grad_norm": 0.42097190022468567, |
|
"learning_rate": 0.00011773308957952469, |
|
"loss": 0.128, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8226691042047533, |
|
"eval_accuracy": 0.9409333333333333, |
|
"eval_loss": 0.2014516144990921, |
|
"eval_runtime": 71.2106, |
|
"eval_samples_per_second": 210.643, |
|
"eval_steps_per_second": 13.172, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8272394881170019, |
|
"grad_norm": 5.429555892944336, |
|
"learning_rate": 0.00011727605118829984, |
|
"loss": 0.099, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8318098720292505, |
|
"grad_norm": 6.33099889755249, |
|
"learning_rate": 0.00011681901279707497, |
|
"loss": 0.1546, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.8363802559414991, |
|
"grad_norm": 5.382350921630859, |
|
"learning_rate": 0.00011636197440585011, |
|
"loss": 0.2645, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.8409506398537477, |
|
"grad_norm": 2.201632022857666, |
|
"learning_rate": 0.00011590493601462524, |
|
"loss": 0.2078, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8455210237659964, |
|
"grad_norm": 7.094603538513184, |
|
"learning_rate": 0.00011544789762340038, |
|
"loss": 0.3097, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.850091407678245, |
|
"grad_norm": 0.3324418067932129, |
|
"learning_rate": 0.00011499085923217552, |
|
"loss": 0.1623, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.8546617915904936, |
|
"grad_norm": 0.12678645551204681, |
|
"learning_rate": 0.00011453382084095065, |
|
"loss": 0.1275, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.8592321755027422, |
|
"grad_norm": 2.2118465900421143, |
|
"learning_rate": 0.00011407678244972578, |
|
"loss": 0.1834, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.8638025594149908, |
|
"grad_norm": 11.703352928161621, |
|
"learning_rate": 0.00011361974405850091, |
|
"loss": 0.165, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.8683729433272395, |
|
"grad_norm": 5.437178611755371, |
|
"learning_rate": 0.00011316270566727605, |
|
"loss": 0.121, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8683729433272395, |
|
"eval_accuracy": 0.9451333333333334, |
|
"eval_loss": 0.17528271675109863, |
|
"eval_runtime": 70.5119, |
|
"eval_samples_per_second": 212.73, |
|
"eval_steps_per_second": 13.303, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8729433272394881, |
|
"grad_norm": 2.402738332748413, |
|
"learning_rate": 0.00011270566727605119, |
|
"loss": 0.1896, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.8775137111517367, |
|
"grad_norm": 0.6686537861824036, |
|
"learning_rate": 0.00011224862888482632, |
|
"loss": 0.1535, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8820840950639853, |
|
"grad_norm": 3.365333080291748, |
|
"learning_rate": 0.00011179159049360146, |
|
"loss": 0.1135, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.886654478976234, |
|
"grad_norm": 3.6913065910339355, |
|
"learning_rate": 0.0001113345521023766, |
|
"loss": 0.1391, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8912248628884827, |
|
"grad_norm": 7.079347133636475, |
|
"learning_rate": 0.00011087751371115173, |
|
"loss": 0.1528, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8957952468007313, |
|
"grad_norm": 2.3773577213287354, |
|
"learning_rate": 0.00011042047531992688, |
|
"loss": 0.1345, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9003656307129799, |
|
"grad_norm": 3.536985158920288, |
|
"learning_rate": 0.00010996343692870202, |
|
"loss": 0.1762, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9049360146252285, |
|
"grad_norm": 0.4026005268096924, |
|
"learning_rate": 0.00010950639853747715, |
|
"loss": 0.2182, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9095063985374772, |
|
"grad_norm": 2.521723747253418, |
|
"learning_rate": 0.00010904936014625229, |
|
"loss": 0.1531, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9140767824497258, |
|
"grad_norm": 0.42068588733673096, |
|
"learning_rate": 0.00010859232175502743, |
|
"loss": 0.1918, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9140767824497258, |
|
"eval_accuracy": 0.9588, |
|
"eval_loss": 0.13700534403324127, |
|
"eval_runtime": 70.8331, |
|
"eval_samples_per_second": 211.766, |
|
"eval_steps_per_second": 13.242, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9186471663619744, |
|
"grad_norm": 0.27603089809417725, |
|
"learning_rate": 0.00010813528336380256, |
|
"loss": 0.172, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.923217550274223, |
|
"grad_norm": 1.674926996231079, |
|
"learning_rate": 0.0001076782449725777, |
|
"loss": 0.1129, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9277879341864717, |
|
"grad_norm": 0.18493302166461945, |
|
"learning_rate": 0.00010722120658135284, |
|
"loss": 0.0895, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9323583180987203, |
|
"grad_norm": 4.929255485534668, |
|
"learning_rate": 0.00010676416819012797, |
|
"loss": 0.0942, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9369287020109689, |
|
"grad_norm": 2.887568950653076, |
|
"learning_rate": 0.00010630712979890312, |
|
"loss": 0.1769, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9414990859232175, |
|
"grad_norm": 2.5681095123291016, |
|
"learning_rate": 0.00010585009140767826, |
|
"loss": 0.1939, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9460694698354661, |
|
"grad_norm": 9.683144569396973, |
|
"learning_rate": 0.0001053930530164534, |
|
"loss": 0.212, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9506398537477148, |
|
"grad_norm": 0.2908894419670105, |
|
"learning_rate": 0.00010493601462522853, |
|
"loss": 0.1635, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.9552102376599635, |
|
"grad_norm": 5.623505115509033, |
|
"learning_rate": 0.00010447897623400367, |
|
"loss": 0.0803, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.9597806215722121, |
|
"grad_norm": 1.6659337282180786, |
|
"learning_rate": 0.0001040219378427788, |
|
"loss": 0.1658, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9597806215722121, |
|
"eval_accuracy": 0.9534666666666667, |
|
"eval_loss": 0.15428349375724792, |
|
"eval_runtime": 70.1951, |
|
"eval_samples_per_second": 213.69, |
|
"eval_steps_per_second": 13.363, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9643510054844607, |
|
"grad_norm": 3.0254101753234863, |
|
"learning_rate": 0.00010356489945155394, |
|
"loss": 0.2228, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.9689213893967094, |
|
"grad_norm": 4.310238838195801, |
|
"learning_rate": 0.00010310786106032908, |
|
"loss": 0.1241, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.973491773308958, |
|
"grad_norm": 3.578296422958374, |
|
"learning_rate": 0.00010265082266910421, |
|
"loss": 0.212, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.9780621572212066, |
|
"grad_norm": 1.9950228929519653, |
|
"learning_rate": 0.00010219378427787935, |
|
"loss": 0.2508, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9826325411334552, |
|
"grad_norm": 0.7868995666503906, |
|
"learning_rate": 0.0001017367458866545, |
|
"loss": 0.1755, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9872029250457038, |
|
"grad_norm": 2.1217503547668457, |
|
"learning_rate": 0.00010127970749542961, |
|
"loss": 0.1431, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9917733089579525, |
|
"grad_norm": 0.1202094554901123, |
|
"learning_rate": 0.00010082266910420475, |
|
"loss": 0.0914, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9963436928702011, |
|
"grad_norm": 4.719986438751221, |
|
"learning_rate": 0.00010036563071297988, |
|
"loss": 0.137, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.0009140767824498, |
|
"grad_norm": 0.4999386668205261, |
|
"learning_rate": 9.990859232175503e-05, |
|
"loss": 0.1686, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.0054844606946984, |
|
"grad_norm": 0.2870078980922699, |
|
"learning_rate": 9.945155393053017e-05, |
|
"loss": 0.1088, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0054844606946984, |
|
"eval_accuracy": 0.9577333333333333, |
|
"eval_loss": 0.1361219733953476, |
|
"eval_runtime": 70.2423, |
|
"eval_samples_per_second": 213.547, |
|
"eval_steps_per_second": 13.354, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.010054844606947, |
|
"grad_norm": 0.8209459781646729, |
|
"learning_rate": 9.89945155393053e-05, |
|
"loss": 0.1142, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.0146252285191957, |
|
"grad_norm": 9.744668006896973, |
|
"learning_rate": 9.853747714808045e-05, |
|
"loss": 0.0546, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.0191956124314443, |
|
"grad_norm": 0.15120500326156616, |
|
"learning_rate": 9.808043875685559e-05, |
|
"loss": 0.0184, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.023765996343693, |
|
"grad_norm": 3.5596585273742676, |
|
"learning_rate": 9.762340036563071e-05, |
|
"loss": 0.1219, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.0283363802559415, |
|
"grad_norm": 4.676599025726318, |
|
"learning_rate": 9.716636197440585e-05, |
|
"loss": 0.0767, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.0329067641681902, |
|
"grad_norm": 0.02072470262646675, |
|
"learning_rate": 9.670932358318099e-05, |
|
"loss": 0.0152, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.0374771480804388, |
|
"grad_norm": 0.09274908900260925, |
|
"learning_rate": 9.625228519195612e-05, |
|
"loss": 0.0423, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.0420475319926874, |
|
"grad_norm": 2.4434385299682617, |
|
"learning_rate": 9.579524680073126e-05, |
|
"loss": 0.0651, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.046617915904936, |
|
"grad_norm": 0.028794042766094208, |
|
"learning_rate": 9.53382084095064e-05, |
|
"loss": 0.0635, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.0511882998171846, |
|
"grad_norm": 1.619289755821228, |
|
"learning_rate": 9.488117001828155e-05, |
|
"loss": 0.0916, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0511882998171846, |
|
"eval_accuracy": 0.9596666666666667, |
|
"eval_loss": 0.13929586112499237, |
|
"eval_runtime": 70.231, |
|
"eval_samples_per_second": 213.581, |
|
"eval_steps_per_second": 13.356, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0557586837294333, |
|
"grad_norm": 4.361185073852539, |
|
"learning_rate": 9.442413162705668e-05, |
|
"loss": 0.039, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.0603290676416819, |
|
"grad_norm": 0.0270086620002985, |
|
"learning_rate": 9.396709323583182e-05, |
|
"loss": 0.087, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.0648994515539305, |
|
"grad_norm": 0.09628736972808838, |
|
"learning_rate": 9.351005484460696e-05, |
|
"loss": 0.0222, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.0694698354661791, |
|
"grad_norm": 4.285031318664551, |
|
"learning_rate": 9.305301645338209e-05, |
|
"loss": 0.0478, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.0740402193784278, |
|
"grad_norm": 0.06926529854536057, |
|
"learning_rate": 9.259597806215723e-05, |
|
"loss": 0.0531, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.0786106032906764, |
|
"grad_norm": 0.01788182742893696, |
|
"learning_rate": 9.213893967093236e-05, |
|
"loss": 0.0723, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.083180987202925, |
|
"grad_norm": 0.1285097748041153, |
|
"learning_rate": 9.16819012797075e-05, |
|
"loss": 0.0565, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.0877513711151736, |
|
"grad_norm": 0.013244764879345894, |
|
"learning_rate": 9.122486288848264e-05, |
|
"loss": 0.0363, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.0923217550274222, |
|
"grad_norm": 2.3318049907684326, |
|
"learning_rate": 9.076782449725777e-05, |
|
"loss": 0.0584, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.0968921389396709, |
|
"grad_norm": 0.3406066298484802, |
|
"learning_rate": 9.031078610603291e-05, |
|
"loss": 0.005, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.0968921389396709, |
|
"eval_accuracy": 0.9620666666666666, |
|
"eval_loss": 0.12949973344802856, |
|
"eval_runtime": 70.9675, |
|
"eval_samples_per_second": 211.364, |
|
"eval_steps_per_second": 13.217, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1014625228519195, |
|
"grad_norm": 0.09992707520723343, |
|
"learning_rate": 8.985374771480805e-05, |
|
"loss": 0.006, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.106032906764168, |
|
"grad_norm": 3.9798166751861572, |
|
"learning_rate": 8.939670932358318e-05, |
|
"loss": 0.0458, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.1106032906764167, |
|
"grad_norm": 0.20624032616615295, |
|
"learning_rate": 8.893967093235832e-05, |
|
"loss": 0.0366, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.1151736745886653, |
|
"grad_norm": 0.03891080617904663, |
|
"learning_rate": 8.848263254113346e-05, |
|
"loss": 0.0118, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.1197440585009142, |
|
"grad_norm": 7.250652313232422, |
|
"learning_rate": 8.802559414990859e-05, |
|
"loss": 0.0622, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.1243144424131628, |
|
"grad_norm": 2.0701119899749756, |
|
"learning_rate": 8.756855575868373e-05, |
|
"loss": 0.0504, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.1288848263254114, |
|
"grad_norm": 4.752568244934082, |
|
"learning_rate": 8.711151736745888e-05, |
|
"loss": 0.0581, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.13345521023766, |
|
"grad_norm": 0.023835673928260803, |
|
"learning_rate": 8.665447897623402e-05, |
|
"loss": 0.0401, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.1380255941499087, |
|
"grad_norm": 0.009058034047484398, |
|
"learning_rate": 8.619744058500915e-05, |
|
"loss": 0.0406, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.1425959780621573, |
|
"grad_norm": 0.2688920795917511, |
|
"learning_rate": 8.574040219378429e-05, |
|
"loss": 0.0294, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.1425959780621573, |
|
"eval_accuracy": 0.9639333333333333, |
|
"eval_loss": 0.1327054649591446, |
|
"eval_runtime": 70.9468, |
|
"eval_samples_per_second": 211.426, |
|
"eval_steps_per_second": 13.221, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.147166361974406, |
|
"grad_norm": 0.007774589583277702, |
|
"learning_rate": 8.528336380255942e-05, |
|
"loss": 0.0427, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.1517367458866545, |
|
"grad_norm": 5.866430759429932, |
|
"learning_rate": 8.482632541133455e-05, |
|
"loss": 0.0595, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.1563071297989032, |
|
"grad_norm": 0.11866763979196548, |
|
"learning_rate": 8.436928702010968e-05, |
|
"loss": 0.0673, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.1608775137111518, |
|
"grad_norm": 5.5359978675842285, |
|
"learning_rate": 8.391224862888482e-05, |
|
"loss": 0.1446, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.1654478976234004, |
|
"grad_norm": 0.016985343769192696, |
|
"learning_rate": 8.345521023765997e-05, |
|
"loss": 0.0559, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.170018281535649, |
|
"grad_norm": 0.7032074928283691, |
|
"learning_rate": 8.29981718464351e-05, |
|
"loss": 0.0093, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.1745886654478976, |
|
"grad_norm": 0.14500297605991364, |
|
"learning_rate": 8.254113345521024e-05, |
|
"loss": 0.0591, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.1791590493601463, |
|
"grad_norm": 4.615384578704834, |
|
"learning_rate": 8.208409506398538e-05, |
|
"loss": 0.0167, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.1837294332723949, |
|
"grad_norm": 3.747305154800415, |
|
"learning_rate": 8.162705667276052e-05, |
|
"loss": 0.1315, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.1882998171846435, |
|
"grad_norm": 6.55547571182251, |
|
"learning_rate": 8.117001828153565e-05, |
|
"loss": 0.0939, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.1882998171846435, |
|
"eval_accuracy": 0.9621333333333333, |
|
"eval_loss": 0.1408853828907013, |
|
"eval_runtime": 70.3628, |
|
"eval_samples_per_second": 213.181, |
|
"eval_steps_per_second": 13.331, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.1928702010968921, |
|
"grad_norm": 0.018332751467823982, |
|
"learning_rate": 8.071297989031079e-05, |
|
"loss": 0.0422, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.1974405850091407, |
|
"grad_norm": 8.07509708404541, |
|
"learning_rate": 8.025594149908592e-05, |
|
"loss": 0.0548, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.2020109689213894, |
|
"grad_norm": 0.015664540231227875, |
|
"learning_rate": 7.979890310786106e-05, |
|
"loss": 0.0201, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.206581352833638, |
|
"grad_norm": 0.34166190028190613, |
|
"learning_rate": 7.934186471663621e-05, |
|
"loss": 0.0063, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.2111517367458866, |
|
"grad_norm": 0.024543585255742073, |
|
"learning_rate": 7.888482632541135e-05, |
|
"loss": 0.067, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.2157221206581352, |
|
"grad_norm": 5.0015788078308105, |
|
"learning_rate": 7.842778793418648e-05, |
|
"loss": 0.0677, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.2202925045703839, |
|
"grad_norm": 0.3825192153453827, |
|
"learning_rate": 7.79707495429616e-05, |
|
"loss": 0.0447, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.2248628884826325, |
|
"grad_norm": 1.0526628494262695, |
|
"learning_rate": 7.751371115173674e-05, |
|
"loss": 0.0264, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.229433272394881, |
|
"grad_norm": 0.015143281780183315, |
|
"learning_rate": 7.705667276051188e-05, |
|
"loss": 0.0182, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.2340036563071297, |
|
"grad_norm": 6.874438285827637, |
|
"learning_rate": 7.659963436928702e-05, |
|
"loss": 0.0756, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2340036563071297, |
|
"eval_accuracy": 0.9682, |
|
"eval_loss": 0.1201971173286438, |
|
"eval_runtime": 70.6459, |
|
"eval_samples_per_second": 212.327, |
|
"eval_steps_per_second": 13.277, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2385740402193783, |
|
"grad_norm": 0.02307463437318802, |
|
"learning_rate": 7.614259597806215e-05, |
|
"loss": 0.033, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.2431444241316272, |
|
"grad_norm": 0.01136768702417612, |
|
"learning_rate": 7.56855575868373e-05, |
|
"loss": 0.0762, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.2477148080438756, |
|
"grad_norm": 5.031988620758057, |
|
"learning_rate": 7.522851919561244e-05, |
|
"loss": 0.0692, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.2522851919561244, |
|
"grad_norm": 0.028815852478146553, |
|
"learning_rate": 7.477148080438758e-05, |
|
"loss": 0.008, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.2568555758683728, |
|
"grad_norm": 10.7840576171875, |
|
"learning_rate": 7.431444241316271e-05, |
|
"loss": 0.0711, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.2614259597806217, |
|
"grad_norm": 3.8280370235443115, |
|
"learning_rate": 7.385740402193785e-05, |
|
"loss": 0.0581, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.26599634369287, |
|
"grad_norm": 0.03191199526190758, |
|
"learning_rate": 7.340036563071298e-05, |
|
"loss": 0.0882, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.270566727605119, |
|
"grad_norm": 0.010684626176953316, |
|
"learning_rate": 7.294332723948812e-05, |
|
"loss": 0.0228, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.2751371115173675, |
|
"grad_norm": 0.01165696233510971, |
|
"learning_rate": 7.248628884826326e-05, |
|
"loss": 0.0364, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.2797074954296161, |
|
"grad_norm": 5.020371913909912, |
|
"learning_rate": 7.20292504570384e-05, |
|
"loss": 0.0466, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.2797074954296161, |
|
"eval_accuracy": 0.964, |
|
"eval_loss": 0.1273525506258011, |
|
"eval_runtime": 70.5245, |
|
"eval_samples_per_second": 212.692, |
|
"eval_steps_per_second": 13.3, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.2842778793418648, |
|
"grad_norm": 3.339799165725708, |
|
"learning_rate": 7.157221206581353e-05, |
|
"loss": 0.0331, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.2888482632541134, |
|
"grad_norm": 2.2458271980285645, |
|
"learning_rate": 7.111517367458867e-05, |
|
"loss": 0.0139, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.293418647166362, |
|
"grad_norm": 0.03158143162727356, |
|
"learning_rate": 7.06581352833638e-05, |
|
"loss": 0.0316, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.2979890310786106, |
|
"grad_norm": 0.012245237827301025, |
|
"learning_rate": 7.020109689213894e-05, |
|
"loss": 0.0501, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.3025594149908593, |
|
"grad_norm": 6.688177585601807, |
|
"learning_rate": 6.974405850091408e-05, |
|
"loss": 0.0349, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.3071297989031079, |
|
"grad_norm": 0.01068816240876913, |
|
"learning_rate": 6.928702010968921e-05, |
|
"loss": 0.0373, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.3117001828153565, |
|
"grad_norm": 0.017882896587252617, |
|
"learning_rate": 6.882998171846435e-05, |
|
"loss": 0.0272, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.3162705667276051, |
|
"grad_norm": 0.009448254480957985, |
|
"learning_rate": 6.837294332723948e-05, |
|
"loss": 0.045, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.3208409506398537, |
|
"grad_norm": 8.753164291381836, |
|
"learning_rate": 6.791590493601463e-05, |
|
"loss": 0.1252, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.3254113345521024, |
|
"grad_norm": 0.02937444858253002, |
|
"learning_rate": 6.745886654478977e-05, |
|
"loss": 0.0565, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3254113345521024, |
|
"eval_accuracy": 0.9662666666666667, |
|
"eval_loss": 0.12496425956487656, |
|
"eval_runtime": 70.9263, |
|
"eval_samples_per_second": 211.487, |
|
"eval_steps_per_second": 13.225, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.329981718464351, |
|
"grad_norm": 0.7451736927032471, |
|
"learning_rate": 6.700182815356491e-05, |
|
"loss": 0.0298, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.3345521023765996, |
|
"grad_norm": 0.014505515806376934, |
|
"learning_rate": 6.654478976234004e-05, |
|
"loss": 0.0954, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.3391224862888482, |
|
"grad_norm": 6.800475597381592, |
|
"learning_rate": 6.608775137111518e-05, |
|
"loss": 0.0584, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.3436928702010968, |
|
"grad_norm": 0.08044274151325226, |
|
"learning_rate": 6.563071297989032e-05, |
|
"loss": 0.0944, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.3482632541133455, |
|
"grad_norm": 0.051912058144807816, |
|
"learning_rate": 6.517367458866545e-05, |
|
"loss": 0.0128, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.352833638025594, |
|
"grad_norm": 0.8656260967254639, |
|
"learning_rate": 6.471663619744059e-05, |
|
"loss": 0.1187, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.3574040219378427, |
|
"grad_norm": 0.004978422075510025, |
|
"learning_rate": 6.425959780621573e-05, |
|
"loss": 0.0228, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.3619744058500913, |
|
"grad_norm": 0.5934199094772339, |
|
"learning_rate": 6.380255941499086e-05, |
|
"loss": 0.0038, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.3665447897623402, |
|
"grad_norm": 3.7717771530151367, |
|
"learning_rate": 6.3345521023766e-05, |
|
"loss": 0.087, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.3711151736745886, |
|
"grad_norm": 0.01153448224067688, |
|
"learning_rate": 6.288848263254114e-05, |
|
"loss": 0.0609, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3711151736745886, |
|
"eval_accuracy": 0.9656666666666667, |
|
"eval_loss": 0.12994171679019928, |
|
"eval_runtime": 71.2707, |
|
"eval_samples_per_second": 210.465, |
|
"eval_steps_per_second": 13.161, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3756855575868374, |
|
"grad_norm": 0.17517925798892975, |
|
"learning_rate": 6.243144424131627e-05, |
|
"loss": 0.026, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.3802559414990858, |
|
"grad_norm": 0.11696294695138931, |
|
"learning_rate": 6.197440585009141e-05, |
|
"loss": 0.025, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.3848263254113347, |
|
"grad_norm": 0.007365319412201643, |
|
"learning_rate": 6.151736745886654e-05, |
|
"loss": 0.0251, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.389396709323583, |
|
"grad_norm": 0.3844846189022064, |
|
"learning_rate": 6.106032906764168e-05, |
|
"loss": 0.0252, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.393967093235832, |
|
"grad_norm": 0.014660494402050972, |
|
"learning_rate": 6.0603290676416824e-05, |
|
"loss": 0.0274, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.3985374771480805, |
|
"grad_norm": 4.684697151184082, |
|
"learning_rate": 6.014625228519196e-05, |
|
"loss": 0.0294, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.4031078610603291, |
|
"grad_norm": 0.05706701800227165, |
|
"learning_rate": 5.96892138939671e-05, |
|
"loss": 0.0053, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.4076782449725778, |
|
"grad_norm": 0.010988248512148857, |
|
"learning_rate": 5.923217550274224e-05, |
|
"loss": 0.0227, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.4122486288848264, |
|
"grad_norm": 0.009499771520495415, |
|
"learning_rate": 5.8775137111517377e-05, |
|
"loss": 0.0061, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.416819012797075, |
|
"grad_norm": 0.35233938694000244, |
|
"learning_rate": 5.83180987202925e-05, |
|
"loss": 0.0201, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.416819012797075, |
|
"eval_accuracy": 0.9685333333333334, |
|
"eval_loss": 0.12030760943889618, |
|
"eval_runtime": 70.6066, |
|
"eval_samples_per_second": 212.445, |
|
"eval_steps_per_second": 13.285, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4213893967093236, |
|
"grad_norm": 0.680586576461792, |
|
"learning_rate": 5.786106032906764e-05, |
|
"loss": 0.0825, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.4259597806215722, |
|
"grad_norm": 1.1647635698318481, |
|
"learning_rate": 5.740402193784278e-05, |
|
"loss": 0.0668, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.4305301645338209, |
|
"grad_norm": 0.3994854986667633, |
|
"learning_rate": 5.6946983546617915e-05, |
|
"loss": 0.054, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.4351005484460695, |
|
"grad_norm": 0.012723923660814762, |
|
"learning_rate": 5.648994515539305e-05, |
|
"loss": 0.1109, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.4396709323583181, |
|
"grad_norm": 0.02893258072435856, |
|
"learning_rate": 5.603290676416819e-05, |
|
"loss": 0.0447, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.4442413162705667, |
|
"grad_norm": 2.296046495437622, |
|
"learning_rate": 5.557586837294333e-05, |
|
"loss": 0.0139, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.4488117001828154, |
|
"grad_norm": 0.0075446791015565395, |
|
"learning_rate": 5.511882998171847e-05, |
|
"loss": 0.0254, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.453382084095064, |
|
"grad_norm": 4.601187705993652, |
|
"learning_rate": 5.47074954296161e-05, |
|
"loss": 0.0619, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.4579524680073126, |
|
"grad_norm": 0.028089461848139763, |
|
"learning_rate": 5.425045703839122e-05, |
|
"loss": 0.0324, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.4625228519195612, |
|
"grad_norm": 0.00567116541787982, |
|
"learning_rate": 5.3793418647166363e-05, |
|
"loss": 0.0258, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.4625228519195612, |
|
"eval_accuracy": 0.9692666666666667, |
|
"eval_loss": 0.11664163321256638, |
|
"eval_runtime": 70.1343, |
|
"eval_samples_per_second": 213.875, |
|
"eval_steps_per_second": 13.374, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.4670932358318098, |
|
"grad_norm": 0.008565380237996578, |
|
"learning_rate": 5.33363802559415e-05, |
|
"loss": 0.1077, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.4716636197440585, |
|
"grad_norm": 6.626660346984863, |
|
"learning_rate": 5.2879341864716636e-05, |
|
"loss": 0.0685, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.476234003656307, |
|
"grad_norm": 5.336280822753906, |
|
"learning_rate": 5.242230347349177e-05, |
|
"loss": 0.0734, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.4808043875685557, |
|
"grad_norm": 2.9951882362365723, |
|
"learning_rate": 5.196526508226691e-05, |
|
"loss": 0.0216, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.4853747714808043, |
|
"grad_norm": 0.44102242588996887, |
|
"learning_rate": 5.150822669104205e-05, |
|
"loss": 0.0036, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.489945155393053, |
|
"grad_norm": 3.0561587810516357, |
|
"learning_rate": 5.105118829981719e-05, |
|
"loss": 0.0808, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.4945155393053016, |
|
"grad_norm": 0.09664315730333328, |
|
"learning_rate": 5.0594149908592325e-05, |
|
"loss": 0.0796, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.4990859232175504, |
|
"grad_norm": 0.01629328727722168, |
|
"learning_rate": 5.013711151736746e-05, |
|
"loss": 0.003, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.5036563071297988, |
|
"grad_norm": 0.008465313352644444, |
|
"learning_rate": 4.96800731261426e-05, |
|
"loss": 0.1065, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.5082266910420477, |
|
"grad_norm": 2.8709588050842285, |
|
"learning_rate": 4.9223034734917734e-05, |
|
"loss": 0.0913, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5082266910420477, |
|
"eval_accuracy": 0.9736, |
|
"eval_loss": 0.10086899250745773, |
|
"eval_runtime": 70.4855, |
|
"eval_samples_per_second": 212.81, |
|
"eval_steps_per_second": 13.308, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.512797074954296, |
|
"grad_norm": 0.006857722532004118, |
|
"learning_rate": 4.876599634369287e-05, |
|
"loss": 0.0035, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.517367458866545, |
|
"grad_norm": 0.046820204704999924, |
|
"learning_rate": 4.830895795246801e-05, |
|
"loss": 0.0234, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.5219378427787933, |
|
"grad_norm": 0.009667345322668552, |
|
"learning_rate": 4.785191956124315e-05, |
|
"loss": 0.0304, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.5265082266910421, |
|
"grad_norm": 5.041330814361572, |
|
"learning_rate": 4.739488117001829e-05, |
|
"loss": 0.0812, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.5310786106032905, |
|
"grad_norm": 0.12366422265768051, |
|
"learning_rate": 4.693784277879342e-05, |
|
"loss": 0.0913, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.5356489945155394, |
|
"grad_norm": 0.19020313024520874, |
|
"learning_rate": 4.648080438756856e-05, |
|
"loss": 0.0027, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.5402193784277878, |
|
"grad_norm": 0.008719071745872498, |
|
"learning_rate": 4.6023765996343696e-05, |
|
"loss": 0.0087, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.5447897623400366, |
|
"grad_norm": 5.164638996124268, |
|
"learning_rate": 4.556672760511883e-05, |
|
"loss": 0.0509, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.5493601462522852, |
|
"grad_norm": 1.9091380834579468, |
|
"learning_rate": 4.510968921389397e-05, |
|
"loss": 0.0622, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.5539305301645339, |
|
"grad_norm": 0.015776393935084343, |
|
"learning_rate": 4.4652650822669105e-05, |
|
"loss": 0.0235, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.5539305301645339, |
|
"eval_accuracy": 0.9732, |
|
"eval_loss": 0.0964307188987732, |
|
"eval_runtime": 71.1476, |
|
"eval_samples_per_second": 210.829, |
|
"eval_steps_per_second": 13.184, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.5585009140767825, |
|
"grad_norm": 0.04088925942778587, |
|
"learning_rate": 4.419561243144424e-05, |
|
"loss": 0.0579, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.563071297989031, |
|
"grad_norm": 0.04248817265033722, |
|
"learning_rate": 4.3738574040219385e-05, |
|
"loss": 0.0368, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.5676416819012797, |
|
"grad_norm": 0.06178814917802811, |
|
"learning_rate": 4.328153564899452e-05, |
|
"loss": 0.0356, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.5722120658135283, |
|
"grad_norm": 0.014863620512187481, |
|
"learning_rate": 4.282449725776965e-05, |
|
"loss": 0.0173, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.576782449725777, |
|
"grad_norm": 0.0053153312765061855, |
|
"learning_rate": 4.236745886654479e-05, |
|
"loss": 0.0295, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.5813528336380256, |
|
"grad_norm": 0.03949157893657684, |
|
"learning_rate": 4.191042047531993e-05, |
|
"loss": 0.0751, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.5859232175502742, |
|
"grad_norm": 0.01701487973332405, |
|
"learning_rate": 4.145338208409507e-05, |
|
"loss": 0.0311, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.5904936014625228, |
|
"grad_norm": 0.06288379430770874, |
|
"learning_rate": 4.09963436928702e-05, |
|
"loss": 0.0779, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.5950639853747715, |
|
"grad_norm": 12.93021297454834, |
|
"learning_rate": 4.053930530164534e-05, |
|
"loss": 0.0274, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.59963436928702, |
|
"grad_norm": 0.03469611704349518, |
|
"learning_rate": 4.008226691042048e-05, |
|
"loss": 0.0089, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.59963436928702, |
|
"eval_accuracy": 0.9747333333333333, |
|
"eval_loss": 0.09657016396522522, |
|
"eval_runtime": 70.2849, |
|
"eval_samples_per_second": 213.417, |
|
"eval_steps_per_second": 13.346, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6042047531992687, |
|
"grad_norm": 0.01145760528743267, |
|
"learning_rate": 3.962522851919561e-05, |
|
"loss": 0.0226, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.6087751371115173, |
|
"grad_norm": 0.006313066463917494, |
|
"learning_rate": 3.916819012797075e-05, |
|
"loss": 0.1061, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.6133455210237662, |
|
"grad_norm": 0.006270520854741335, |
|
"learning_rate": 3.8711151736745885e-05, |
|
"loss": 0.0439, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.6179159049360146, |
|
"grad_norm": 2.423236608505249, |
|
"learning_rate": 3.825411334552103e-05, |
|
"loss": 0.0273, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.6224862888482634, |
|
"grad_norm": 0.025253351777791977, |
|
"learning_rate": 3.7797074954296165e-05, |
|
"loss": 0.0265, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.6270566727605118, |
|
"grad_norm": 0.016615109518170357, |
|
"learning_rate": 3.73400365630713e-05, |
|
"loss": 0.0226, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.6316270566727606, |
|
"grad_norm": 0.006486339028924704, |
|
"learning_rate": 3.688299817184644e-05, |
|
"loss": 0.0108, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.636197440585009, |
|
"grad_norm": 0.019093792885541916, |
|
"learning_rate": 3.6425959780621574e-05, |
|
"loss": 0.0916, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.6407678244972579, |
|
"grad_norm": 0.014140899293124676, |
|
"learning_rate": 3.596892138939671e-05, |
|
"loss": 0.1538, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.6453382084095063, |
|
"grad_norm": 0.009632795117795467, |
|
"learning_rate": 3.551188299817185e-05, |
|
"loss": 0.0455, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.6453382084095063, |
|
"eval_accuracy": 0.9748, |
|
"eval_loss": 0.09634851664304733, |
|
"eval_runtime": 70.4832, |
|
"eval_samples_per_second": 212.817, |
|
"eval_steps_per_second": 13.308, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.6499085923217551, |
|
"grad_norm": 0.538037896156311, |
|
"learning_rate": 3.505484460694698e-05, |
|
"loss": 0.0632, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.6544789762340035, |
|
"grad_norm": 2.536642551422119, |
|
"learning_rate": 3.459780621572212e-05, |
|
"loss": 0.0215, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.6590493601462524, |
|
"grad_norm": 0.014462544582784176, |
|
"learning_rate": 3.414076782449726e-05, |
|
"loss": 0.0783, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.6636197440585008, |
|
"grad_norm": 0.04104587808251381, |
|
"learning_rate": 3.36837294332724e-05, |
|
"loss": 0.0468, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.6681901279707496, |
|
"grad_norm": 12.753653526306152, |
|
"learning_rate": 3.3226691042047536e-05, |
|
"loss": 0.096, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.672760511882998, |
|
"grad_norm": 0.6275530457496643, |
|
"learning_rate": 3.2769652650822665e-05, |
|
"loss": 0.0459, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.6773308957952469, |
|
"grad_norm": 0.028450943529605865, |
|
"learning_rate": 3.231261425959781e-05, |
|
"loss": 0.0129, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.6819012797074955, |
|
"grad_norm": 3.766301393508911, |
|
"learning_rate": 3.1855575868372945e-05, |
|
"loss": 0.01, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.686471663619744, |
|
"grad_norm": 1.72735595703125, |
|
"learning_rate": 3.139853747714808e-05, |
|
"loss": 0.0162, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.6910420475319927, |
|
"grad_norm": 0.012926338240504265, |
|
"learning_rate": 3.094149908592322e-05, |
|
"loss": 0.0271, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.6910420475319927, |
|
"eval_accuracy": 0.9762666666666666, |
|
"eval_loss": 0.0874376893043518, |
|
"eval_runtime": 71.1478, |
|
"eval_samples_per_second": 210.829, |
|
"eval_steps_per_second": 13.184, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.6956124314442413, |
|
"grad_norm": 0.20961987972259521, |
|
"learning_rate": 3.0484460694698358e-05, |
|
"loss": 0.0269, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.70018281535649, |
|
"grad_norm": 5.752171039581299, |
|
"learning_rate": 3.0027422303473497e-05, |
|
"loss": 0.0117, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.7047531992687386, |
|
"grad_norm": 0.02492084540426731, |
|
"learning_rate": 2.9570383912248627e-05, |
|
"loss": 0.0494, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.7093235831809872, |
|
"grad_norm": 1.6408967971801758, |
|
"learning_rate": 2.9113345521023767e-05, |
|
"loss": 0.0079, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.7138939670932358, |
|
"grad_norm": 0.010781402699649334, |
|
"learning_rate": 2.8656307129798903e-05, |
|
"loss": 0.0288, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.7184643510054844, |
|
"grad_norm": 0.007589300163090229, |
|
"learning_rate": 2.8199268738574043e-05, |
|
"loss": 0.0304, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.723034734917733, |
|
"grad_norm": 0.0105056157335639, |
|
"learning_rate": 2.774223034734918e-05, |
|
"loss": 0.0023, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.7276051188299817, |
|
"grad_norm": 0.028248343616724014, |
|
"learning_rate": 2.7285191956124316e-05, |
|
"loss": 0.0156, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.7321755027422303, |
|
"grad_norm": 0.004776041954755783, |
|
"learning_rate": 2.6828153564899456e-05, |
|
"loss": 0.0113, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.736745886654479, |
|
"grad_norm": 7.163562297821045, |
|
"learning_rate": 2.637111517367459e-05, |
|
"loss": 0.0407, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.736745886654479, |
|
"eval_accuracy": 0.9761333333333333, |
|
"eval_loss": 0.08977096527814865, |
|
"eval_runtime": 70.6674, |
|
"eval_samples_per_second": 212.262, |
|
"eval_steps_per_second": 13.273, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.7413162705667276, |
|
"grad_norm": 1.1980034112930298, |
|
"learning_rate": 2.5914076782449725e-05, |
|
"loss": 0.008, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.7458866544789764, |
|
"grad_norm": 5.438980579376221, |
|
"learning_rate": 2.5457038391224865e-05, |
|
"loss": 0.0558, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.7504570383912248, |
|
"grad_norm": 0.02217746712267399, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0057, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.7550274223034736, |
|
"grad_norm": 0.595504641532898, |
|
"learning_rate": 2.4542961608775138e-05, |
|
"loss": 0.0551, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.759597806215722, |
|
"grad_norm": 0.09388578683137894, |
|
"learning_rate": 2.4085923217550274e-05, |
|
"loss": 0.0744, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.7641681901279709, |
|
"grad_norm": 2.807389736175537, |
|
"learning_rate": 2.362888482632541e-05, |
|
"loss": 0.0051, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.7687385740402193, |
|
"grad_norm": 0.006797166541218758, |
|
"learning_rate": 2.317184643510055e-05, |
|
"loss": 0.0069, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.7733089579524681, |
|
"grad_norm": 0.0043932488188147545, |
|
"learning_rate": 2.2714808043875687e-05, |
|
"loss": 0.0435, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.7778793418647165, |
|
"grad_norm": 0.009305426850914955, |
|
"learning_rate": 2.2257769652650823e-05, |
|
"loss": 0.0189, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.7824497257769654, |
|
"grad_norm": 0.003490304574370384, |
|
"learning_rate": 2.180073126142596e-05, |
|
"loss": 0.1095, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.7824497257769654, |
|
"eval_accuracy": 0.976, |
|
"eval_loss": 0.08494840562343597, |
|
"eval_runtime": 71.5803, |
|
"eval_samples_per_second": 209.555, |
|
"eval_steps_per_second": 13.104, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.7870201096892138, |
|
"grad_norm": 0.00673332205042243, |
|
"learning_rate": 2.13436928702011e-05, |
|
"loss": 0.0653, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.7915904936014626, |
|
"grad_norm": 0.9469023942947388, |
|
"learning_rate": 2.0886654478976232e-05, |
|
"loss": 0.0077, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.796160877513711, |
|
"grad_norm": 0.03154715150594711, |
|
"learning_rate": 2.0429616087751372e-05, |
|
"loss": 0.0323, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.8007312614259599, |
|
"grad_norm": 0.020610906183719635, |
|
"learning_rate": 1.997257769652651e-05, |
|
"loss": 0.0211, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.8053016453382082, |
|
"grad_norm": 0.014532508328557014, |
|
"learning_rate": 1.9515539305301648e-05, |
|
"loss": 0.0357, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.809872029250457, |
|
"grad_norm": 0.020481685176491737, |
|
"learning_rate": 1.905850091407678e-05, |
|
"loss": 0.0498, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.8144424131627057, |
|
"grad_norm": 0.018279431387782097, |
|
"learning_rate": 1.860146252285192e-05, |
|
"loss": 0.0559, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.8190127970749543, |
|
"grad_norm": 0.03680342435836792, |
|
"learning_rate": 1.8144424131627057e-05, |
|
"loss": 0.0176, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.823583180987203, |
|
"grad_norm": 0.014550072140991688, |
|
"learning_rate": 1.7687385740402197e-05, |
|
"loss": 0.1098, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.8281535648994516, |
|
"grad_norm": 0.01788398250937462, |
|
"learning_rate": 1.723034734917733e-05, |
|
"loss": 0.0327, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8281535648994516, |
|
"eval_accuracy": 0.9745333333333334, |
|
"eval_loss": 0.0925898626446724, |
|
"eval_runtime": 71.855, |
|
"eval_samples_per_second": 208.754, |
|
"eval_steps_per_second": 13.054, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8327239488117002, |
|
"grad_norm": 1.894504189491272, |
|
"learning_rate": 1.677330895795247e-05, |
|
"loss": 0.0373, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.8372943327239488, |
|
"grad_norm": 0.005655787419527769, |
|
"learning_rate": 1.6316270566727607e-05, |
|
"loss": 0.0067, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.8418647166361974, |
|
"grad_norm": 4.612732410430908, |
|
"learning_rate": 1.5859232175502743e-05, |
|
"loss": 0.0183, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.846435100548446, |
|
"grad_norm": 0.7349024415016174, |
|
"learning_rate": 1.540219378427788e-05, |
|
"loss": 0.0021, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.8510054844606947, |
|
"grad_norm": 0.03837637975811958, |
|
"learning_rate": 1.4945155393053017e-05, |
|
"loss": 0.0031, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.8555758683729433, |
|
"grad_norm": 0.0046151746064424515, |
|
"learning_rate": 1.4488117001828156e-05, |
|
"loss": 0.1252, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.860146252285192, |
|
"grad_norm": 0.01960400864481926, |
|
"learning_rate": 1.403107861060329e-05, |
|
"loss": 0.0024, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.8647166361974405, |
|
"grad_norm": 0.012547549791634083, |
|
"learning_rate": 1.3574040219378428e-05, |
|
"loss": 0.0207, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.8692870201096892, |
|
"grad_norm": 0.0962536633014679, |
|
"learning_rate": 1.3117001828153566e-05, |
|
"loss": 0.0475, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.8738574040219378, |
|
"grad_norm": 0.012872631661593914, |
|
"learning_rate": 1.2659963436928701e-05, |
|
"loss": 0.0427, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.8738574040219378, |
|
"eval_accuracy": 0.9768666666666667, |
|
"eval_loss": 0.08114204555749893, |
|
"eval_runtime": 70.3536, |
|
"eval_samples_per_second": 213.209, |
|
"eval_steps_per_second": 13.333, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.8784277879341866, |
|
"grad_norm": 0.004386584740132093, |
|
"learning_rate": 1.220292504570384e-05, |
|
"loss": 0.0148, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.882998171846435, |
|
"grad_norm": 7.613697528839111, |
|
"learning_rate": 1.1745886654478977e-05, |
|
"loss": 0.0598, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.8875685557586839, |
|
"grad_norm": 0.19733187556266785, |
|
"learning_rate": 1.1288848263254114e-05, |
|
"loss": 0.0254, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.8921389396709323, |
|
"grad_norm": 0.003689356381073594, |
|
"learning_rate": 1.0831809872029252e-05, |
|
"loss": 0.0044, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.8967093235831811, |
|
"grad_norm": 1.0105313062667847, |
|
"learning_rate": 1.0374771480804388e-05, |
|
"loss": 0.03, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.9012797074954295, |
|
"grad_norm": 0.02574901282787323, |
|
"learning_rate": 9.917733089579526e-06, |
|
"loss": 0.012, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.9058500914076784, |
|
"grad_norm": 4.768786907196045, |
|
"learning_rate": 9.460694698354663e-06, |
|
"loss": 0.0395, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.9104204753199268, |
|
"grad_norm": 0.004033361561596394, |
|
"learning_rate": 9.0036563071298e-06, |
|
"loss": 0.012, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.9149908592321756, |
|
"grad_norm": 0.013113109394907951, |
|
"learning_rate": 8.546617915904936e-06, |
|
"loss": 0.0672, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.919561243144424, |
|
"grad_norm": 0.294810950756073, |
|
"learning_rate": 8.089579524680074e-06, |
|
"loss": 0.003, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.919561243144424, |
|
"eval_accuracy": 0.9761333333333333, |
|
"eval_loss": 0.08205202966928482, |
|
"eval_runtime": 70.0255, |
|
"eval_samples_per_second": 214.208, |
|
"eval_steps_per_second": 13.395, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9241316270566728, |
|
"grad_norm": 0.05837790668010712, |
|
"learning_rate": 7.63254113345521e-06, |
|
"loss": 0.0415, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.9287020109689212, |
|
"grad_norm": 1.440628170967102, |
|
"learning_rate": 7.175502742230347e-06, |
|
"loss": 0.0267, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.93327239488117, |
|
"grad_norm": 0.04937027022242546, |
|
"learning_rate": 6.7184643510054855e-06, |
|
"loss": 0.0385, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.9378427787934185, |
|
"grad_norm": 0.006680316291749477, |
|
"learning_rate": 6.261425959780622e-06, |
|
"loss": 0.0025, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.9424131627056673, |
|
"grad_norm": 0.004382527898997068, |
|
"learning_rate": 5.804387568555759e-06, |
|
"loss": 0.0843, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.946983546617916, |
|
"grad_norm": 0.06263825297355652, |
|
"learning_rate": 5.3473491773308956e-06, |
|
"loss": 0.0359, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.9515539305301646, |
|
"grad_norm": 0.017276106402277946, |
|
"learning_rate": 4.890310786106033e-06, |
|
"loss": 0.0264, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.9561243144424132, |
|
"grad_norm": 0.7312209606170654, |
|
"learning_rate": 4.43327239488117e-06, |
|
"loss": 0.0128, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.9606946983546618, |
|
"grad_norm": 0.007708389312028885, |
|
"learning_rate": 3.976234003656307e-06, |
|
"loss": 0.0356, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.9652650822669104, |
|
"grad_norm": 0.004229346755892038, |
|
"learning_rate": 3.5191956124314446e-06, |
|
"loss": 0.0182, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.9652650822669104, |
|
"eval_accuracy": 0.9772666666666666, |
|
"eval_loss": 0.08026164770126343, |
|
"eval_runtime": 69.5428, |
|
"eval_samples_per_second": 215.695, |
|
"eval_steps_per_second": 13.488, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.969835466179159, |
|
"grad_norm": 0.0215240940451622, |
|
"learning_rate": 3.0621572212065814e-06, |
|
"loss": 0.062, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.9744058500914077, |
|
"grad_norm": 0.022770356386899948, |
|
"learning_rate": 2.6051188299817187e-06, |
|
"loss": 0.0424, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.9789762340036563, |
|
"grad_norm": 0.01893909089267254, |
|
"learning_rate": 2.148080438756856e-06, |
|
"loss": 0.0031, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.983546617915905, |
|
"grad_norm": 7.610752105712891, |
|
"learning_rate": 1.691042047531993e-06, |
|
"loss": 0.0827, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.9881170018281535, |
|
"grad_norm": 0.008086251094937325, |
|
"learning_rate": 1.2340036563071298e-06, |
|
"loss": 0.0087, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.9926873857404022, |
|
"grad_norm": 4.746099948883057, |
|
"learning_rate": 7.769652650822669e-07, |
|
"loss": 0.0487, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.9972577696526508, |
|
"grad_norm": 0.3847046494483948, |
|
"learning_rate": 3.1992687385740404e-07, |
|
"loss": 0.0442, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 4376, |
|
"total_flos": 5.42482821328896e+18, |
|
"train_loss": 0.13969962793986365, |
|
"train_runtime": 4718.8358, |
|
"train_samples_per_second": 14.834, |
|
"train_steps_per_second": 0.927 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9772666666666666, |
|
"eval_loss": 0.08026164770126343, |
|
"eval_runtime": 73.8075, |
|
"eval_samples_per_second": 203.231, |
|
"eval_steps_per_second": 12.709, |
|
"step": 4376 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4376, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.42482821328896e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|