diff --git "a/checkpoint-29993/trainer_state.json" "b/checkpoint-29993/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-29993/trainer_state.json" @@ -0,0 +1,21916 @@ +{ + "best_metric": 0.2127562165260315, + "best_model_checkpoint": "female_vit9/checkpoint-29993", + "epoch": 89.0, + "eval_steps": 500, + "global_step": 29993, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02967359050445104, + "grad_norm": 10.257715225219727, + "learning_rate": 2e-08, + "loss": 0.061, + "step": 10 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 14.8863525390625, + "learning_rate": 4e-08, + "loss": 0.1389, + "step": 20 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 2.380739212036133, + "learning_rate": 6e-08, + "loss": 0.1569, + "step": 30 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 6.20700216293335, + "learning_rate": 8e-08, + "loss": 0.1568, + "step": 40 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 22.02073097229004, + "learning_rate": 1e-07, + "loss": 0.2032, + "step": 50 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 0.28586480021476746, + "learning_rate": 9.997028231797919e-08, + "loss": 0.1842, + "step": 60 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 5.154716491699219, + "learning_rate": 9.994056463595839e-08, + "loss": 0.2053, + "step": 70 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 16.37206268310547, + "learning_rate": 9.99108469539376e-08, + "loss": 0.1946, + "step": 80 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 11.895088195800781, + "learning_rate": 9.988112927191679e-08, + "loss": 0.1926, + "step": 90 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 10.761597633361816, + "learning_rate": 9.985141158989599e-08, + "loss": 0.3026, + "step": 100 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 14.649810791015625, + "learning_rate": 9.982169390787518e-08, + "loss": 0.1547, + "step": 110 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 1.6462516784667969, + "learning_rate": 9.979197622585438e-08, + "loss": 0.1067, + "step": 120 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 17.538700103759766, + "learning_rate": 9.976225854383357e-08, + "loss": 0.1586, + "step": 130 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 4.689877510070801, + "learning_rate": 9.973254086181278e-08, + "loss": 0.1856, + "step": 140 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 0.907271146774292, + "learning_rate": 9.970282317979198e-08, + "loss": 0.3094, + "step": 150 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 14.273152351379395, + "learning_rate": 9.967310549777117e-08, + "loss": 0.0567, + "step": 160 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 10.739826202392578, + "learning_rate": 9.964338781575037e-08, + "loss": 0.4097, + "step": 170 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 15.876839637756348, + "learning_rate": 9.961367013372956e-08, + "loss": 0.3015, + "step": 180 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 25.169584274291992, + "learning_rate": 9.958395245170876e-08, + "loss": 0.1902, + "step": 190 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 13.318856239318848, + "learning_rate": 9.955423476968797e-08, + "loss": 0.2586, + "step": 200 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 29.106382369995117, + "learning_rate": 9.952451708766716e-08, + "loss": 0.0579, + "step": 210 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 0.25188905000686646, + "learning_rate": 9.949479940564636e-08, + "loss": 0.0699, + "step": 220 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 1.976697564125061, + "learning_rate": 9.946508172362555e-08, + "loss": 0.2503, + "step": 230 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 4.690854072570801, + "learning_rate": 9.943536404160475e-08, + "loss": 0.1852, + "step": 240 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 0.5412238836288452, + "learning_rate": 9.940564635958395e-08, + "loss": 0.2047, + "step": 250 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 5.5837202072143555, + "learning_rate": 9.937592867756315e-08, + "loss": 0.1163, + "step": 260 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 7.459112644195557, + "learning_rate": 9.934621099554235e-08, + "loss": 0.2091, + "step": 270 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 3.4183671474456787, + "learning_rate": 9.931649331352155e-08, + "loss": 0.0541, + "step": 280 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 10.509713172912598, + "learning_rate": 9.928677563150074e-08, + "loss": 0.1645, + "step": 290 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 6.347425937652588, + "learning_rate": 9.925705794947994e-08, + "loss": 0.1841, + "step": 300 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 0.30194389820098877, + "learning_rate": 9.922734026745913e-08, + "loss": 0.1825, + "step": 310 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.15537817776203156, + "learning_rate": 9.919762258543834e-08, + "loss": 0.0637, + "step": 320 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 0.07438170909881592, + "learning_rate": 9.916790490341754e-08, + "loss": 0.1308, + "step": 330 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9276169265033407, + "eval_loss": 0.22205422818660736, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.443, + "eval_samples_per_second": 139.377, + "eval_steps_per_second": 17.538, + "step": 337 + }, + { + "epoch": 1.0089020771513353, + "grad_norm": 0.5107494592666626, + "learning_rate": 9.913818722139673e-08, + "loss": 0.0889, + "step": 340 + }, + { + "epoch": 1.0385756676557865, + "grad_norm": 18.18282699584961, + "learning_rate": 9.910846953937593e-08, + "loss": 0.0811, + "step": 350 + }, + { + "epoch": 1.0682492581602374, + "grad_norm": 0.7097955346107483, + "learning_rate": 9.907875185735512e-08, + "loss": 0.1913, + "step": 360 + }, + { + "epoch": 1.0979228486646884, + "grad_norm": 27.824705123901367, + "learning_rate": 9.904903417533432e-08, + "loss": 0.1437, + "step": 370 + }, + { + "epoch": 1.1275964391691395, + "grad_norm": 6.011620044708252, + "learning_rate": 9.901931649331353e-08, + "loss": 0.3995, + "step": 380 + }, + { + "epoch": 1.1572700296735905, + "grad_norm": 7.169536590576172, + "learning_rate": 9.898959881129272e-08, + "loss": 0.1788, + "step": 390 + }, + { + "epoch": 1.1869436201780414, + "grad_norm": 12.991846084594727, + "learning_rate": 9.895988112927192e-08, + "loss": 0.2099, + "step": 400 + }, + { + "epoch": 1.2166172106824926, + "grad_norm": 11.618097305297852, + "learning_rate": 9.893016344725111e-08, + "loss": 0.1416, + "step": 410 + }, + { + "epoch": 1.2462908011869436, + "grad_norm": 3.57865834236145, + "learning_rate": 9.890044576523031e-08, + "loss": 0.2138, + "step": 420 + }, + { + "epoch": 1.2759643916913945, + "grad_norm": 0.1849578469991684, + "learning_rate": 9.88707280832095e-08, + "loss": 0.069, + "step": 430 + }, + { + "epoch": 1.3056379821958457, + "grad_norm": 28.50833511352539, + "learning_rate": 9.884101040118871e-08, + "loss": 0.4369, + "step": 440 + }, + { + "epoch": 1.3353115727002967, + "grad_norm": 16.469209671020508, + "learning_rate": 9.881129271916791e-08, + "loss": 0.2749, + "step": 450 + }, + { + "epoch": 1.3649851632047478, + "grad_norm": 21.51459503173828, + "learning_rate": 9.87815750371471e-08, + "loss": 0.2488, + "step": 460 + }, + { + "epoch": 1.3946587537091988, + "grad_norm": 4.527249336242676, + "learning_rate": 9.87518573551263e-08, + "loss": 0.0564, + "step": 470 + }, + { + "epoch": 1.4243323442136497, + "grad_norm": 0.035517800599336624, + "learning_rate": 9.87221396731055e-08, + "loss": 0.1529, + "step": 480 + }, + { + "epoch": 1.454005934718101, + "grad_norm": 0.34869855642318726, + "learning_rate": 9.869242199108469e-08, + "loss": 0.1386, + "step": 490 + }, + { + "epoch": 1.4836795252225519, + "grad_norm": 0.41621166467666626, + "learning_rate": 9.86627043090639e-08, + "loss": 0.2891, + "step": 500 + }, + { + "epoch": 1.513353115727003, + "grad_norm": 17.198963165283203, + "learning_rate": 9.86329866270431e-08, + "loss": 0.211, + "step": 510 + }, + { + "epoch": 1.543026706231454, + "grad_norm": 0.3732677698135376, + "learning_rate": 9.860326894502229e-08, + "loss": 0.1679, + "step": 520 + }, + { + "epoch": 1.572700296735905, + "grad_norm": 0.4925495386123657, + "learning_rate": 9.857355126300149e-08, + "loss": 0.256, + "step": 530 + }, + { + "epoch": 1.6023738872403561, + "grad_norm": 20.852487564086914, + "learning_rate": 9.854383358098068e-08, + "loss": 0.1569, + "step": 540 + }, + { + "epoch": 1.632047477744807, + "grad_norm": 21.292661666870117, + "learning_rate": 9.851411589895988e-08, + "loss": 0.1516, + "step": 550 + }, + { + "epoch": 1.6617210682492582, + "grad_norm": 3.7981202602386475, + "learning_rate": 9.848439821693909e-08, + "loss": 0.051, + "step": 560 + }, + { + "epoch": 1.6913946587537092, + "grad_norm": 3.502124786376953, + "learning_rate": 9.845468053491828e-08, + "loss": 0.3367, + "step": 570 + }, + { + "epoch": 1.7210682492581602, + "grad_norm": 2.02787446975708, + "learning_rate": 9.842496285289746e-08, + "loss": 0.1914, + "step": 580 + }, + { + "epoch": 1.7507418397626113, + "grad_norm": 0.0951739177107811, + "learning_rate": 9.839524517087666e-08, + "loss": 0.1157, + "step": 590 + }, + { + "epoch": 1.7804154302670623, + "grad_norm": 0.3759836256504059, + "learning_rate": 9.836552748885585e-08, + "loss": 0.1469, + "step": 600 + }, + { + "epoch": 1.8100890207715135, + "grad_norm": 0.06309275329113007, + "learning_rate": 9.833580980683506e-08, + "loss": 0.151, + "step": 610 + }, + { + "epoch": 1.8397626112759644, + "grad_norm": 0.3695402443408966, + "learning_rate": 9.830609212481426e-08, + "loss": 0.1666, + "step": 620 + }, + { + "epoch": 1.8694362017804154, + "grad_norm": 0.6110500693321228, + "learning_rate": 9.827637444279345e-08, + "loss": 0.1231, + "step": 630 + }, + { + "epoch": 1.8991097922848663, + "grad_norm": 5.492069721221924, + "learning_rate": 9.824665676077265e-08, + "loss": 0.1615, + "step": 640 + }, + { + "epoch": 1.9287833827893175, + "grad_norm": 2.2929513454437256, + "learning_rate": 9.821693907875185e-08, + "loss": 0.1468, + "step": 650 + }, + { + "epoch": 1.9584569732937687, + "grad_norm": 3.572039842605591, + "learning_rate": 9.818722139673104e-08, + "loss": 0.2234, + "step": 660 + }, + { + "epoch": 1.9881305637982196, + "grad_norm": 2.8277790546417236, + "learning_rate": 9.815750371471025e-08, + "loss": 0.1551, + "step": 670 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9287305122494433, + "eval_loss": 0.21955013275146484, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4362, + "eval_samples_per_second": 139.524, + "eval_steps_per_second": 17.557, + "step": 674 + }, + { + "epoch": 2.0178041543026706, + "grad_norm": 1.8238873481750488, + "learning_rate": 9.812778603268944e-08, + "loss": 0.0824, + "step": 680 + }, + { + "epoch": 2.0474777448071215, + "grad_norm": 9.894556045532227, + "learning_rate": 9.809806835066864e-08, + "loss": 0.2662, + "step": 690 + }, + { + "epoch": 2.077151335311573, + "grad_norm": 0.3564457893371582, + "learning_rate": 9.806835066864784e-08, + "loss": 0.2184, + "step": 700 + }, + { + "epoch": 2.106824925816024, + "grad_norm": 9.817183494567871, + "learning_rate": 9.803863298662703e-08, + "loss": 0.1024, + "step": 710 + }, + { + "epoch": 2.136498516320475, + "grad_norm": 1.706502079963684, + "learning_rate": 9.800891530460623e-08, + "loss": 0.2483, + "step": 720 + }, + { + "epoch": 2.166172106824926, + "grad_norm": 6.505466938018799, + "learning_rate": 9.797919762258544e-08, + "loss": 0.3066, + "step": 730 + }, + { + "epoch": 2.1958456973293767, + "grad_norm": 32.928218841552734, + "learning_rate": 9.794947994056463e-08, + "loss": 0.2152, + "step": 740 + }, + { + "epoch": 2.2255192878338277, + "grad_norm": 2.1103193759918213, + "learning_rate": 9.791976225854383e-08, + "loss": 0.0757, + "step": 750 + }, + { + "epoch": 2.255192878338279, + "grad_norm": 14.539588928222656, + "learning_rate": 9.789004457652302e-08, + "loss": 0.1916, + "step": 760 + }, + { + "epoch": 2.28486646884273, + "grad_norm": 12.687321662902832, + "learning_rate": 9.786032689450222e-08, + "loss": 0.3382, + "step": 770 + }, + { + "epoch": 2.314540059347181, + "grad_norm": 4.770256519317627, + "learning_rate": 9.783060921248141e-08, + "loss": 0.118, + "step": 780 + }, + { + "epoch": 2.344213649851632, + "grad_norm": 0.96148681640625, + "learning_rate": 9.780089153046062e-08, + "loss": 0.2175, + "step": 790 + }, + { + "epoch": 2.373887240356083, + "grad_norm": 3.5745584964752197, + "learning_rate": 9.777117384843982e-08, + "loss": 0.0621, + "step": 800 + }, + { + "epoch": 2.4035608308605343, + "grad_norm": 0.9035162925720215, + "learning_rate": 9.774145616641901e-08, + "loss": 0.2568, + "step": 810 + }, + { + "epoch": 2.4332344213649852, + "grad_norm": 27.070751190185547, + "learning_rate": 9.771173848439821e-08, + "loss": 0.1798, + "step": 820 + }, + { + "epoch": 2.462908011869436, + "grad_norm": 0.044718727469444275, + "learning_rate": 9.76820208023774e-08, + "loss": 0.3017, + "step": 830 + }, + { + "epoch": 2.492581602373887, + "grad_norm": 11.643866539001465, + "learning_rate": 9.76523031203566e-08, + "loss": 0.112, + "step": 840 + }, + { + "epoch": 2.5222551928783385, + "grad_norm": 13.780647277832031, + "learning_rate": 9.762258543833581e-08, + "loss": 0.1501, + "step": 850 + }, + { + "epoch": 2.551928783382789, + "grad_norm": 1.5545850992202759, + "learning_rate": 9.7592867756315e-08, + "loss": 0.1231, + "step": 860 + }, + { + "epoch": 2.5816023738872405, + "grad_norm": 7.574569225311279, + "learning_rate": 9.75631500742942e-08, + "loss": 0.1798, + "step": 870 + }, + { + "epoch": 2.6112759643916914, + "grad_norm": 5.720097064971924, + "learning_rate": 9.75334323922734e-08, + "loss": 0.215, + "step": 880 + }, + { + "epoch": 2.6409495548961424, + "grad_norm": 0.058183860033750534, + "learning_rate": 9.750371471025259e-08, + "loss": 0.2117, + "step": 890 + }, + { + "epoch": 2.6706231454005933, + "grad_norm": 32.62077331542969, + "learning_rate": 9.747399702823179e-08, + "loss": 0.1662, + "step": 900 + }, + { + "epoch": 2.7002967359050443, + "grad_norm": 17.234649658203125, + "learning_rate": 9.7444279346211e-08, + "loss": 0.358, + "step": 910 + }, + { + "epoch": 2.7299703264094957, + "grad_norm": 1.6736148595809937, + "learning_rate": 9.741456166419019e-08, + "loss": 0.207, + "step": 920 + }, + { + "epoch": 2.7596439169139466, + "grad_norm": 27.09353256225586, + "learning_rate": 9.738484398216939e-08, + "loss": 0.2132, + "step": 930 + }, + { + "epoch": 2.7893175074183976, + "grad_norm": 23.66081428527832, + "learning_rate": 9.735512630014858e-08, + "loss": 0.2213, + "step": 940 + }, + { + "epoch": 2.8189910979228485, + "grad_norm": 1.5521515607833862, + "learning_rate": 9.732540861812778e-08, + "loss": 0.0688, + "step": 950 + }, + { + "epoch": 2.8486646884272995, + "grad_norm": 9.196721076965332, + "learning_rate": 9.729569093610697e-08, + "loss": 0.2753, + "step": 960 + }, + { + "epoch": 2.878338278931751, + "grad_norm": 22.413421630859375, + "learning_rate": 9.726597325408618e-08, + "loss": 0.2315, + "step": 970 + }, + { + "epoch": 2.908011869436202, + "grad_norm": 1.5819326639175415, + "learning_rate": 9.723625557206538e-08, + "loss": 0.0925, + "step": 980 + }, + { + "epoch": 2.9376854599406528, + "grad_norm": 6.5375075340271, + "learning_rate": 9.720653789004457e-08, + "loss": 0.1702, + "step": 990 + }, + { + "epoch": 2.9673590504451037, + "grad_norm": 0.5410488843917847, + "learning_rate": 9.717682020802377e-08, + "loss": 0.1346, + "step": 1000 + }, + { + "epoch": 2.9970326409495547, + "grad_norm": 5.688258647918701, + "learning_rate": 9.714710252600296e-08, + "loss": 0.061, + "step": 1010 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9309576837416481, + "eval_loss": 0.22162236273288727, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4116, + "eval_samples_per_second": 140.058, + "eval_steps_per_second": 17.624, + "step": 1011 + }, + { + "epoch": 3.026706231454006, + "grad_norm": 15.594870567321777, + "learning_rate": 9.711738484398216e-08, + "loss": 0.3625, + "step": 1020 + }, + { + "epoch": 3.056379821958457, + "grad_norm": 4.256815433502197, + "learning_rate": 9.708766716196137e-08, + "loss": 0.1324, + "step": 1030 + }, + { + "epoch": 3.086053412462908, + "grad_norm": 0.16979201138019562, + "learning_rate": 9.705794947994056e-08, + "loss": 0.0816, + "step": 1040 + }, + { + "epoch": 3.115727002967359, + "grad_norm": 19.755964279174805, + "learning_rate": 9.702823179791976e-08, + "loss": 0.3018, + "step": 1050 + }, + { + "epoch": 3.14540059347181, + "grad_norm": 9.49512004852295, + "learning_rate": 9.699851411589895e-08, + "loss": 0.1433, + "step": 1060 + }, + { + "epoch": 3.1750741839762613, + "grad_norm": 1.5727099180221558, + "learning_rate": 9.696879643387815e-08, + "loss": 0.0872, + "step": 1070 + }, + { + "epoch": 3.2047477744807122, + "grad_norm": 2.8518857955932617, + "learning_rate": 9.693907875185734e-08, + "loss": 0.1637, + "step": 1080 + }, + { + "epoch": 3.234421364985163, + "grad_norm": 4.147556781768799, + "learning_rate": 9.690936106983655e-08, + "loss": 0.2143, + "step": 1090 + }, + { + "epoch": 3.264094955489614, + "grad_norm": 0.2385440170764923, + "learning_rate": 9.687964338781575e-08, + "loss": 0.2468, + "step": 1100 + }, + { + "epoch": 3.293768545994065, + "grad_norm": 18.481351852416992, + "learning_rate": 9.684992570579494e-08, + "loss": 0.12, + "step": 1110 + }, + { + "epoch": 3.3234421364985165, + "grad_norm": 8.817008018493652, + "learning_rate": 9.682020802377414e-08, + "loss": 0.0716, + "step": 1120 + }, + { + "epoch": 3.3531157270029674, + "grad_norm": 1.0929116010665894, + "learning_rate": 9.679049034175334e-08, + "loss": 0.2835, + "step": 1130 + }, + { + "epoch": 3.3827893175074184, + "grad_norm": 18.9724063873291, + "learning_rate": 9.676077265973253e-08, + "loss": 0.0711, + "step": 1140 + }, + { + "epoch": 3.4124629080118694, + "grad_norm": 2.1412360668182373, + "learning_rate": 9.673105497771174e-08, + "loss": 0.1571, + "step": 1150 + }, + { + "epoch": 3.4421364985163203, + "grad_norm": 2.8230714797973633, + "learning_rate": 9.670133729569094e-08, + "loss": 0.0882, + "step": 1160 + }, + { + "epoch": 3.4718100890207717, + "grad_norm": 16.23350715637207, + "learning_rate": 9.667161961367013e-08, + "loss": 0.0996, + "step": 1170 + }, + { + "epoch": 3.5014836795252227, + "grad_norm": 0.09892373532056808, + "learning_rate": 9.664190193164933e-08, + "loss": 0.134, + "step": 1180 + }, + { + "epoch": 3.5311572700296736, + "grad_norm": 0.31314051151275635, + "learning_rate": 9.661218424962852e-08, + "loss": 0.1795, + "step": 1190 + }, + { + "epoch": 3.5608308605341246, + "grad_norm": 0.7977163791656494, + "learning_rate": 9.658246656760772e-08, + "loss": 0.4363, + "step": 1200 + }, + { + "epoch": 3.5905044510385755, + "grad_norm": 2.3565633296966553, + "learning_rate": 9.655274888558693e-08, + "loss": 0.1572, + "step": 1210 + }, + { + "epoch": 3.620178041543027, + "grad_norm": 0.6459141373634338, + "learning_rate": 9.652303120356612e-08, + "loss": 0.1299, + "step": 1220 + }, + { + "epoch": 3.649851632047478, + "grad_norm": 1.7934010028839111, + "learning_rate": 9.649331352154532e-08, + "loss": 0.093, + "step": 1230 + }, + { + "epoch": 3.679525222551929, + "grad_norm": 1.0115402936935425, + "learning_rate": 9.646359583952451e-08, + "loss": 0.103, + "step": 1240 + }, + { + "epoch": 3.7091988130563798, + "grad_norm": 7.563291072845459, + "learning_rate": 9.643387815750371e-08, + "loss": 0.2548, + "step": 1250 + }, + { + "epoch": 3.7388724035608307, + "grad_norm": 0.07186966389417648, + "learning_rate": 9.64041604754829e-08, + "loss": 0.1275, + "step": 1260 + }, + { + "epoch": 3.768545994065282, + "grad_norm": 11.680068016052246, + "learning_rate": 9.637444279346211e-08, + "loss": 0.0453, + "step": 1270 + }, + { + "epoch": 3.798219584569733, + "grad_norm": 19.03893280029297, + "learning_rate": 9.634472511144131e-08, + "loss": 0.1882, + "step": 1280 + }, + { + "epoch": 3.827893175074184, + "grad_norm": 0.46553656458854675, + "learning_rate": 9.63150074294205e-08, + "loss": 0.1602, + "step": 1290 + }, + { + "epoch": 3.857566765578635, + "grad_norm": 7.726144790649414, + "learning_rate": 9.62852897473997e-08, + "loss": 0.1461, + "step": 1300 + }, + { + "epoch": 3.887240356083086, + "grad_norm": 0.09316195547580719, + "learning_rate": 9.62555720653789e-08, + "loss": 0.1268, + "step": 1310 + }, + { + "epoch": 3.9169139465875373, + "grad_norm": 33.36213302612305, + "learning_rate": 9.622585438335809e-08, + "loss": 0.324, + "step": 1320 + }, + { + "epoch": 3.9465875370919883, + "grad_norm": 0.8318944573402405, + "learning_rate": 9.61961367013373e-08, + "loss": 0.1072, + "step": 1330 + }, + { + "epoch": 3.9762611275964392, + "grad_norm": 24.752574920654297, + "learning_rate": 9.61664190193165e-08, + "loss": 0.0489, + "step": 1340 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.21980150043964386, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5546, + "eval_samples_per_second": 137.003, + "eval_steps_per_second": 17.24, + "step": 1348 + }, + { + "epoch": 4.005934718100891, + "grad_norm": 5.664433479309082, + "learning_rate": 9.613670133729569e-08, + "loss": 0.249, + "step": 1350 + }, + { + "epoch": 4.035608308605341, + "grad_norm": 13.840527534484863, + "learning_rate": 9.610698365527488e-08, + "loss": 0.1925, + "step": 1360 + }, + { + "epoch": 4.0652818991097925, + "grad_norm": 0.25648483633995056, + "learning_rate": 9.607726597325408e-08, + "loss": 0.1415, + "step": 1370 + }, + { + "epoch": 4.094955489614243, + "grad_norm": 0.10049951821565628, + "learning_rate": 9.604754829123328e-08, + "loss": 0.2459, + "step": 1380 + }, + { + "epoch": 4.1246290801186944, + "grad_norm": 9.84668254852295, + "learning_rate": 9.601783060921248e-08, + "loss": 0.3387, + "step": 1390 + }, + { + "epoch": 4.154302670623146, + "grad_norm": 10.615157127380371, + "learning_rate": 9.598811292719168e-08, + "loss": 0.1516, + "step": 1400 + }, + { + "epoch": 4.183976261127596, + "grad_norm": 0.4312196969985962, + "learning_rate": 9.595839524517088e-08, + "loss": 0.0479, + "step": 1410 + }, + { + "epoch": 4.213649851632048, + "grad_norm": 0.8046972155570984, + "learning_rate": 9.592867756315007e-08, + "loss": 0.2483, + "step": 1420 + }, + { + "epoch": 4.243323442136498, + "grad_norm": 20.6893367767334, + "learning_rate": 9.589895988112927e-08, + "loss": 0.1182, + "step": 1430 + }, + { + "epoch": 4.27299703264095, + "grad_norm": 14.856695175170898, + "learning_rate": 9.586924219910846e-08, + "loss": 0.2759, + "step": 1440 + }, + { + "epoch": 4.302670623145401, + "grad_norm": 2.018095016479492, + "learning_rate": 9.583952451708767e-08, + "loss": 0.1403, + "step": 1450 + }, + { + "epoch": 4.332344213649852, + "grad_norm": 6.393588066101074, + "learning_rate": 9.580980683506687e-08, + "loss": 0.2718, + "step": 1460 + }, + { + "epoch": 4.362017804154303, + "grad_norm": 37.21635055541992, + "learning_rate": 9.578008915304606e-08, + "loss": 0.2314, + "step": 1470 + }, + { + "epoch": 4.3916913946587535, + "grad_norm": 31.304306030273438, + "learning_rate": 9.575037147102526e-08, + "loss": 0.1064, + "step": 1480 + }, + { + "epoch": 4.421364985163205, + "grad_norm": 0.6080612540245056, + "learning_rate": 9.572065378900445e-08, + "loss": 0.2135, + "step": 1490 + }, + { + "epoch": 4.451038575667655, + "grad_norm": 9.551019668579102, + "learning_rate": 9.569093610698366e-08, + "loss": 0.1526, + "step": 1500 + }, + { + "epoch": 4.480712166172107, + "grad_norm": 0.06823253631591797, + "learning_rate": 9.566121842496286e-08, + "loss": 0.1327, + "step": 1510 + }, + { + "epoch": 4.510385756676558, + "grad_norm": 0.05191659927368164, + "learning_rate": 9.563150074294205e-08, + "loss": 0.0751, + "step": 1520 + }, + { + "epoch": 4.540059347181009, + "grad_norm": 19.333621978759766, + "learning_rate": 9.560178306092125e-08, + "loss": 0.1881, + "step": 1530 + }, + { + "epoch": 4.56973293768546, + "grad_norm": 0.6308963298797607, + "learning_rate": 9.557206537890044e-08, + "loss": 0.1114, + "step": 1540 + }, + { + "epoch": 4.5994065281899115, + "grad_norm": 24.189180374145508, + "learning_rate": 9.554234769687964e-08, + "loss": 0.2122, + "step": 1550 + }, + { + "epoch": 4.629080118694362, + "grad_norm": 0.09758025407791138, + "learning_rate": 9.551263001485885e-08, + "loss": 0.0763, + "step": 1560 + }, + { + "epoch": 4.658753709198813, + "grad_norm": 4.695437431335449, + "learning_rate": 9.548291233283804e-08, + "loss": 0.228, + "step": 1570 + }, + { + "epoch": 4.688427299703264, + "grad_norm": 8.652104377746582, + "learning_rate": 9.545319465081724e-08, + "loss": 0.0467, + "step": 1580 + }, + { + "epoch": 4.718100890207715, + "grad_norm": 0.9268213510513306, + "learning_rate": 9.542347696879643e-08, + "loss": 0.2528, + "step": 1590 + }, + { + "epoch": 4.747774480712166, + "grad_norm": 22.154449462890625, + "learning_rate": 9.539375928677563e-08, + "loss": 0.1614, + "step": 1600 + }, + { + "epoch": 4.777448071216617, + "grad_norm": 1.9331769943237305, + "learning_rate": 9.536404160475483e-08, + "loss": 0.1377, + "step": 1610 + }, + { + "epoch": 4.807121661721069, + "grad_norm": 7.637106418609619, + "learning_rate": 9.533432392273403e-08, + "loss": 0.2563, + "step": 1620 + }, + { + "epoch": 4.836795252225519, + "grad_norm": 0.40905532240867615, + "learning_rate": 9.530460624071322e-08, + "loss": 0.1649, + "step": 1630 + }, + { + "epoch": 4.8664688427299705, + "grad_norm": 12.678409576416016, + "learning_rate": 9.527488855869241e-08, + "loss": 0.0932, + "step": 1640 + }, + { + "epoch": 4.896142433234421, + "grad_norm": 27.173160552978516, + "learning_rate": 9.524517087667161e-08, + "loss": 0.149, + "step": 1650 + }, + { + "epoch": 4.925816023738872, + "grad_norm": 0.3214416801929474, + "learning_rate": 9.52154531946508e-08, + "loss": 0.2342, + "step": 1660 + }, + { + "epoch": 4.955489614243324, + "grad_norm": 7.1857805252075195, + "learning_rate": 9.518573551263001e-08, + "loss": 0.0475, + "step": 1670 + }, + { + "epoch": 4.985163204747774, + "grad_norm": 0.6020322442054749, + "learning_rate": 9.515601783060921e-08, + "loss": 0.0597, + "step": 1680 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.22042761743068695, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5018, + "eval_samples_per_second": 138.116, + "eval_steps_per_second": 17.38, + "step": 1685 + }, + { + "epoch": 5.014836795252226, + "grad_norm": 0.03891874477267265, + "learning_rate": 9.51263001485884e-08, + "loss": 0.2467, + "step": 1690 + }, + { + "epoch": 5.044510385756676, + "grad_norm": 10.157297134399414, + "learning_rate": 9.50965824665676e-08, + "loss": 0.2342, + "step": 1700 + }, + { + "epoch": 5.074183976261128, + "grad_norm": 1.7377625703811646, + "learning_rate": 9.50668647845468e-08, + "loss": 0.1763, + "step": 1710 + }, + { + "epoch": 5.103857566765579, + "grad_norm": 15.028761863708496, + "learning_rate": 9.503714710252599e-08, + "loss": 0.1392, + "step": 1720 + }, + { + "epoch": 5.1335311572700295, + "grad_norm": 0.21524587273597717, + "learning_rate": 9.50074294205052e-08, + "loss": 0.175, + "step": 1730 + }, + { + "epoch": 5.163204747774481, + "grad_norm": 24.743486404418945, + "learning_rate": 9.49777117384844e-08, + "loss": 0.1252, + "step": 1740 + }, + { + "epoch": 5.192878338278931, + "grad_norm": 23.803251266479492, + "learning_rate": 9.494799405646359e-08, + "loss": 0.2305, + "step": 1750 + }, + { + "epoch": 5.222551928783383, + "grad_norm": 0.5506026744842529, + "learning_rate": 9.491827637444278e-08, + "loss": 0.155, + "step": 1760 + }, + { + "epoch": 5.252225519287834, + "grad_norm": 0.18158352375030518, + "learning_rate": 9.488855869242198e-08, + "loss": 0.1881, + "step": 1770 + }, + { + "epoch": 5.281899109792285, + "grad_norm": 0.6016466617584229, + "learning_rate": 9.485884101040118e-08, + "loss": 0.1806, + "step": 1780 + }, + { + "epoch": 5.311572700296736, + "grad_norm": 2.5097482204437256, + "learning_rate": 9.482912332838038e-08, + "loss": 0.2145, + "step": 1790 + }, + { + "epoch": 5.341246290801187, + "grad_norm": 2.1078896522521973, + "learning_rate": 9.479940564635958e-08, + "loss": 0.0504, + "step": 1800 + }, + { + "epoch": 5.370919881305638, + "grad_norm": 26.432212829589844, + "learning_rate": 9.476968796433878e-08, + "loss": 0.0545, + "step": 1810 + }, + { + "epoch": 5.400593471810089, + "grad_norm": 22.304941177368164, + "learning_rate": 9.473997028231797e-08, + "loss": 0.1826, + "step": 1820 + }, + { + "epoch": 5.43026706231454, + "grad_norm": 22.587263107299805, + "learning_rate": 9.471025260029717e-08, + "loss": 0.281, + "step": 1830 + }, + { + "epoch": 5.459940652818991, + "grad_norm": 0.0452299527823925, + "learning_rate": 9.468053491827636e-08, + "loss": 0.1794, + "step": 1840 + }, + { + "epoch": 5.489614243323442, + "grad_norm": 13.588239669799805, + "learning_rate": 9.465081723625557e-08, + "loss": 0.1505, + "step": 1850 + }, + { + "epoch": 5.519287833827893, + "grad_norm": 7.825653553009033, + "learning_rate": 9.462109955423477e-08, + "loss": 0.074, + "step": 1860 + }, + { + "epoch": 5.548961424332344, + "grad_norm": 3.1529901027679443, + "learning_rate": 9.459138187221396e-08, + "loss": 0.0974, + "step": 1870 + }, + { + "epoch": 5.578635014836795, + "grad_norm": 6.2667765617370605, + "learning_rate": 9.456166419019316e-08, + "loss": 0.0953, + "step": 1880 + }, + { + "epoch": 5.6083086053412465, + "grad_norm": 1.5149171352386475, + "learning_rate": 9.453194650817235e-08, + "loss": 0.2506, + "step": 1890 + }, + { + "epoch": 5.637982195845697, + "grad_norm": 0.24876146018505096, + "learning_rate": 9.450222882615155e-08, + "loss": 0.2107, + "step": 1900 + }, + { + "epoch": 5.667655786350148, + "grad_norm": 15.837482452392578, + "learning_rate": 9.447251114413076e-08, + "loss": 0.2872, + "step": 1910 + }, + { + "epoch": 5.697329376854599, + "grad_norm": 0.8214932680130005, + "learning_rate": 9.444279346210995e-08, + "loss": 0.1848, + "step": 1920 + }, + { + "epoch": 5.72700296735905, + "grad_norm": 0.059738751500844955, + "learning_rate": 9.441307578008915e-08, + "loss": 0.1357, + "step": 1930 + }, + { + "epoch": 5.756676557863502, + "grad_norm": 1.543289303779602, + "learning_rate": 9.438335809806834e-08, + "loss": 0.1731, + "step": 1940 + }, + { + "epoch": 5.786350148367952, + "grad_norm": 4.5284037590026855, + "learning_rate": 9.435364041604754e-08, + "loss": 0.3643, + "step": 1950 + }, + { + "epoch": 5.816023738872404, + "grad_norm": 4.7399187088012695, + "learning_rate": 9.432392273402673e-08, + "loss": 0.146, + "step": 1960 + }, + { + "epoch": 5.845697329376854, + "grad_norm": 1.707919716835022, + "learning_rate": 9.429420505200594e-08, + "loss": 0.1735, + "step": 1970 + }, + { + "epoch": 5.8753709198813056, + "grad_norm": 0.31268176436424255, + "learning_rate": 9.426448736998514e-08, + "loss": 0.0649, + "step": 1980 + }, + { + "epoch": 5.905044510385757, + "grad_norm": 0.14701899886131287, + "learning_rate": 9.423476968796433e-08, + "loss": 0.0633, + "step": 1990 + }, + { + "epoch": 5.9347181008902075, + "grad_norm": 1.2780276536941528, + "learning_rate": 9.420505200594353e-08, + "loss": 0.1533, + "step": 2000 + }, + { + "epoch": 5.964391691394659, + "grad_norm": 7.38761043548584, + "learning_rate": 9.417533432392273e-08, + "loss": 0.0895, + "step": 2010 + }, + { + "epoch": 5.994065281899109, + "grad_norm": 1.903907299041748, + "learning_rate": 9.414561664190192e-08, + "loss": 0.2264, + "step": 2020 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.21959011256694794, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5054, + "eval_samples_per_second": 138.039, + "eval_steps_per_second": 17.37, + "step": 2022 + }, + { + "epoch": 6.023738872403561, + "grad_norm": 11.533327102661133, + "learning_rate": 9.411589895988113e-08, + "loss": 0.0502, + "step": 2030 + }, + { + "epoch": 6.053412462908012, + "grad_norm": 1.7694834470748901, + "learning_rate": 9.408618127786033e-08, + "loss": 0.1098, + "step": 2040 + }, + { + "epoch": 6.083086053412463, + "grad_norm": 0.21380053460597992, + "learning_rate": 9.405646359583952e-08, + "loss": 0.1602, + "step": 2050 + }, + { + "epoch": 6.112759643916914, + "grad_norm": 24.957130432128906, + "learning_rate": 9.402674591381872e-08, + "loss": 0.1304, + "step": 2060 + }, + { + "epoch": 6.142433234421365, + "grad_norm": 0.6771255135536194, + "learning_rate": 9.399702823179791e-08, + "loss": 0.0181, + "step": 2070 + }, + { + "epoch": 6.172106824925816, + "grad_norm": 6.838940143585205, + "learning_rate": 9.396731054977711e-08, + "loss": 0.1007, + "step": 2080 + }, + { + "epoch": 6.201780415430267, + "grad_norm": 1.5791741609573364, + "learning_rate": 9.393759286775632e-08, + "loss": 0.0781, + "step": 2090 + }, + { + "epoch": 6.231454005934718, + "grad_norm": 0.1715279221534729, + "learning_rate": 9.390787518573551e-08, + "loss": 0.1844, + "step": 2100 + }, + { + "epoch": 6.261127596439169, + "grad_norm": 1.6452608108520508, + "learning_rate": 9.387815750371471e-08, + "loss": 0.1615, + "step": 2110 + }, + { + "epoch": 6.29080118694362, + "grad_norm": 0.09978493303060532, + "learning_rate": 9.38484398216939e-08, + "loss": 0.1281, + "step": 2120 + }, + { + "epoch": 6.320474777448071, + "grad_norm": 2.7937347888946533, + "learning_rate": 9.38187221396731e-08, + "loss": 0.1468, + "step": 2130 + }, + { + "epoch": 6.350148367952523, + "grad_norm": 0.05468699336051941, + "learning_rate": 9.378900445765229e-08, + "loss": 0.0603, + "step": 2140 + }, + { + "epoch": 6.379821958456973, + "grad_norm": 2.123202323913574, + "learning_rate": 9.37592867756315e-08, + "loss": 0.052, + "step": 2150 + }, + { + "epoch": 6.4094955489614245, + "grad_norm": 25.76131820678711, + "learning_rate": 9.37295690936107e-08, + "loss": 0.163, + "step": 2160 + }, + { + "epoch": 6.439169139465875, + "grad_norm": 22.840877532958984, + "learning_rate": 9.369985141158989e-08, + "loss": 0.253, + "step": 2170 + }, + { + "epoch": 6.468842729970326, + "grad_norm": 7.853781700134277, + "learning_rate": 9.367013372956909e-08, + "loss": 0.1275, + "step": 2180 + }, + { + "epoch": 6.498516320474778, + "grad_norm": 0.4352964460849762, + "learning_rate": 9.364041604754828e-08, + "loss": 0.2543, + "step": 2190 + }, + { + "epoch": 6.528189910979228, + "grad_norm": 20.616052627563477, + "learning_rate": 9.361069836552748e-08, + "loss": 0.1558, + "step": 2200 + }, + { + "epoch": 6.55786350148368, + "grad_norm": 2.2548294067382812, + "learning_rate": 9.358098068350669e-08, + "loss": 0.1124, + "step": 2210 + }, + { + "epoch": 6.58753709198813, + "grad_norm": 0.04087768495082855, + "learning_rate": 9.355126300148588e-08, + "loss": 0.2847, + "step": 2220 + }, + { + "epoch": 6.617210682492582, + "grad_norm": 10.290130615234375, + "learning_rate": 9.352154531946508e-08, + "loss": 0.2369, + "step": 2230 + }, + { + "epoch": 6.646884272997033, + "grad_norm": 11.530308723449707, + "learning_rate": 9.349182763744427e-08, + "loss": 0.3249, + "step": 2240 + }, + { + "epoch": 6.6765578635014835, + "grad_norm": 0.19080044329166412, + "learning_rate": 9.346210995542347e-08, + "loss": 0.1413, + "step": 2250 + }, + { + "epoch": 6.706231454005935, + "grad_norm": 8.740340232849121, + "learning_rate": 9.343239227340267e-08, + "loss": 0.2014, + "step": 2260 + }, + { + "epoch": 6.735905044510385, + "grad_norm": 0.28369346261024475, + "learning_rate": 9.340267459138187e-08, + "loss": 0.1382, + "step": 2270 + }, + { + "epoch": 6.765578635014837, + "grad_norm": 10.544970512390137, + "learning_rate": 9.337295690936107e-08, + "loss": 0.0716, + "step": 2280 + }, + { + "epoch": 6.795252225519288, + "grad_norm": 8.014925003051758, + "learning_rate": 9.334323922734027e-08, + "loss": 0.1219, + "step": 2290 + }, + { + "epoch": 6.824925816023739, + "grad_norm": 8.281830787658691, + "learning_rate": 9.331352154531946e-08, + "loss": 0.2684, + "step": 2300 + }, + { + "epoch": 6.85459940652819, + "grad_norm": 3.888934850692749, + "learning_rate": 9.328380386329866e-08, + "loss": 0.2444, + "step": 2310 + }, + { + "epoch": 6.884272997032641, + "grad_norm": 18.619415283203125, + "learning_rate": 9.325408618127785e-08, + "loss": 0.1922, + "step": 2320 + }, + { + "epoch": 6.913946587537092, + "grad_norm": 11.178292274475098, + "learning_rate": 9.322436849925706e-08, + "loss": 0.1251, + "step": 2330 + }, + { + "epoch": 6.943620178041543, + "grad_norm": 8.197835922241211, + "learning_rate": 9.319465081723626e-08, + "loss": 0.2675, + "step": 2340 + }, + { + "epoch": 6.973293768545994, + "grad_norm": 20.707571029663086, + "learning_rate": 9.316493313521545e-08, + "loss": 0.3079, + "step": 2350 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.2174277901649475, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.8291, + "eval_samples_per_second": 131.496, + "eval_steps_per_second": 16.547, + "step": 2359 + }, + { + "epoch": 7.002967359050445, + "grad_norm": 17.041221618652344, + "learning_rate": 9.313521545319465e-08, + "loss": 0.0955, + "step": 2360 + }, + { + "epoch": 7.032640949554896, + "grad_norm": 0.027638059109449387, + "learning_rate": 9.310549777117384e-08, + "loss": 0.1875, + "step": 2370 + }, + { + "epoch": 7.062314540059347, + "grad_norm": 12.500692367553711, + "learning_rate": 9.307578008915304e-08, + "loss": 0.1934, + "step": 2380 + }, + { + "epoch": 7.091988130563799, + "grad_norm": 0.26454639434814453, + "learning_rate": 9.304606240713225e-08, + "loss": 0.1491, + "step": 2390 + }, + { + "epoch": 7.121661721068249, + "grad_norm": 0.25949811935424805, + "learning_rate": 9.301634472511144e-08, + "loss": 0.0901, + "step": 2400 + }, + { + "epoch": 7.1513353115727005, + "grad_norm": 0.05942277982831001, + "learning_rate": 9.298662704309064e-08, + "loss": 0.2742, + "step": 2410 + }, + { + "epoch": 7.181008902077151, + "grad_norm": 32.26604080200195, + "learning_rate": 9.295690936106983e-08, + "loss": 0.3326, + "step": 2420 + }, + { + "epoch": 7.210682492581602, + "grad_norm": 1.570496916770935, + "learning_rate": 9.292719167904903e-08, + "loss": 0.1845, + "step": 2430 + }, + { + "epoch": 7.240356083086054, + "grad_norm": 8.201016426086426, + "learning_rate": 9.289747399702822e-08, + "loss": 0.1702, + "step": 2440 + }, + { + "epoch": 7.270029673590504, + "grad_norm": 17.853300094604492, + "learning_rate": 9.286775631500743e-08, + "loss": 0.2506, + "step": 2450 + }, + { + "epoch": 7.299703264094956, + "grad_norm": 7.054744720458984, + "learning_rate": 9.283803863298663e-08, + "loss": 0.2073, + "step": 2460 + }, + { + "epoch": 7.329376854599406, + "grad_norm": 13.076616287231445, + "learning_rate": 9.280832095096582e-08, + "loss": 0.2507, + "step": 2470 + }, + { + "epoch": 7.359050445103858, + "grad_norm": 13.066951751708984, + "learning_rate": 9.277860326894502e-08, + "loss": 0.1105, + "step": 2480 + }, + { + "epoch": 7.388724035608309, + "grad_norm": 28.354564666748047, + "learning_rate": 9.274888558692422e-08, + "loss": 0.085, + "step": 2490 + }, + { + "epoch": 7.4183976261127595, + "grad_norm": 0.3784800171852112, + "learning_rate": 9.271916790490341e-08, + "loss": 0.0343, + "step": 2500 + }, + { + "epoch": 7.448071216617211, + "grad_norm": 0.6029236912727356, + "learning_rate": 9.268945022288262e-08, + "loss": 0.275, + "step": 2510 + }, + { + "epoch": 7.4777448071216615, + "grad_norm": 2.1216726303100586, + "learning_rate": 9.265973254086182e-08, + "loss": 0.1623, + "step": 2520 + }, + { + "epoch": 7.507418397626113, + "grad_norm": 0.0355231836438179, + "learning_rate": 9.263001485884101e-08, + "loss": 0.1408, + "step": 2530 + }, + { + "epoch": 7.537091988130564, + "grad_norm": 0.29140162467956543, + "learning_rate": 9.26002971768202e-08, + "loss": 0.1015, + "step": 2540 + }, + { + "epoch": 7.566765578635015, + "grad_norm": 0.29732346534729004, + "learning_rate": 9.25705794947994e-08, + "loss": 0.0699, + "step": 2550 + }, + { + "epoch": 7.596439169139466, + "grad_norm": 0.2110615074634552, + "learning_rate": 9.25408618127786e-08, + "loss": 0.0751, + "step": 2560 + }, + { + "epoch": 7.626112759643917, + "grad_norm": 8.235937118530273, + "learning_rate": 9.25111441307578e-08, + "loss": 0.1675, + "step": 2570 + }, + { + "epoch": 7.655786350148368, + "grad_norm": 0.5667470693588257, + "learning_rate": 9.2481426448737e-08, + "loss": 0.0689, + "step": 2580 + }, + { + "epoch": 7.6854599406528195, + "grad_norm": 5.608954429626465, + "learning_rate": 9.24517087667162e-08, + "loss": 0.2088, + "step": 2590 + }, + { + "epoch": 7.71513353115727, + "grad_norm": 0.2770245671272278, + "learning_rate": 9.242199108469539e-08, + "loss": 0.0525, + "step": 2600 + }, + { + "epoch": 7.744807121661721, + "grad_norm": 24.04474449157715, + "learning_rate": 9.239227340267459e-08, + "loss": 0.1339, + "step": 2610 + }, + { + "epoch": 7.774480712166172, + "grad_norm": 12.285730361938477, + "learning_rate": 9.236255572065378e-08, + "loss": 0.1512, + "step": 2620 + }, + { + "epoch": 7.804154302670623, + "grad_norm": 0.9491327404975891, + "learning_rate": 9.233283803863299e-08, + "loss": 0.0579, + "step": 2630 + }, + { + "epoch": 7.833827893175075, + "grad_norm": 22.28582763671875, + "learning_rate": 9.230312035661219e-08, + "loss": 0.1705, + "step": 2640 + }, + { + "epoch": 7.863501483679525, + "grad_norm": 5.683590888977051, + "learning_rate": 9.227340267459138e-08, + "loss": 0.1553, + "step": 2650 + }, + { + "epoch": 7.893175074183977, + "grad_norm": 0.3525567352771759, + "learning_rate": 9.224368499257058e-08, + "loss": 0.0262, + "step": 2660 + }, + { + "epoch": 7.922848664688427, + "grad_norm": 13.616344451904297, + "learning_rate": 9.221396731054977e-08, + "loss": 0.2904, + "step": 2670 + }, + { + "epoch": 7.9525222551928785, + "grad_norm": 10.47485065460205, + "learning_rate": 9.218424962852897e-08, + "loss": 0.2029, + "step": 2680 + }, + { + "epoch": 7.98219584569733, + "grad_norm": 17.774648666381836, + "learning_rate": 9.215453194650817e-08, + "loss": 0.2106, + "step": 2690 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9309576837416481, + "eval_loss": 0.21911980211734772, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5515, + "eval_samples_per_second": 137.068, + "eval_steps_per_second": 17.248, + "step": 2696 + }, + { + "epoch": 8.011869436201781, + "grad_norm": 24.107219696044922, + "learning_rate": 9.212481426448736e-08, + "loss": 0.2328, + "step": 2700 + }, + { + "epoch": 8.041543026706231, + "grad_norm": 15.06407642364502, + "learning_rate": 9.209509658246656e-08, + "loss": 0.137, + "step": 2710 + }, + { + "epoch": 8.071216617210682, + "grad_norm": 1.4588819742202759, + "learning_rate": 9.206537890044575e-08, + "loss": 0.1164, + "step": 2720 + }, + { + "epoch": 8.100890207715134, + "grad_norm": 9.553811073303223, + "learning_rate": 9.203566121842495e-08, + "loss": 0.0509, + "step": 2730 + }, + { + "epoch": 8.130563798219585, + "grad_norm": 3.9450254440307617, + "learning_rate": 9.200594353640416e-08, + "loss": 0.1089, + "step": 2740 + }, + { + "epoch": 8.160237388724036, + "grad_norm": 12.40637493133545, + "learning_rate": 9.197622585438335e-08, + "loss": 0.3249, + "step": 2750 + }, + { + "epoch": 8.189910979228486, + "grad_norm": 0.10400036722421646, + "learning_rate": 9.194650817236255e-08, + "loss": 0.0394, + "step": 2760 + }, + { + "epoch": 8.219584569732937, + "grad_norm": 0.6901931762695312, + "learning_rate": 9.191679049034174e-08, + "loss": 0.1, + "step": 2770 + }, + { + "epoch": 8.249258160237389, + "grad_norm": 0.25287920236587524, + "learning_rate": 9.188707280832094e-08, + "loss": 0.2565, + "step": 2780 + }, + { + "epoch": 8.27893175074184, + "grad_norm": 0.2964816391468048, + "learning_rate": 9.185735512630013e-08, + "loss": 0.1042, + "step": 2790 + }, + { + "epoch": 8.308605341246292, + "grad_norm": 8.276501655578613, + "learning_rate": 9.182763744427934e-08, + "loss": 0.2532, + "step": 2800 + }, + { + "epoch": 8.338278931750741, + "grad_norm": 0.1342378556728363, + "learning_rate": 9.179791976225854e-08, + "loss": 0.1946, + "step": 2810 + }, + { + "epoch": 8.367952522255193, + "grad_norm": 1.319739818572998, + "learning_rate": 9.176820208023773e-08, + "loss": 0.4211, + "step": 2820 + }, + { + "epoch": 8.397626112759644, + "grad_norm": 1.121277093887329, + "learning_rate": 9.173848439821693e-08, + "loss": 0.0599, + "step": 2830 + }, + { + "epoch": 8.427299703264095, + "grad_norm": 16.37959098815918, + "learning_rate": 9.170876671619612e-08, + "loss": 0.1308, + "step": 2840 + }, + { + "epoch": 8.456973293768545, + "grad_norm": 5.720865726470947, + "learning_rate": 9.167904903417532e-08, + "loss": 0.1212, + "step": 2850 + }, + { + "epoch": 8.486646884272997, + "grad_norm": 0.27242183685302734, + "learning_rate": 9.164933135215453e-08, + "loss": 0.0869, + "step": 2860 + }, + { + "epoch": 8.516320474777448, + "grad_norm": 27.89917755126953, + "learning_rate": 9.161961367013372e-08, + "loss": 0.1789, + "step": 2870 + }, + { + "epoch": 8.5459940652819, + "grad_norm": 21.476816177368164, + "learning_rate": 9.158989598811292e-08, + "loss": 0.1614, + "step": 2880 + }, + { + "epoch": 8.57566765578635, + "grad_norm": 0.8670193552970886, + "learning_rate": 9.156017830609212e-08, + "loss": 0.0711, + "step": 2890 + }, + { + "epoch": 8.605341246290802, + "grad_norm": 2.2620668411254883, + "learning_rate": 9.153046062407131e-08, + "loss": 0.0315, + "step": 2900 + }, + { + "epoch": 8.635014836795252, + "grad_norm": 1.7524352073669434, + "learning_rate": 9.15007429420505e-08, + "loss": 0.1375, + "step": 2910 + }, + { + "epoch": 8.664688427299703, + "grad_norm": 0.3612070381641388, + "learning_rate": 9.147102526002972e-08, + "loss": 0.3209, + "step": 2920 + }, + { + "epoch": 8.694362017804155, + "grad_norm": 0.3898479640483856, + "learning_rate": 9.144130757800891e-08, + "loss": 0.1687, + "step": 2930 + }, + { + "epoch": 8.724035608308606, + "grad_norm": 0.21251612901687622, + "learning_rate": 9.14115898959881e-08, + "loss": 0.2432, + "step": 2940 + }, + { + "epoch": 8.753709198813056, + "grad_norm": 42.85517120361328, + "learning_rate": 9.13818722139673e-08, + "loss": 0.2343, + "step": 2950 + }, + { + "epoch": 8.783382789317507, + "grad_norm": 32.96746826171875, + "learning_rate": 9.13521545319465e-08, + "loss": 0.2233, + "step": 2960 + }, + { + "epoch": 8.813056379821958, + "grad_norm": 0.10275249928236008, + "learning_rate": 9.132243684992569e-08, + "loss": 0.0424, + "step": 2970 + }, + { + "epoch": 8.84272997032641, + "grad_norm": 18.732297897338867, + "learning_rate": 9.12927191679049e-08, + "loss": 0.1184, + "step": 2980 + }, + { + "epoch": 8.872403560830861, + "grad_norm": 1.1528812646865845, + "learning_rate": 9.12630014858841e-08, + "loss": 0.0322, + "step": 2990 + }, + { + "epoch": 8.90207715133531, + "grad_norm": 0.85603266954422, + "learning_rate": 9.123328380386329e-08, + "loss": 0.1462, + "step": 3000 + }, + { + "epoch": 8.931750741839762, + "grad_norm": 20.252397537231445, + "learning_rate": 9.120356612184249e-08, + "loss": 0.1394, + "step": 3010 + }, + { + "epoch": 8.961424332344214, + "grad_norm": 14.308334350585938, + "learning_rate": 9.117384843982168e-08, + "loss": 0.3164, + "step": 3020 + }, + { + "epoch": 8.991097922848665, + "grad_norm": 30.625259399414062, + "learning_rate": 9.114413075780089e-08, + "loss": 0.0844, + "step": 3030 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.22208766639232635, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.6508, + "eval_samples_per_second": 135.021, + "eval_steps_per_second": 16.99, + "step": 3033 + }, + { + "epoch": 9.020771513353116, + "grad_norm": 0.3939421474933624, + "learning_rate": 9.111441307578009e-08, + "loss": 0.133, + "step": 3040 + }, + { + "epoch": 9.050445103857566, + "grad_norm": 2.0974738597869873, + "learning_rate": 9.108469539375928e-08, + "loss": 0.103, + "step": 3050 + }, + { + "epoch": 9.080118694362017, + "grad_norm": 0.10231151431798935, + "learning_rate": 9.105497771173848e-08, + "loss": 0.0458, + "step": 3060 + }, + { + "epoch": 9.109792284866469, + "grad_norm": 0.24687761068344116, + "learning_rate": 9.102526002971767e-08, + "loss": 0.1846, + "step": 3070 + }, + { + "epoch": 9.13946587537092, + "grad_norm": 18.640626907348633, + "learning_rate": 9.099554234769687e-08, + "loss": 0.0903, + "step": 3080 + }, + { + "epoch": 9.169139465875372, + "grad_norm": 12.755825996398926, + "learning_rate": 9.096582466567608e-08, + "loss": 0.0973, + "step": 3090 + }, + { + "epoch": 9.198813056379821, + "grad_norm": 14.442891120910645, + "learning_rate": 9.093610698365527e-08, + "loss": 0.2226, + "step": 3100 + }, + { + "epoch": 9.228486646884273, + "grad_norm": 6.920801162719727, + "learning_rate": 9.090638930163447e-08, + "loss": 0.1157, + "step": 3110 + }, + { + "epoch": 9.258160237388724, + "grad_norm": 2.163926124572754, + "learning_rate": 9.087667161961366e-08, + "loss": 0.2638, + "step": 3120 + }, + { + "epoch": 9.287833827893175, + "grad_norm": 0.025466131046414375, + "learning_rate": 9.084695393759286e-08, + "loss": 0.1513, + "step": 3130 + }, + { + "epoch": 9.317507418397627, + "grad_norm": 0.12605036795139313, + "learning_rate": 9.081723625557206e-08, + "loss": 0.1642, + "step": 3140 + }, + { + "epoch": 9.347181008902076, + "grad_norm": 2.3774449825286865, + "learning_rate": 9.078751857355126e-08, + "loss": 0.1526, + "step": 3150 + }, + { + "epoch": 9.376854599406528, + "grad_norm": 0.1418914645910263, + "learning_rate": 9.075780089153046e-08, + "loss": 0.2275, + "step": 3160 + }, + { + "epoch": 9.40652818991098, + "grad_norm": 0.2972716689109802, + "learning_rate": 9.072808320950966e-08, + "loss": 0.1661, + "step": 3170 + }, + { + "epoch": 9.43620178041543, + "grad_norm": 17.706111907958984, + "learning_rate": 9.069836552748885e-08, + "loss": 0.1698, + "step": 3180 + }, + { + "epoch": 9.465875370919882, + "grad_norm": 16.488874435424805, + "learning_rate": 9.066864784546805e-08, + "loss": 0.1881, + "step": 3190 + }, + { + "epoch": 9.495548961424332, + "grad_norm": 22.875417709350586, + "learning_rate": 9.063893016344724e-08, + "loss": 0.1366, + "step": 3200 + }, + { + "epoch": 9.525222551928783, + "grad_norm": 4.580638885498047, + "learning_rate": 9.060921248142645e-08, + "loss": 0.2142, + "step": 3210 + }, + { + "epoch": 9.554896142433234, + "grad_norm": 11.091020584106445, + "learning_rate": 9.057949479940565e-08, + "loss": 0.159, + "step": 3220 + }, + { + "epoch": 9.584569732937686, + "grad_norm": 13.344446182250977, + "learning_rate": 9.054977711738484e-08, + "loss": 0.024, + "step": 3230 + }, + { + "epoch": 9.614243323442137, + "grad_norm": 0.2928605377674103, + "learning_rate": 9.052005943536404e-08, + "loss": 0.1155, + "step": 3240 + }, + { + "epoch": 9.643916913946587, + "grad_norm": 0.06850615888834, + "learning_rate": 9.049034175334323e-08, + "loss": 0.1119, + "step": 3250 + }, + { + "epoch": 9.673590504451038, + "grad_norm": 2.413254499435425, + "learning_rate": 9.046062407132243e-08, + "loss": 0.3793, + "step": 3260 + }, + { + "epoch": 9.70326409495549, + "grad_norm": 15.867901802062988, + "learning_rate": 9.043090638930164e-08, + "loss": 0.1708, + "step": 3270 + }, + { + "epoch": 9.732937685459941, + "grad_norm": 0.6947576403617859, + "learning_rate": 9.040118870728083e-08, + "loss": 0.133, + "step": 3280 + }, + { + "epoch": 9.762611275964392, + "grad_norm": 1.1813578605651855, + "learning_rate": 9.037147102526003e-08, + "loss": 0.147, + "step": 3290 + }, + { + "epoch": 9.792284866468842, + "grad_norm": 1.2385790348052979, + "learning_rate": 9.034175334323922e-08, + "loss": 0.3009, + "step": 3300 + }, + { + "epoch": 9.821958456973293, + "grad_norm": 24.9970703125, + "learning_rate": 9.031203566121842e-08, + "loss": 0.1131, + "step": 3310 + }, + { + "epoch": 9.851632047477745, + "grad_norm": 11.930057525634766, + "learning_rate": 9.028231797919761e-08, + "loss": 0.1365, + "step": 3320 + }, + { + "epoch": 9.881305637982196, + "grad_norm": 13.112090110778809, + "learning_rate": 9.025260029717682e-08, + "loss": 0.0878, + "step": 3330 + }, + { + "epoch": 9.910979228486648, + "grad_norm": 10.377202987670898, + "learning_rate": 9.022288261515602e-08, + "loss": 0.0871, + "step": 3340 + }, + { + "epoch": 9.940652818991097, + "grad_norm": 1.3489915132522583, + "learning_rate": 9.019316493313521e-08, + "loss": 0.1515, + "step": 3350 + }, + { + "epoch": 9.970326409495549, + "grad_norm": 9.637001991271973, + "learning_rate": 9.016344725111441e-08, + "loss": 0.2148, + "step": 3360 + }, + { + "epoch": 10.0, + "grad_norm": 49.020973205566406, + "learning_rate": 9.01337295690936e-08, + "loss": 0.1854, + "step": 3370 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9298440979955457, + "eval_loss": 0.22109197080135345, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.004, + "eval_samples_per_second": 81.606, + "eval_steps_per_second": 10.269, + "step": 3370 + }, + { + "epoch": 10.029673590504451, + "grad_norm": 16.169588088989258, + "learning_rate": 9.01040118870728e-08, + "loss": 0.3014, + "step": 3380 + }, + { + "epoch": 10.059347181008903, + "grad_norm": 18.831567764282227, + "learning_rate": 9.007429420505201e-08, + "loss": 0.2898, + "step": 3390 + }, + { + "epoch": 10.089020771513352, + "grad_norm": 0.08478633314371109, + "learning_rate": 9.00445765230312e-08, + "loss": 0.0215, + "step": 3400 + }, + { + "epoch": 10.118694362017804, + "grad_norm": 26.11762809753418, + "learning_rate": 9.00148588410104e-08, + "loss": 0.2864, + "step": 3410 + }, + { + "epoch": 10.148367952522255, + "grad_norm": 0.8345847725868225, + "learning_rate": 8.99851411589896e-08, + "loss": 0.139, + "step": 3420 + }, + { + "epoch": 10.178041543026707, + "grad_norm": 0.77332603931427, + "learning_rate": 8.995542347696879e-08, + "loss": 0.2118, + "step": 3430 + }, + { + "epoch": 10.207715133531158, + "grad_norm": 12.60360050201416, + "learning_rate": 8.992570579494799e-08, + "loss": 0.2436, + "step": 3440 + }, + { + "epoch": 10.237388724035608, + "grad_norm": 0.405839204788208, + "learning_rate": 8.98959881129272e-08, + "loss": 0.1032, + "step": 3450 + }, + { + "epoch": 10.267062314540059, + "grad_norm": 0.0863649919629097, + "learning_rate": 8.986627043090639e-08, + "loss": 0.0722, + "step": 3460 + }, + { + "epoch": 10.29673590504451, + "grad_norm": 0.2525078058242798, + "learning_rate": 8.983655274888559e-08, + "loss": 0.0676, + "step": 3470 + }, + { + "epoch": 10.326409495548962, + "grad_norm": 0.5855773091316223, + "learning_rate": 8.980683506686478e-08, + "loss": 0.0859, + "step": 3480 + }, + { + "epoch": 10.356083086053413, + "grad_norm": 22.568944931030273, + "learning_rate": 8.977711738484398e-08, + "loss": 0.2962, + "step": 3490 + }, + { + "epoch": 10.385756676557863, + "grad_norm": 21.30486297607422, + "learning_rate": 8.974739970282317e-08, + "loss": 0.0966, + "step": 3500 + }, + { + "epoch": 10.415430267062314, + "grad_norm": 1.1676387786865234, + "learning_rate": 8.971768202080238e-08, + "loss": 0.1006, + "step": 3510 + }, + { + "epoch": 10.445103857566766, + "grad_norm": 0.09169832617044449, + "learning_rate": 8.968796433878158e-08, + "loss": 0.1794, + "step": 3520 + }, + { + "epoch": 10.474777448071217, + "grad_norm": 0.034213222563266754, + "learning_rate": 8.965824665676077e-08, + "loss": 0.1431, + "step": 3530 + }, + { + "epoch": 10.504451038575668, + "grad_norm": 29.456317901611328, + "learning_rate": 8.962852897473997e-08, + "loss": 0.1719, + "step": 3540 + }, + { + "epoch": 10.534124629080118, + "grad_norm": 17.015743255615234, + "learning_rate": 8.959881129271916e-08, + "loss": 0.1941, + "step": 3550 + }, + { + "epoch": 10.56379821958457, + "grad_norm": 1.1065995693206787, + "learning_rate": 8.956909361069836e-08, + "loss": 0.201, + "step": 3560 + }, + { + "epoch": 10.59347181008902, + "grad_norm": 0.5447260737419128, + "learning_rate": 8.953937592867757e-08, + "loss": 0.0792, + "step": 3570 + }, + { + "epoch": 10.623145400593472, + "grad_norm": 0.39647677540779114, + "learning_rate": 8.950965824665676e-08, + "loss": 0.1648, + "step": 3580 + }, + { + "epoch": 10.652818991097924, + "grad_norm": 9.720879554748535, + "learning_rate": 8.947994056463596e-08, + "loss": 0.1307, + "step": 3590 + }, + { + "epoch": 10.682492581602373, + "grad_norm": 11.971400260925293, + "learning_rate": 8.945022288261516e-08, + "loss": 0.0529, + "step": 3600 + }, + { + "epoch": 10.712166172106825, + "grad_norm": 3.7124195098876953, + "learning_rate": 8.942050520059435e-08, + "loss": 0.151, + "step": 3610 + }, + { + "epoch": 10.741839762611276, + "grad_norm": 4.488855838775635, + "learning_rate": 8.939078751857355e-08, + "loss": 0.1335, + "step": 3620 + }, + { + "epoch": 10.771513353115727, + "grad_norm": 0.0224628746509552, + "learning_rate": 8.936106983655275e-08, + "loss": 0.0739, + "step": 3630 + }, + { + "epoch": 10.801186943620179, + "grad_norm": 7.953196048736572, + "learning_rate": 8.933135215453195e-08, + "loss": 0.0892, + "step": 3640 + }, + { + "epoch": 10.830860534124628, + "grad_norm": 7.482048988342285, + "learning_rate": 8.930163447251115e-08, + "loss": 0.2132, + "step": 3650 + }, + { + "epoch": 10.86053412462908, + "grad_norm": 0.28434640169143677, + "learning_rate": 8.927191679049034e-08, + "loss": 0.2755, + "step": 3660 + }, + { + "epoch": 10.890207715133531, + "grad_norm": 0.03850617632269859, + "learning_rate": 8.924219910846954e-08, + "loss": 0.1239, + "step": 3670 + }, + { + "epoch": 10.919881305637983, + "grad_norm": 1.5239570140838623, + "learning_rate": 8.921248142644873e-08, + "loss": 0.1832, + "step": 3680 + }, + { + "epoch": 10.949554896142434, + "grad_norm": 1.2762856483459473, + "learning_rate": 8.918276374442794e-08, + "loss": 0.1727, + "step": 3690 + }, + { + "epoch": 10.979228486646884, + "grad_norm": 16.80221939086914, + "learning_rate": 8.915304606240714e-08, + "loss": 0.2154, + "step": 3700 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.22015173733234406, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.9912, + "eval_samples_per_second": 81.702, + "eval_steps_per_second": 10.281, + "step": 3707 + }, + { + "epoch": 11.008902077151335, + "grad_norm": 6.15535831451416, + "learning_rate": 8.912332838038633e-08, + "loss": 0.066, + "step": 3710 + }, + { + "epoch": 11.038575667655786, + "grad_norm": 0.046502452343702316, + "learning_rate": 8.909361069836553e-08, + "loss": 0.1583, + "step": 3720 + }, + { + "epoch": 11.068249258160238, + "grad_norm": 0.10934209823608398, + "learning_rate": 8.906389301634472e-08, + "loss": 0.0661, + "step": 3730 + }, + { + "epoch": 11.09792284866469, + "grad_norm": 9.474443435668945, + "learning_rate": 8.903417533432392e-08, + "loss": 0.3723, + "step": 3740 + }, + { + "epoch": 11.127596439169139, + "grad_norm": 21.12241554260254, + "learning_rate": 8.900445765230311e-08, + "loss": 0.1351, + "step": 3750 + }, + { + "epoch": 11.15727002967359, + "grad_norm": 8.242246627807617, + "learning_rate": 8.897473997028231e-08, + "loss": 0.2095, + "step": 3760 + }, + { + "epoch": 11.186943620178042, + "grad_norm": 34.50856018066406, + "learning_rate": 8.89450222882615e-08, + "loss": 0.1801, + "step": 3770 + }, + { + "epoch": 11.216617210682493, + "grad_norm": 24.16424560546875, + "learning_rate": 8.89153046062407e-08, + "loss": 0.0372, + "step": 3780 + }, + { + "epoch": 11.246290801186944, + "grad_norm": 11.621710777282715, + "learning_rate": 8.88855869242199e-08, + "loss": 0.3184, + "step": 3790 + }, + { + "epoch": 11.275964391691394, + "grad_norm": 0.8992859721183777, + "learning_rate": 8.88558692421991e-08, + "loss": 0.1621, + "step": 3800 + }, + { + "epoch": 11.305637982195845, + "grad_norm": 19.456026077270508, + "learning_rate": 8.88261515601783e-08, + "loss": 0.0989, + "step": 3810 + }, + { + "epoch": 11.335311572700297, + "grad_norm": 0.041665442287921906, + "learning_rate": 8.87964338781575e-08, + "loss": 0.1117, + "step": 3820 + }, + { + "epoch": 11.364985163204748, + "grad_norm": 0.8888910412788391, + "learning_rate": 8.876671619613669e-08, + "loss": 0.1463, + "step": 3830 + }, + { + "epoch": 11.3946587537092, + "grad_norm": 13.716751098632812, + "learning_rate": 8.873699851411589e-08, + "loss": 0.1736, + "step": 3840 + }, + { + "epoch": 11.42433234421365, + "grad_norm": 16.228403091430664, + "learning_rate": 8.870728083209508e-08, + "loss": 0.2301, + "step": 3850 + }, + { + "epoch": 11.4540059347181, + "grad_norm": 0.3054238259792328, + "learning_rate": 8.867756315007429e-08, + "loss": 0.0806, + "step": 3860 + }, + { + "epoch": 11.483679525222552, + "grad_norm": 17.66272735595703, + "learning_rate": 8.864784546805349e-08, + "loss": 0.2471, + "step": 3870 + }, + { + "epoch": 11.513353115727003, + "grad_norm": 1.2907960414886475, + "learning_rate": 8.861812778603268e-08, + "loss": 0.1475, + "step": 3880 + }, + { + "epoch": 11.543026706231455, + "grad_norm": 2.2657744884490967, + "learning_rate": 8.858841010401188e-08, + "loss": 0.1208, + "step": 3890 + }, + { + "epoch": 11.572700296735905, + "grad_norm": 3.258605718612671, + "learning_rate": 8.855869242199107e-08, + "loss": 0.1882, + "step": 3900 + }, + { + "epoch": 11.602373887240356, + "grad_norm": 30.848482131958008, + "learning_rate": 8.852897473997027e-08, + "loss": 0.0772, + "step": 3910 + }, + { + "epoch": 11.632047477744807, + "grad_norm": 15.555870056152344, + "learning_rate": 8.849925705794948e-08, + "loss": 0.345, + "step": 3920 + }, + { + "epoch": 11.661721068249259, + "grad_norm": 10.853310585021973, + "learning_rate": 8.846953937592867e-08, + "loss": 0.1074, + "step": 3930 + }, + { + "epoch": 11.691394658753708, + "grad_norm": 23.74134635925293, + "learning_rate": 8.843982169390787e-08, + "loss": 0.1901, + "step": 3940 + }, + { + "epoch": 11.72106824925816, + "grad_norm": 0.23338347673416138, + "learning_rate": 8.841010401188706e-08, + "loss": 0.2067, + "step": 3950 + }, + { + "epoch": 11.750741839762611, + "grad_norm": 21.99913215637207, + "learning_rate": 8.838038632986626e-08, + "loss": 0.1886, + "step": 3960 + }, + { + "epoch": 11.780415430267063, + "grad_norm": 6.419389247894287, + "learning_rate": 8.835066864784546e-08, + "loss": 0.1294, + "step": 3970 + }, + { + "epoch": 11.810089020771514, + "grad_norm": 0.052077826112508774, + "learning_rate": 8.832095096582466e-08, + "loss": 0.0542, + "step": 3980 + }, + { + "epoch": 11.839762611275965, + "grad_norm": 15.534661293029785, + "learning_rate": 8.829123328380386e-08, + "loss": 0.1772, + "step": 3990 + }, + { + "epoch": 11.869436201780415, + "grad_norm": 23.976192474365234, + "learning_rate": 8.826151560178305e-08, + "loss": 0.1134, + "step": 4000 + }, + { + "epoch": 11.899109792284866, + "grad_norm": 12.765986442565918, + "learning_rate": 8.823179791976225e-08, + "loss": 0.1904, + "step": 4010 + }, + { + "epoch": 11.928783382789318, + "grad_norm": 2.1996729373931885, + "learning_rate": 8.820208023774145e-08, + "loss": 0.1163, + "step": 4020 + }, + { + "epoch": 11.958456973293769, + "grad_norm": 6.991058826446533, + "learning_rate": 8.817236255572064e-08, + "loss": 0.1233, + "step": 4030 + }, + { + "epoch": 11.988130563798219, + "grad_norm": 1.23166024684906, + "learning_rate": 8.814264487369985e-08, + "loss": 0.2043, + "step": 4040 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9309576837416481, + "eval_loss": 0.2198721468448639, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.808, + "eval_samples_per_second": 83.086, + "eval_steps_per_second": 10.455, + "step": 4044 + }, + { + "epoch": 12.01780415430267, + "grad_norm": 12.069042205810547, + "learning_rate": 8.811292719167905e-08, + "loss": 0.2869, + "step": 4050 + }, + { + "epoch": 12.047477744807122, + "grad_norm": 0.6559314131736755, + "learning_rate": 8.808320950965824e-08, + "loss": 0.1104, + "step": 4060 + }, + { + "epoch": 12.077151335311573, + "grad_norm": 30.510793685913086, + "learning_rate": 8.805349182763744e-08, + "loss": 0.1183, + "step": 4070 + }, + { + "epoch": 12.106824925816024, + "grad_norm": 0.8435547351837158, + "learning_rate": 8.802377414561663e-08, + "loss": 0.1817, + "step": 4080 + }, + { + "epoch": 12.136498516320474, + "grad_norm": 18.076919555664062, + "learning_rate": 8.799405646359583e-08, + "loss": 0.1911, + "step": 4090 + }, + { + "epoch": 12.166172106824925, + "grad_norm": 1.4586642980575562, + "learning_rate": 8.796433878157504e-08, + "loss": 0.3478, + "step": 4100 + }, + { + "epoch": 12.195845697329377, + "grad_norm": 31.108623504638672, + "learning_rate": 8.793462109955423e-08, + "loss": 0.168, + "step": 4110 + }, + { + "epoch": 12.225519287833828, + "grad_norm": 12.87566089630127, + "learning_rate": 8.790490341753343e-08, + "loss": 0.1741, + "step": 4120 + }, + { + "epoch": 12.25519287833828, + "grad_norm": 22.065593719482422, + "learning_rate": 8.787518573551262e-08, + "loss": 0.1237, + "step": 4130 + }, + { + "epoch": 12.28486646884273, + "grad_norm": 1.696725606918335, + "learning_rate": 8.784546805349182e-08, + "loss": 0.1599, + "step": 4140 + }, + { + "epoch": 12.31454005934718, + "grad_norm": 0.23535385727882385, + "learning_rate": 8.781575037147101e-08, + "loss": 0.1507, + "step": 4150 + }, + { + "epoch": 12.344213649851632, + "grad_norm": 0.6873607635498047, + "learning_rate": 8.778603268945022e-08, + "loss": 0.0387, + "step": 4160 + }, + { + "epoch": 12.373887240356083, + "grad_norm": 0.7653520703315735, + "learning_rate": 8.775631500742942e-08, + "loss": 0.0498, + "step": 4170 + }, + { + "epoch": 12.403560830860535, + "grad_norm": 0.42647314071655273, + "learning_rate": 8.772659732540861e-08, + "loss": 0.1271, + "step": 4180 + }, + { + "epoch": 12.433234421364984, + "grad_norm": 2.022610664367676, + "learning_rate": 8.769687964338781e-08, + "loss": 0.1681, + "step": 4190 + }, + { + "epoch": 12.462908011869436, + "grad_norm": 4.275892734527588, + "learning_rate": 8.7667161961367e-08, + "loss": 0.14, + "step": 4200 + }, + { + "epoch": 12.492581602373887, + "grad_norm": 0.4738043546676636, + "learning_rate": 8.76374442793462e-08, + "loss": 0.1636, + "step": 4210 + }, + { + "epoch": 12.522255192878339, + "grad_norm": 3.2330703735351562, + "learning_rate": 8.760772659732541e-08, + "loss": 0.07, + "step": 4220 + }, + { + "epoch": 12.55192878338279, + "grad_norm": 19.79316520690918, + "learning_rate": 8.75780089153046e-08, + "loss": 0.1279, + "step": 4230 + }, + { + "epoch": 12.58160237388724, + "grad_norm": 14.1696195602417, + "learning_rate": 8.75482912332838e-08, + "loss": 0.1473, + "step": 4240 + }, + { + "epoch": 12.611275964391691, + "grad_norm": 0.28780511021614075, + "learning_rate": 8.7518573551263e-08, + "loss": 0.0984, + "step": 4250 + }, + { + "epoch": 12.640949554896142, + "grad_norm": 17.837764739990234, + "learning_rate": 8.748885586924219e-08, + "loss": 0.0804, + "step": 4260 + }, + { + "epoch": 12.670623145400594, + "grad_norm": 0.019604748114943504, + "learning_rate": 8.745913818722139e-08, + "loss": 0.2305, + "step": 4270 + }, + { + "epoch": 12.700296735905045, + "grad_norm": 3.1394736766815186, + "learning_rate": 8.74294205052006e-08, + "loss": 0.1089, + "step": 4280 + }, + { + "epoch": 12.729970326409495, + "grad_norm": 22.184972763061523, + "learning_rate": 8.739970282317979e-08, + "loss": 0.1396, + "step": 4290 + }, + { + "epoch": 12.759643916913946, + "grad_norm": 4.390349864959717, + "learning_rate": 8.736998514115899e-08, + "loss": 0.1166, + "step": 4300 + }, + { + "epoch": 12.789317507418398, + "grad_norm": 0.45344337821006775, + "learning_rate": 8.734026745913818e-08, + "loss": 0.1531, + "step": 4310 + }, + { + "epoch": 12.818991097922849, + "grad_norm": 32.4881591796875, + "learning_rate": 8.731054977711738e-08, + "loss": 0.1501, + "step": 4320 + }, + { + "epoch": 12.8486646884273, + "grad_norm": 31.16307830810547, + "learning_rate": 8.728083209509657e-08, + "loss": 0.2246, + "step": 4330 + }, + { + "epoch": 12.87833827893175, + "grad_norm": 0.30473053455352783, + "learning_rate": 8.725111441307578e-08, + "loss": 0.1377, + "step": 4340 + }, + { + "epoch": 12.908011869436201, + "grad_norm": 8.339690208435059, + "learning_rate": 8.722139673105498e-08, + "loss": 0.2458, + "step": 4350 + }, + { + "epoch": 12.937685459940653, + "grad_norm": 0.2281438410282135, + "learning_rate": 8.719167904903417e-08, + "loss": 0.1499, + "step": 4360 + }, + { + "epoch": 12.967359050445104, + "grad_norm": 0.11619602143764496, + "learning_rate": 8.716196136701337e-08, + "loss": 0.1962, + "step": 4370 + }, + { + "epoch": 12.997032640949556, + "grad_norm": 6.358119487762451, + "learning_rate": 8.713224368499256e-08, + "loss": 0.0453, + "step": 4380 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.218551367521286, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.853, + "eval_samples_per_second": 82.742, + "eval_steps_per_second": 10.412, + "step": 4381 + }, + { + "epoch": 13.026706231454005, + "grad_norm": 0.5942051410675049, + "learning_rate": 8.710252600297176e-08, + "loss": 0.059, + "step": 4390 + }, + { + "epoch": 13.056379821958457, + "grad_norm": 0.3411566913127899, + "learning_rate": 8.707280832095097e-08, + "loss": 0.1309, + "step": 4400 + }, + { + "epoch": 13.086053412462908, + "grad_norm": 0.3161620497703552, + "learning_rate": 8.704309063893016e-08, + "loss": 0.1433, + "step": 4410 + }, + { + "epoch": 13.11572700296736, + "grad_norm": 1.3269423246383667, + "learning_rate": 8.701337295690936e-08, + "loss": 0.1246, + "step": 4420 + }, + { + "epoch": 13.14540059347181, + "grad_norm": 13.976136207580566, + "learning_rate": 8.698365527488855e-08, + "loss": 0.3153, + "step": 4430 + }, + { + "epoch": 13.17507418397626, + "grad_norm": 2.6477408409118652, + "learning_rate": 8.695393759286775e-08, + "loss": 0.1076, + "step": 4440 + }, + { + "epoch": 13.204747774480712, + "grad_norm": 0.6852326989173889, + "learning_rate": 8.692421991084696e-08, + "loss": 0.311, + "step": 4450 + }, + { + "epoch": 13.234421364985163, + "grad_norm": 0.5858950614929199, + "learning_rate": 8.689450222882615e-08, + "loss": 0.2726, + "step": 4460 + }, + { + "epoch": 13.264094955489615, + "grad_norm": 4.602228164672852, + "learning_rate": 8.686478454680535e-08, + "loss": 0.0844, + "step": 4470 + }, + { + "epoch": 13.293768545994066, + "grad_norm": 13.746955871582031, + "learning_rate": 8.683506686478455e-08, + "loss": 0.1829, + "step": 4480 + }, + { + "epoch": 13.323442136498516, + "grad_norm": 9.661428451538086, + "learning_rate": 8.680534918276374e-08, + "loss": 0.102, + "step": 4490 + }, + { + "epoch": 13.353115727002967, + "grad_norm": 29.154470443725586, + "learning_rate": 8.677563150074294e-08, + "loss": 0.3589, + "step": 4500 + }, + { + "epoch": 13.382789317507418, + "grad_norm": 0.8160312175750732, + "learning_rate": 8.674591381872214e-08, + "loss": 0.0798, + "step": 4510 + }, + { + "epoch": 13.41246290801187, + "grad_norm": 0.5174208283424377, + "learning_rate": 8.671619613670134e-08, + "loss": 0.2771, + "step": 4520 + }, + { + "epoch": 13.442136498516321, + "grad_norm": 2.9450185298919678, + "learning_rate": 8.668647845468054e-08, + "loss": 0.2008, + "step": 4530 + }, + { + "epoch": 13.47181008902077, + "grad_norm": 0.0856231227517128, + "learning_rate": 8.665676077265973e-08, + "loss": 0.0869, + "step": 4540 + }, + { + "epoch": 13.501483679525222, + "grad_norm": 0.3743760585784912, + "learning_rate": 8.662704309063893e-08, + "loss": 0.1608, + "step": 4550 + }, + { + "epoch": 13.531157270029674, + "grad_norm": 13.917235374450684, + "learning_rate": 8.659732540861812e-08, + "loss": 0.1235, + "step": 4560 + }, + { + "epoch": 13.560830860534125, + "grad_norm": 14.658812522888184, + "learning_rate": 8.656760772659733e-08, + "loss": 0.1374, + "step": 4570 + }, + { + "epoch": 13.590504451038576, + "grad_norm": 22.579662322998047, + "learning_rate": 8.653789004457653e-08, + "loss": 0.1403, + "step": 4580 + }, + { + "epoch": 13.620178041543026, + "grad_norm": 0.46094509959220886, + "learning_rate": 8.650817236255572e-08, + "loss": 0.0534, + "step": 4590 + }, + { + "epoch": 13.649851632047477, + "grad_norm": 0.37638503313064575, + "learning_rate": 8.647845468053492e-08, + "loss": 0.1634, + "step": 4600 + }, + { + "epoch": 13.679525222551929, + "grad_norm": 13.219155311584473, + "learning_rate": 8.644873699851411e-08, + "loss": 0.248, + "step": 4610 + }, + { + "epoch": 13.70919881305638, + "grad_norm": 6.3478217124938965, + "learning_rate": 8.641901931649331e-08, + "loss": 0.2434, + "step": 4620 + }, + { + "epoch": 13.738872403560832, + "grad_norm": 6.77518892288208, + "learning_rate": 8.638930163447252e-08, + "loss": 0.0525, + "step": 4630 + }, + { + "epoch": 13.768545994065281, + "grad_norm": 0.15782105922698975, + "learning_rate": 8.635958395245171e-08, + "loss": 0.1004, + "step": 4640 + }, + { + "epoch": 13.798219584569733, + "grad_norm": 0.8929498195648193, + "learning_rate": 8.632986627043091e-08, + "loss": 0.1159, + "step": 4650 + }, + { + "epoch": 13.827893175074184, + "grad_norm": 1.554740071296692, + "learning_rate": 8.63001485884101e-08, + "loss": 0.3884, + "step": 4660 + }, + { + "epoch": 13.857566765578635, + "grad_norm": 0.1420467048883438, + "learning_rate": 8.62704309063893e-08, + "loss": 0.1993, + "step": 4670 + }, + { + "epoch": 13.887240356083087, + "grad_norm": 0.09709582477807999, + "learning_rate": 8.62407132243685e-08, + "loss": 0.0345, + "step": 4680 + }, + { + "epoch": 13.916913946587536, + "grad_norm": 0.2600967288017273, + "learning_rate": 8.62109955423477e-08, + "loss": 0.1363, + "step": 4690 + }, + { + "epoch": 13.946587537091988, + "grad_norm": 14.861351013183594, + "learning_rate": 8.61812778603269e-08, + "loss": 0.1393, + "step": 4700 + }, + { + "epoch": 13.97626112759644, + "grad_norm": 0.15369026362895966, + "learning_rate": 8.61515601783061e-08, + "loss": 0.0341, + "step": 4710 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.21791110932826996, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.7542, + "eval_samples_per_second": 76.398, + "eval_steps_per_second": 9.614, + "step": 4718 + }, + { + "epoch": 14.00593471810089, + "grad_norm": 3.44569993019104, + "learning_rate": 8.612184249628529e-08, + "loss": 0.3124, + "step": 4720 + }, + { + "epoch": 14.035608308605342, + "grad_norm": 0.16537100076675415, + "learning_rate": 8.609212481426449e-08, + "loss": 0.2707, + "step": 4730 + }, + { + "epoch": 14.065281899109792, + "grad_norm": 21.553791046142578, + "learning_rate": 8.606240713224368e-08, + "loss": 0.1396, + "step": 4740 + }, + { + "epoch": 14.094955489614243, + "grad_norm": 5.116982460021973, + "learning_rate": 8.603268945022289e-08, + "loss": 0.099, + "step": 4750 + }, + { + "epoch": 14.124629080118694, + "grad_norm": 4.130936622619629, + "learning_rate": 8.600297176820209e-08, + "loss": 0.0886, + "step": 4760 + }, + { + "epoch": 14.154302670623146, + "grad_norm": 7.831549644470215, + "learning_rate": 8.597325408618128e-08, + "loss": 0.1932, + "step": 4770 + }, + { + "epoch": 14.183976261127597, + "grad_norm": 26.29744529724121, + "learning_rate": 8.594353640416048e-08, + "loss": 0.1151, + "step": 4780 + }, + { + "epoch": 14.213649851632047, + "grad_norm": 7.119102478027344, + "learning_rate": 8.591381872213966e-08, + "loss": 0.3009, + "step": 4790 + }, + { + "epoch": 14.243323442136498, + "grad_norm": 6.333064556121826, + "learning_rate": 8.588410104011887e-08, + "loss": 0.1705, + "step": 4800 + }, + { + "epoch": 14.27299703264095, + "grad_norm": 0.28550052642822266, + "learning_rate": 8.585438335809806e-08, + "loss": 0.0313, + "step": 4810 + }, + { + "epoch": 14.302670623145401, + "grad_norm": 14.81749153137207, + "learning_rate": 8.582466567607726e-08, + "loss": 0.1118, + "step": 4820 + }, + { + "epoch": 14.332344213649852, + "grad_norm": 7.3748087882995605, + "learning_rate": 8.579494799405645e-08, + "loss": 0.0987, + "step": 4830 + }, + { + "epoch": 14.362017804154302, + "grad_norm": 1.4910824298858643, + "learning_rate": 8.576523031203565e-08, + "loss": 0.1356, + "step": 4840 + }, + { + "epoch": 14.391691394658753, + "grad_norm": 7.39053201675415, + "learning_rate": 8.573551263001485e-08, + "loss": 0.1948, + "step": 4850 + }, + { + "epoch": 14.421364985163205, + "grad_norm": 25.01482582092285, + "learning_rate": 8.570579494799405e-08, + "loss": 0.243, + "step": 4860 + }, + { + "epoch": 14.451038575667656, + "grad_norm": 19.681995391845703, + "learning_rate": 8.567607726597325e-08, + "loss": 0.3065, + "step": 4870 + }, + { + "epoch": 14.480712166172108, + "grad_norm": 0.27532094717025757, + "learning_rate": 8.564635958395244e-08, + "loss": 0.0093, + "step": 4880 + }, + { + "epoch": 14.510385756676557, + "grad_norm": 0.08915796130895615, + "learning_rate": 8.561664190193164e-08, + "loss": 0.1708, + "step": 4890 + }, + { + "epoch": 14.540059347181009, + "grad_norm": 0.2097565233707428, + "learning_rate": 8.558692421991084e-08, + "loss": 0.157, + "step": 4900 + }, + { + "epoch": 14.56973293768546, + "grad_norm": 5.392805099487305, + "learning_rate": 8.555720653789003e-08, + "loss": 0.0552, + "step": 4910 + }, + { + "epoch": 14.599406528189911, + "grad_norm": 11.225603103637695, + "learning_rate": 8.552748885586924e-08, + "loss": 0.224, + "step": 4920 + }, + { + "epoch": 14.629080118694361, + "grad_norm": 0.3402024805545807, + "learning_rate": 8.549777117384844e-08, + "loss": 0.0399, + "step": 4930 + }, + { + "epoch": 14.658753709198812, + "grad_norm": 14.319948196411133, + "learning_rate": 8.546805349182763e-08, + "loss": 0.2361, + "step": 4940 + }, + { + "epoch": 14.688427299703264, + "grad_norm": 0.6662145256996155, + "learning_rate": 8.543833580980683e-08, + "loss": 0.1299, + "step": 4950 + }, + { + "epoch": 14.718100890207715, + "grad_norm": 1.3279954195022583, + "learning_rate": 8.540861812778602e-08, + "loss": 0.2809, + "step": 4960 + }, + { + "epoch": 14.747774480712167, + "grad_norm": 16.631370544433594, + "learning_rate": 8.537890044576522e-08, + "loss": 0.1327, + "step": 4970 + }, + { + "epoch": 14.777448071216618, + "grad_norm": 0.2511196732521057, + "learning_rate": 8.534918276374443e-08, + "loss": 0.1092, + "step": 4980 + }, + { + "epoch": 14.807121661721068, + "grad_norm": 15.578234672546387, + "learning_rate": 8.531946508172362e-08, + "loss": 0.0574, + "step": 4990 + }, + { + "epoch": 14.836795252225519, + "grad_norm": 17.094385147094727, + "learning_rate": 8.528974739970282e-08, + "loss": 0.1929, + "step": 5000 + }, + { + "epoch": 14.86646884272997, + "grad_norm": 6.12711238861084, + "learning_rate": 8.526002971768201e-08, + "loss": 0.1257, + "step": 5010 + }, + { + "epoch": 14.896142433234422, + "grad_norm": 2.0925960540771484, + "learning_rate": 8.523031203566121e-08, + "loss": 0.0666, + "step": 5020 + }, + { + "epoch": 14.925816023738872, + "grad_norm": 29.710464477539062, + "learning_rate": 8.52005943536404e-08, + "loss": 0.1266, + "step": 5030 + }, + { + "epoch": 14.955489614243323, + "grad_norm": 1.0740097761154175, + "learning_rate": 8.517087667161961e-08, + "loss": 0.1939, + "step": 5040 + }, + { + "epoch": 14.985163204747774, + "grad_norm": 5.65964412689209, + "learning_rate": 8.514115898959881e-08, + "loss": 0.1163, + "step": 5050 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.2183542251586914, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.3808, + "eval_samples_per_second": 78.905, + "eval_steps_per_second": 9.929, + "step": 5055 + }, + { + "epoch": 15.014836795252226, + "grad_norm": 7.287575721740723, + "learning_rate": 8.5111441307578e-08, + "loss": 0.2429, + "step": 5060 + }, + { + "epoch": 15.044510385756677, + "grad_norm": 1.3448513746261597, + "learning_rate": 8.50817236255572e-08, + "loss": 0.1875, + "step": 5070 + }, + { + "epoch": 15.074183976261127, + "grad_norm": 16.378055572509766, + "learning_rate": 8.50520059435364e-08, + "loss": 0.15, + "step": 5080 + }, + { + "epoch": 15.103857566765578, + "grad_norm": 27.48067855834961, + "learning_rate": 8.502228826151559e-08, + "loss": 0.2398, + "step": 5090 + }, + { + "epoch": 15.13353115727003, + "grad_norm": 4.838328838348389, + "learning_rate": 8.49925705794948e-08, + "loss": 0.1331, + "step": 5100 + }, + { + "epoch": 15.163204747774481, + "grad_norm": 3.51663875579834, + "learning_rate": 8.4962852897474e-08, + "loss": 0.0533, + "step": 5110 + }, + { + "epoch": 15.192878338278932, + "grad_norm": 0.05941024795174599, + "learning_rate": 8.493313521545319e-08, + "loss": 0.074, + "step": 5120 + }, + { + "epoch": 15.222551928783382, + "grad_norm": 2.2444064617156982, + "learning_rate": 8.490341753343239e-08, + "loss": 0.161, + "step": 5130 + }, + { + "epoch": 15.252225519287833, + "grad_norm": 6.648094177246094, + "learning_rate": 8.487369985141158e-08, + "loss": 0.1725, + "step": 5140 + }, + { + "epoch": 15.281899109792285, + "grad_norm": 8.694971084594727, + "learning_rate": 8.484398216939078e-08, + "loss": 0.2625, + "step": 5150 + }, + { + "epoch": 15.311572700296736, + "grad_norm": 0.28906384110450745, + "learning_rate": 8.481426448736999e-08, + "loss": 0.0818, + "step": 5160 + }, + { + "epoch": 15.341246290801188, + "grad_norm": 0.18401595950126648, + "learning_rate": 8.478454680534918e-08, + "loss": 0.2675, + "step": 5170 + }, + { + "epoch": 15.370919881305637, + "grad_norm": 0.08934751898050308, + "learning_rate": 8.475482912332838e-08, + "loss": 0.0709, + "step": 5180 + }, + { + "epoch": 15.400593471810089, + "grad_norm": 5.323742389678955, + "learning_rate": 8.472511144130757e-08, + "loss": 0.18, + "step": 5190 + }, + { + "epoch": 15.43026706231454, + "grad_norm": 11.872135162353516, + "learning_rate": 8.469539375928677e-08, + "loss": 0.1195, + "step": 5200 + }, + { + "epoch": 15.459940652818991, + "grad_norm": 2.5972564220428467, + "learning_rate": 8.466567607726596e-08, + "loss": 0.0598, + "step": 5210 + }, + { + "epoch": 15.489614243323443, + "grad_norm": 0.6143264174461365, + "learning_rate": 8.463595839524517e-08, + "loss": 0.1025, + "step": 5220 + }, + { + "epoch": 15.519287833827892, + "grad_norm": 29.412145614624023, + "learning_rate": 8.460624071322437e-08, + "loss": 0.2406, + "step": 5230 + }, + { + "epoch": 15.548961424332344, + "grad_norm": 7.201059341430664, + "learning_rate": 8.457652303120356e-08, + "loss": 0.1946, + "step": 5240 + }, + { + "epoch": 15.578635014836795, + "grad_norm": 21.108232498168945, + "learning_rate": 8.454680534918276e-08, + "loss": 0.1383, + "step": 5250 + }, + { + "epoch": 15.608308605341247, + "grad_norm": 2.12418794631958, + "learning_rate": 8.451708766716195e-08, + "loss": 0.0734, + "step": 5260 + }, + { + "epoch": 15.637982195845698, + "grad_norm": 9.827413558959961, + "learning_rate": 8.448736998514115e-08, + "loss": 0.1592, + "step": 5270 + }, + { + "epoch": 15.667655786350148, + "grad_norm": 0.1662551760673523, + "learning_rate": 8.445765230312036e-08, + "loss": 0.0761, + "step": 5280 + }, + { + "epoch": 15.697329376854599, + "grad_norm": 0.09902050346136093, + "learning_rate": 8.442793462109955e-08, + "loss": 0.0959, + "step": 5290 + }, + { + "epoch": 15.72700296735905, + "grad_norm": 22.60340690612793, + "learning_rate": 8.439821693907875e-08, + "loss": 0.1165, + "step": 5300 + }, + { + "epoch": 15.756676557863502, + "grad_norm": 0.6083064079284668, + "learning_rate": 8.436849925705794e-08, + "loss": 0.0727, + "step": 5310 + }, + { + "epoch": 15.786350148367953, + "grad_norm": 0.16090227663516998, + "learning_rate": 8.433878157503714e-08, + "loss": 0.1536, + "step": 5320 + }, + { + "epoch": 15.816023738872403, + "grad_norm": 17.398096084594727, + "learning_rate": 8.430906389301634e-08, + "loss": 0.2392, + "step": 5330 + }, + { + "epoch": 15.845697329376854, + "grad_norm": 14.167173385620117, + "learning_rate": 8.427934621099554e-08, + "loss": 0.266, + "step": 5340 + }, + { + "epoch": 15.875370919881306, + "grad_norm": 18.738821029663086, + "learning_rate": 8.424962852897474e-08, + "loss": 0.3097, + "step": 5350 + }, + { + "epoch": 15.905044510385757, + "grad_norm": 1.416981816291809, + "learning_rate": 8.421991084695394e-08, + "loss": 0.3116, + "step": 5360 + }, + { + "epoch": 15.934718100890208, + "grad_norm": 23.83690643310547, + "learning_rate": 8.419019316493313e-08, + "loss": 0.2631, + "step": 5370 + }, + { + "epoch": 15.964391691394658, + "grad_norm": 0.13201266527175903, + "learning_rate": 8.416047548291233e-08, + "loss": 0.0869, + "step": 5380 + }, + { + "epoch": 15.99406528189911, + "grad_norm": 4.664916038513184, + "learning_rate": 8.413075780089152e-08, + "loss": 0.2807, + "step": 5390 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.2191184014081955, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.9599, + "eval_samples_per_second": 81.935, + "eval_steps_per_second": 10.31, + "step": 5392 + }, + { + "epoch": 16.023738872403563, + "grad_norm": 7.359623432159424, + "learning_rate": 8.410104011887073e-08, + "loss": 0.097, + "step": 5400 + }, + { + "epoch": 16.05341246290801, + "grad_norm": 0.08648430556058884, + "learning_rate": 8.407132243684993e-08, + "loss": 0.2055, + "step": 5410 + }, + { + "epoch": 16.083086053412462, + "grad_norm": 3.901839256286621, + "learning_rate": 8.404160475482912e-08, + "loss": 0.0548, + "step": 5420 + }, + { + "epoch": 16.112759643916913, + "grad_norm": 4.2089643478393555, + "learning_rate": 8.401188707280832e-08, + "loss": 0.1473, + "step": 5430 + }, + { + "epoch": 16.142433234421365, + "grad_norm": 15.44429874420166, + "learning_rate": 8.398216939078751e-08, + "loss": 0.2111, + "step": 5440 + }, + { + "epoch": 16.172106824925816, + "grad_norm": 22.765958786010742, + "learning_rate": 8.395245170876671e-08, + "loss": 0.1991, + "step": 5450 + }, + { + "epoch": 16.201780415430267, + "grad_norm": 25.229028701782227, + "learning_rate": 8.392273402674592e-08, + "loss": 0.0522, + "step": 5460 + }, + { + "epoch": 16.23145400593472, + "grad_norm": 3.4227893352508545, + "learning_rate": 8.389301634472511e-08, + "loss": 0.0498, + "step": 5470 + }, + { + "epoch": 16.26112759643917, + "grad_norm": 3.9687438011169434, + "learning_rate": 8.386329866270431e-08, + "loss": 0.0394, + "step": 5480 + }, + { + "epoch": 16.29080118694362, + "grad_norm": 0.26801198720932007, + "learning_rate": 8.38335809806835e-08, + "loss": 0.1375, + "step": 5490 + }, + { + "epoch": 16.320474777448073, + "grad_norm": 31.103164672851562, + "learning_rate": 8.38038632986627e-08, + "loss": 0.0542, + "step": 5500 + }, + { + "epoch": 16.35014836795252, + "grad_norm": 10.182798385620117, + "learning_rate": 8.37741456166419e-08, + "loss": 0.0836, + "step": 5510 + }, + { + "epoch": 16.379821958456972, + "grad_norm": 24.28949737548828, + "learning_rate": 8.37444279346211e-08, + "loss": 0.1686, + "step": 5520 + }, + { + "epoch": 16.409495548961424, + "grad_norm": 39.19709396362305, + "learning_rate": 8.37147102526003e-08, + "loss": 0.1599, + "step": 5530 + }, + { + "epoch": 16.439169139465875, + "grad_norm": 1.3991438150405884, + "learning_rate": 8.36849925705795e-08, + "loss": 0.159, + "step": 5540 + }, + { + "epoch": 16.468842729970326, + "grad_norm": 0.11362085491418839, + "learning_rate": 8.365527488855869e-08, + "loss": 0.1259, + "step": 5550 + }, + { + "epoch": 16.498516320474778, + "grad_norm": 0.1367943435907364, + "learning_rate": 8.362555720653789e-08, + "loss": 0.2296, + "step": 5560 + }, + { + "epoch": 16.52818991097923, + "grad_norm": 5.24260139465332, + "learning_rate": 8.359583952451708e-08, + "loss": 0.0529, + "step": 5570 + }, + { + "epoch": 16.55786350148368, + "grad_norm": 0.07251410186290741, + "learning_rate": 8.356612184249629e-08, + "loss": 0.1457, + "step": 5580 + }, + { + "epoch": 16.587537091988132, + "grad_norm": 10.372644424438477, + "learning_rate": 8.353640416047548e-08, + "loss": 0.3712, + "step": 5590 + }, + { + "epoch": 16.617210682492583, + "grad_norm": 9.296265602111816, + "learning_rate": 8.350668647845468e-08, + "loss": 0.132, + "step": 5600 + }, + { + "epoch": 16.64688427299703, + "grad_norm": 0.07070301473140717, + "learning_rate": 8.347696879643388e-08, + "loss": 0.2253, + "step": 5610 + }, + { + "epoch": 16.676557863501483, + "grad_norm": 0.7787969708442688, + "learning_rate": 8.344725111441307e-08, + "loss": 0.0605, + "step": 5620 + }, + { + "epoch": 16.706231454005934, + "grad_norm": 0.4044439196586609, + "learning_rate": 8.341753343239227e-08, + "loss": 0.0231, + "step": 5630 + }, + { + "epoch": 16.735905044510385, + "grad_norm": 2.9235448837280273, + "learning_rate": 8.338781575037148e-08, + "loss": 0.1698, + "step": 5640 + }, + { + "epoch": 16.765578635014837, + "grad_norm": 0.4720080494880676, + "learning_rate": 8.335809806835067e-08, + "loss": 0.1682, + "step": 5650 + }, + { + "epoch": 16.795252225519288, + "grad_norm": 1.6086479425430298, + "learning_rate": 8.332838038632987e-08, + "loss": 0.1479, + "step": 5660 + }, + { + "epoch": 16.82492581602374, + "grad_norm": 19.91450309753418, + "learning_rate": 8.329866270430906e-08, + "loss": 0.1927, + "step": 5670 + }, + { + "epoch": 16.85459940652819, + "grad_norm": 7.199525356292725, + "learning_rate": 8.326894502228826e-08, + "loss": 0.1024, + "step": 5680 + }, + { + "epoch": 16.884272997032642, + "grad_norm": 16.60344696044922, + "learning_rate": 8.323922734026745e-08, + "loss": 0.2484, + "step": 5690 + }, + { + "epoch": 16.91394658753709, + "grad_norm": 19.962974548339844, + "learning_rate": 8.320950965824666e-08, + "loss": 0.1185, + "step": 5700 + }, + { + "epoch": 16.94362017804154, + "grad_norm": 0.9305756092071533, + "learning_rate": 8.317979197622586e-08, + "loss": 0.2038, + "step": 5710 + }, + { + "epoch": 16.973293768545993, + "grad_norm": 0.6293435096740723, + "learning_rate": 8.315007429420505e-08, + "loss": 0.0484, + "step": 5720 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.21858079731464386, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.1344, + "eval_samples_per_second": 80.651, + "eval_steps_per_second": 10.149, + "step": 5729 + }, + { + "epoch": 17.002967359050444, + "grad_norm": 9.28915023803711, + "learning_rate": 8.312035661218425e-08, + "loss": 0.2686, + "step": 5730 + }, + { + "epoch": 17.032640949554896, + "grad_norm": 14.087238311767578, + "learning_rate": 8.309063893016344e-08, + "loss": 0.1689, + "step": 5740 + }, + { + "epoch": 17.062314540059347, + "grad_norm": 1.088675856590271, + "learning_rate": 8.306092124814264e-08, + "loss": 0.1833, + "step": 5750 + }, + { + "epoch": 17.0919881305638, + "grad_norm": 2.0590171813964844, + "learning_rate": 8.303120356612185e-08, + "loss": 0.1729, + "step": 5760 + }, + { + "epoch": 17.12166172106825, + "grad_norm": 8.530868530273438, + "learning_rate": 8.300148588410104e-08, + "loss": 0.2898, + "step": 5770 + }, + { + "epoch": 17.1513353115727, + "grad_norm": 21.595458984375, + "learning_rate": 8.297176820208024e-08, + "loss": 0.1182, + "step": 5780 + }, + { + "epoch": 17.181008902077153, + "grad_norm": 12.886796951293945, + "learning_rate": 8.294205052005943e-08, + "loss": 0.1557, + "step": 5790 + }, + { + "epoch": 17.2106824925816, + "grad_norm": 0.39996176958084106, + "learning_rate": 8.291233283803863e-08, + "loss": 0.2749, + "step": 5800 + }, + { + "epoch": 17.240356083086052, + "grad_norm": 1.1692343950271606, + "learning_rate": 8.288261515601783e-08, + "loss": 0.1558, + "step": 5810 + }, + { + "epoch": 17.270029673590503, + "grad_norm": 32.03895568847656, + "learning_rate": 8.285289747399703e-08, + "loss": 0.154, + "step": 5820 + }, + { + "epoch": 17.299703264094955, + "grad_norm": 13.004961013793945, + "learning_rate": 8.282317979197623e-08, + "loss": 0.0454, + "step": 5830 + }, + { + "epoch": 17.329376854599406, + "grad_norm": 0.03988669067621231, + "learning_rate": 8.279346210995541e-08, + "loss": 0.2417, + "step": 5840 + }, + { + "epoch": 17.359050445103858, + "grad_norm": 23.260236740112305, + "learning_rate": 8.276374442793461e-08, + "loss": 0.1306, + "step": 5850 + }, + { + "epoch": 17.38872403560831, + "grad_norm": 1.9988237619400024, + "learning_rate": 8.27340267459138e-08, + "loss": 0.1163, + "step": 5860 + }, + { + "epoch": 17.41839762611276, + "grad_norm": 20.4964542388916, + "learning_rate": 8.270430906389301e-08, + "loss": 0.1828, + "step": 5870 + }, + { + "epoch": 17.448071216617212, + "grad_norm": 1.182787537574768, + "learning_rate": 8.267459138187221e-08, + "loss": 0.0557, + "step": 5880 + }, + { + "epoch": 17.477744807121663, + "grad_norm": 0.11159148067235947, + "learning_rate": 8.26448736998514e-08, + "loss": 0.1216, + "step": 5890 + }, + { + "epoch": 17.50741839762611, + "grad_norm": 0.20361927151679993, + "learning_rate": 8.26151560178306e-08, + "loss": 0.1047, + "step": 5900 + }, + { + "epoch": 17.537091988130562, + "grad_norm": 0.09152782708406448, + "learning_rate": 8.25854383358098e-08, + "loss": 0.1183, + "step": 5910 + }, + { + "epoch": 17.566765578635014, + "grad_norm": 13.532278060913086, + "learning_rate": 8.255572065378899e-08, + "loss": 0.0969, + "step": 5920 + }, + { + "epoch": 17.596439169139465, + "grad_norm": 23.647497177124023, + "learning_rate": 8.25260029717682e-08, + "loss": 0.1656, + "step": 5930 + }, + { + "epoch": 17.626112759643917, + "grad_norm": 0.5108366012573242, + "learning_rate": 8.24962852897474e-08, + "loss": 0.1617, + "step": 5940 + }, + { + "epoch": 17.655786350148368, + "grad_norm": 0.5736545324325562, + "learning_rate": 8.246656760772659e-08, + "loss": 0.0469, + "step": 5950 + }, + { + "epoch": 17.68545994065282, + "grad_norm": 0.22681237757205963, + "learning_rate": 8.243684992570578e-08, + "loss": 0.2046, + "step": 5960 + }, + { + "epoch": 17.71513353115727, + "grad_norm": 19.72728729248047, + "learning_rate": 8.240713224368498e-08, + "loss": 0.2451, + "step": 5970 + }, + { + "epoch": 17.744807121661722, + "grad_norm": 6.467017650604248, + "learning_rate": 8.237741456166419e-08, + "loss": 0.1933, + "step": 5980 + }, + { + "epoch": 17.774480712166174, + "grad_norm": 15.932929992675781, + "learning_rate": 8.234769687964338e-08, + "loss": 0.2747, + "step": 5990 + }, + { + "epoch": 17.80415430267062, + "grad_norm": 9.95218276977539, + "learning_rate": 8.231797919762258e-08, + "loss": 0.0982, + "step": 6000 + }, + { + "epoch": 17.833827893175073, + "grad_norm": 1.1411454677581787, + "learning_rate": 8.228826151560178e-08, + "loss": 0.1542, + "step": 6010 + }, + { + "epoch": 17.863501483679524, + "grad_norm": 18.346120834350586, + "learning_rate": 8.225854383358097e-08, + "loss": 0.1105, + "step": 6020 + }, + { + "epoch": 17.893175074183976, + "grad_norm": 25.788114547729492, + "learning_rate": 8.222882615156017e-08, + "loss": 0.1044, + "step": 6030 + }, + { + "epoch": 17.922848664688427, + "grad_norm": 4.759020805358887, + "learning_rate": 8.219910846953938e-08, + "loss": 0.1122, + "step": 6040 + }, + { + "epoch": 17.95252225519288, + "grad_norm": 0.4630882740020752, + "learning_rate": 8.216939078751857e-08, + "loss": 0.0533, + "step": 6050 + }, + { + "epoch": 17.98219584569733, + "grad_norm": 0.041039787232875824, + "learning_rate": 8.213967310549777e-08, + "loss": 0.028, + "step": 6060 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.2149377465248108, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.1329, + "eval_samples_per_second": 80.662, + "eval_steps_per_second": 10.15, + "step": 6066 + }, + { + "epoch": 18.01186943620178, + "grad_norm": 1.591378927230835, + "learning_rate": 8.210995542347696e-08, + "loss": 0.1247, + "step": 6070 + }, + { + "epoch": 18.041543026706233, + "grad_norm": 0.5668648481369019, + "learning_rate": 8.208023774145616e-08, + "loss": 0.1008, + "step": 6080 + }, + { + "epoch": 18.071216617210684, + "grad_norm": 0.26698222756385803, + "learning_rate": 8.205052005943535e-08, + "loss": 0.3284, + "step": 6090 + }, + { + "epoch": 18.100890207715132, + "grad_norm": 0.7820779085159302, + "learning_rate": 8.202080237741456e-08, + "loss": 0.1506, + "step": 6100 + }, + { + "epoch": 18.130563798219583, + "grad_norm": 3.346285343170166, + "learning_rate": 8.199108469539376e-08, + "loss": 0.2317, + "step": 6110 + }, + { + "epoch": 18.160237388724035, + "grad_norm": 0.19723504781723022, + "learning_rate": 8.196136701337295e-08, + "loss": 0.1988, + "step": 6120 + }, + { + "epoch": 18.189910979228486, + "grad_norm": 12.178500175476074, + "learning_rate": 8.193164933135215e-08, + "loss": 0.1085, + "step": 6130 + }, + { + "epoch": 18.219584569732937, + "grad_norm": 16.40948486328125, + "learning_rate": 8.190193164933134e-08, + "loss": 0.1764, + "step": 6140 + }, + { + "epoch": 18.24925816023739, + "grad_norm": 0.43775442242622375, + "learning_rate": 8.187221396731054e-08, + "loss": 0.2413, + "step": 6150 + }, + { + "epoch": 18.27893175074184, + "grad_norm": 19.412864685058594, + "learning_rate": 8.184249628528975e-08, + "loss": 0.1729, + "step": 6160 + }, + { + "epoch": 18.30860534124629, + "grad_norm": 0.09883507341146469, + "learning_rate": 8.181277860326894e-08, + "loss": 0.1106, + "step": 6170 + }, + { + "epoch": 18.338278931750743, + "grad_norm": 3.6515309810638428, + "learning_rate": 8.178306092124814e-08, + "loss": 0.0739, + "step": 6180 + }, + { + "epoch": 18.367952522255194, + "grad_norm": 8.214625358581543, + "learning_rate": 8.175334323922733e-08, + "loss": 0.098, + "step": 6190 + }, + { + "epoch": 18.397626112759642, + "grad_norm": 0.3135130703449249, + "learning_rate": 8.172362555720653e-08, + "loss": 0.3759, + "step": 6200 + }, + { + "epoch": 18.427299703264094, + "grad_norm": 28.80025291442871, + "learning_rate": 8.169390787518573e-08, + "loss": 0.1654, + "step": 6210 + }, + { + "epoch": 18.456973293768545, + "grad_norm": 46.23115539550781, + "learning_rate": 8.166419019316493e-08, + "loss": 0.2553, + "step": 6220 + }, + { + "epoch": 18.486646884272997, + "grad_norm": 3.144287109375, + "learning_rate": 8.163447251114413e-08, + "loss": 0.1203, + "step": 6230 + }, + { + "epoch": 18.516320474777448, + "grad_norm": 0.07791727036237717, + "learning_rate": 8.160475482912333e-08, + "loss": 0.1152, + "step": 6240 + }, + { + "epoch": 18.5459940652819, + "grad_norm": 0.6376421451568604, + "learning_rate": 8.157503714710252e-08, + "loss": 0.0763, + "step": 6250 + }, + { + "epoch": 18.57566765578635, + "grad_norm": 21.4111385345459, + "learning_rate": 8.154531946508172e-08, + "loss": 0.1584, + "step": 6260 + }, + { + "epoch": 18.605341246290802, + "grad_norm": 14.506896018981934, + "learning_rate": 8.151560178306091e-08, + "loss": 0.2744, + "step": 6270 + }, + { + "epoch": 18.635014836795254, + "grad_norm": 0.7033679485321045, + "learning_rate": 8.148588410104012e-08, + "loss": 0.1384, + "step": 6280 + }, + { + "epoch": 18.664688427299705, + "grad_norm": 22.93064308166504, + "learning_rate": 8.145616641901932e-08, + "loss": 0.2872, + "step": 6290 + }, + { + "epoch": 18.694362017804153, + "grad_norm": 21.714704513549805, + "learning_rate": 8.142644873699851e-08, + "loss": 0.1014, + "step": 6300 + }, + { + "epoch": 18.724035608308604, + "grad_norm": 0.1953997164964676, + "learning_rate": 8.139673105497771e-08, + "loss": 0.3992, + "step": 6310 + }, + { + "epoch": 18.753709198813056, + "grad_norm": 28.006629943847656, + "learning_rate": 8.13670133729569e-08, + "loss": 0.0813, + "step": 6320 + }, + { + "epoch": 18.783382789317507, + "grad_norm": 0.18227548897266388, + "learning_rate": 8.13372956909361e-08, + "loss": 0.0082, + "step": 6330 + }, + { + "epoch": 18.81305637982196, + "grad_norm": 0.7790112495422363, + "learning_rate": 8.130757800891531e-08, + "loss": 0.1007, + "step": 6340 + }, + { + "epoch": 18.84272997032641, + "grad_norm": 0.21034081280231476, + "learning_rate": 8.12778603268945e-08, + "loss": 0.0464, + "step": 6350 + }, + { + "epoch": 18.87240356083086, + "grad_norm": 0.28012460470199585, + "learning_rate": 8.12481426448737e-08, + "loss": 0.0867, + "step": 6360 + }, + { + "epoch": 18.902077151335313, + "grad_norm": 11.184039115905762, + "learning_rate": 8.121842496285289e-08, + "loss": 0.1183, + "step": 6370 + }, + { + "epoch": 18.931750741839764, + "grad_norm": 26.39366912841797, + "learning_rate": 8.118870728083209e-08, + "loss": 0.0849, + "step": 6380 + }, + { + "epoch": 18.961424332344215, + "grad_norm": 0.2331637144088745, + "learning_rate": 8.115898959881128e-08, + "loss": 0.2205, + "step": 6390 + }, + { + "epoch": 18.991097922848663, + "grad_norm": 0.010982674546539783, + "learning_rate": 8.112927191679049e-08, + "loss": 0.0398, + "step": 6400 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.2140466421842575, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.6779, + "eval_samples_per_second": 84.099, + "eval_steps_per_second": 10.583, + "step": 6403 + }, + { + "epoch": 19.020771513353115, + "grad_norm": 1.0548681020736694, + "learning_rate": 8.109955423476969e-08, + "loss": 0.1815, + "step": 6410 + }, + { + "epoch": 19.050445103857566, + "grad_norm": 1.1201095581054688, + "learning_rate": 8.106983655274888e-08, + "loss": 0.2912, + "step": 6420 + }, + { + "epoch": 19.080118694362017, + "grad_norm": 8.719401359558105, + "learning_rate": 8.104011887072808e-08, + "loss": 0.0387, + "step": 6430 + }, + { + "epoch": 19.10979228486647, + "grad_norm": 2.7585079669952393, + "learning_rate": 8.101040118870727e-08, + "loss": 0.1093, + "step": 6440 + }, + { + "epoch": 19.13946587537092, + "grad_norm": 20.856826782226562, + "learning_rate": 8.098068350668647e-08, + "loss": 0.2167, + "step": 6450 + }, + { + "epoch": 19.16913946587537, + "grad_norm": 15.611637115478516, + "learning_rate": 8.095096582466568e-08, + "loss": 0.0987, + "step": 6460 + }, + { + "epoch": 19.198813056379823, + "grad_norm": 0.6765961647033691, + "learning_rate": 8.092124814264487e-08, + "loss": 0.1212, + "step": 6470 + }, + { + "epoch": 19.228486646884274, + "grad_norm": 7.770021915435791, + "learning_rate": 8.089153046062407e-08, + "loss": 0.2062, + "step": 6480 + }, + { + "epoch": 19.258160237388726, + "grad_norm": 22.92845344543457, + "learning_rate": 8.086181277860327e-08, + "loss": 0.0755, + "step": 6490 + }, + { + "epoch": 19.287833827893174, + "grad_norm": 8.890862464904785, + "learning_rate": 8.083209509658246e-08, + "loss": 0.2156, + "step": 6500 + }, + { + "epoch": 19.317507418397625, + "grad_norm": 0.14085882902145386, + "learning_rate": 8.080237741456166e-08, + "loss": 0.0332, + "step": 6510 + }, + { + "epoch": 19.347181008902076, + "grad_norm": 4.802812576293945, + "learning_rate": 8.077265973254087e-08, + "loss": 0.1424, + "step": 6520 + }, + { + "epoch": 19.376854599406528, + "grad_norm": 7.492328643798828, + "learning_rate": 8.074294205052006e-08, + "loss": 0.0486, + "step": 6530 + }, + { + "epoch": 19.40652818991098, + "grad_norm": 0.4096105694770813, + "learning_rate": 8.071322436849926e-08, + "loss": 0.0923, + "step": 6540 + }, + { + "epoch": 19.43620178041543, + "grad_norm": 20.3904972076416, + "learning_rate": 8.068350668647845e-08, + "loss": 0.1983, + "step": 6550 + }, + { + "epoch": 19.465875370919882, + "grad_norm": 5.010003566741943, + "learning_rate": 8.065378900445765e-08, + "loss": 0.0944, + "step": 6560 + }, + { + "epoch": 19.495548961424333, + "grad_norm": 15.477222442626953, + "learning_rate": 8.062407132243684e-08, + "loss": 0.1329, + "step": 6570 + }, + { + "epoch": 19.525222551928785, + "grad_norm": 23.198400497436523, + "learning_rate": 8.059435364041605e-08, + "loss": 0.2854, + "step": 6580 + }, + { + "epoch": 19.554896142433236, + "grad_norm": 0.20918722450733185, + "learning_rate": 8.056463595839525e-08, + "loss": 0.2264, + "step": 6590 + }, + { + "epoch": 19.584569732937684, + "grad_norm": 22.630287170410156, + "learning_rate": 8.053491827637444e-08, + "loss": 0.1907, + "step": 6600 + }, + { + "epoch": 19.614243323442135, + "grad_norm": 21.776050567626953, + "learning_rate": 8.050520059435364e-08, + "loss": 0.229, + "step": 6610 + }, + { + "epoch": 19.643916913946587, + "grad_norm": 0.3052227795124054, + "learning_rate": 8.047548291233283e-08, + "loss": 0.1818, + "step": 6620 + }, + { + "epoch": 19.673590504451038, + "grad_norm": 13.572928428649902, + "learning_rate": 8.044576523031203e-08, + "loss": 0.1444, + "step": 6630 + }, + { + "epoch": 19.70326409495549, + "grad_norm": 0.3727412223815918, + "learning_rate": 8.041604754829124e-08, + "loss": 0.1713, + "step": 6640 + }, + { + "epoch": 19.73293768545994, + "grad_norm": 11.554986953735352, + "learning_rate": 8.038632986627043e-08, + "loss": 0.2462, + "step": 6650 + }, + { + "epoch": 19.762611275964392, + "grad_norm": 0.20867522060871124, + "learning_rate": 8.035661218424963e-08, + "loss": 0.1803, + "step": 6660 + }, + { + "epoch": 19.792284866468844, + "grad_norm": 8.638569831848145, + "learning_rate": 8.032689450222882e-08, + "loss": 0.125, + "step": 6670 + }, + { + "epoch": 19.821958456973295, + "grad_norm": 0.5222876667976379, + "learning_rate": 8.029717682020802e-08, + "loss": 0.0164, + "step": 6680 + }, + { + "epoch": 19.851632047477743, + "grad_norm": 0.11547917872667313, + "learning_rate": 8.026745913818722e-08, + "loss": 0.1874, + "step": 6690 + }, + { + "epoch": 19.881305637982194, + "grad_norm": 13.232226371765137, + "learning_rate": 8.023774145616642e-08, + "loss": 0.118, + "step": 6700 + }, + { + "epoch": 19.910979228486646, + "grad_norm": 15.98351764678955, + "learning_rate": 8.020802377414562e-08, + "loss": 0.1824, + "step": 6710 + }, + { + "epoch": 19.940652818991097, + "grad_norm": 18.42740821838379, + "learning_rate": 8.017830609212482e-08, + "loss": 0.1215, + "step": 6720 + }, + { + "epoch": 19.97032640949555, + "grad_norm": 6.915778636932373, + "learning_rate": 8.014858841010401e-08, + "loss": 0.099, + "step": 6730 + }, + { + "epoch": 20.0, + "grad_norm": 1.035933494567871, + "learning_rate": 8.01188707280832e-08, + "loss": 0.1974, + "step": 6740 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.21534739434719086, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.5006, + "eval_samples_per_second": 85.519, + "eval_steps_per_second": 10.761, + "step": 6740 + }, + { + "epoch": 20.02967359050445, + "grad_norm": 0.12552013993263245, + "learning_rate": 8.00891530460624e-08, + "loss": 0.2528, + "step": 6750 + }, + { + "epoch": 20.059347181008903, + "grad_norm": 17.374801635742188, + "learning_rate": 8.005943536404161e-08, + "loss": 0.1367, + "step": 6760 + }, + { + "epoch": 20.089020771513354, + "grad_norm": 8.37562084197998, + "learning_rate": 8.00297176820208e-08, + "loss": 0.2388, + "step": 6770 + }, + { + "epoch": 20.118694362017806, + "grad_norm": 25.436445236206055, + "learning_rate": 8e-08, + "loss": 0.1392, + "step": 6780 + }, + { + "epoch": 20.148367952522253, + "grad_norm": 18.65315055847168, + "learning_rate": 7.99702823179792e-08, + "loss": 0.121, + "step": 6790 + }, + { + "epoch": 20.178041543026705, + "grad_norm": 4.339480876922607, + "learning_rate": 7.994056463595839e-08, + "loss": 0.1922, + "step": 6800 + }, + { + "epoch": 20.207715133531156, + "grad_norm": 27.88629150390625, + "learning_rate": 7.991084695393759e-08, + "loss": 0.0354, + "step": 6810 + }, + { + "epoch": 20.237388724035608, + "grad_norm": 0.21528953313827515, + "learning_rate": 7.98811292719168e-08, + "loss": 0.2246, + "step": 6820 + }, + { + "epoch": 20.26706231454006, + "grad_norm": 0.39463624358177185, + "learning_rate": 7.985141158989599e-08, + "loss": 0.1479, + "step": 6830 + }, + { + "epoch": 20.29673590504451, + "grad_norm": 12.559308052062988, + "learning_rate": 7.982169390787519e-08, + "loss": 0.0681, + "step": 6840 + }, + { + "epoch": 20.326409495548962, + "grad_norm": 4.756796360015869, + "learning_rate": 7.979197622585438e-08, + "loss": 0.1295, + "step": 6850 + }, + { + "epoch": 20.356083086053413, + "grad_norm": 17.127124786376953, + "learning_rate": 7.976225854383358e-08, + "loss": 0.107, + "step": 6860 + }, + { + "epoch": 20.385756676557865, + "grad_norm": 3.968890428543091, + "learning_rate": 7.973254086181277e-08, + "loss": 0.1821, + "step": 6870 + }, + { + "epoch": 20.415430267062316, + "grad_norm": 23.453659057617188, + "learning_rate": 7.970282317979198e-08, + "loss": 0.1163, + "step": 6880 + }, + { + "epoch": 20.445103857566764, + "grad_norm": 0.08615180104970932, + "learning_rate": 7.967310549777117e-08, + "loss": 0.1826, + "step": 6890 + }, + { + "epoch": 20.474777448071215, + "grad_norm": 16.862640380859375, + "learning_rate": 7.964338781575036e-08, + "loss": 0.1029, + "step": 6900 + }, + { + "epoch": 20.504451038575667, + "grad_norm": 11.983022689819336, + "learning_rate": 7.961367013372956e-08, + "loss": 0.1342, + "step": 6910 + }, + { + "epoch": 20.534124629080118, + "grad_norm": 0.48963475227355957, + "learning_rate": 7.958395245170875e-08, + "loss": 0.142, + "step": 6920 + }, + { + "epoch": 20.56379821958457, + "grad_norm": 0.38220664858818054, + "learning_rate": 7.955423476968796e-08, + "loss": 0.0963, + "step": 6930 + }, + { + "epoch": 20.59347181008902, + "grad_norm": 0.29016563296318054, + "learning_rate": 7.952451708766716e-08, + "loss": 0.0652, + "step": 6940 + }, + { + "epoch": 20.623145400593472, + "grad_norm": 9.339041709899902, + "learning_rate": 7.949479940564635e-08, + "loss": 0.1145, + "step": 6950 + }, + { + "epoch": 20.652818991097924, + "grad_norm": 18.771648406982422, + "learning_rate": 7.946508172362555e-08, + "loss": 0.2595, + "step": 6960 + }, + { + "epoch": 20.682492581602375, + "grad_norm": 0.10256405919790268, + "learning_rate": 7.943536404160474e-08, + "loss": 0.1578, + "step": 6970 + }, + { + "epoch": 20.712166172106826, + "grad_norm": 17.819297790527344, + "learning_rate": 7.940564635958394e-08, + "loss": 0.1396, + "step": 6980 + }, + { + "epoch": 20.741839762611274, + "grad_norm": 14.717985153198242, + "learning_rate": 7.937592867756315e-08, + "loss": 0.1812, + "step": 6990 + }, + { + "epoch": 20.771513353115726, + "grad_norm": 1.9806702136993408, + "learning_rate": 7.934621099554234e-08, + "loss": 0.1487, + "step": 7000 + }, + { + "epoch": 20.801186943620177, + "grad_norm": 28.281179428100586, + "learning_rate": 7.931649331352154e-08, + "loss": 0.1801, + "step": 7010 + }, + { + "epoch": 20.83086053412463, + "grad_norm": 0.9086587429046631, + "learning_rate": 7.928677563150073e-08, + "loss": 0.0344, + "step": 7020 + }, + { + "epoch": 20.86053412462908, + "grad_norm": 26.046390533447266, + "learning_rate": 7.925705794947993e-08, + "loss": 0.0936, + "step": 7030 + }, + { + "epoch": 20.89020771513353, + "grad_norm": 16.786685943603516, + "learning_rate": 7.922734026745912e-08, + "loss": 0.1458, + "step": 7040 + }, + { + "epoch": 20.919881305637983, + "grad_norm": 20.843210220336914, + "learning_rate": 7.919762258543833e-08, + "loss": 0.0272, + "step": 7050 + }, + { + "epoch": 20.949554896142434, + "grad_norm": 0.6326448917388916, + "learning_rate": 7.916790490341753e-08, + "loss": 0.1812, + "step": 7060 + }, + { + "epoch": 20.979228486646885, + "grad_norm": 0.10469139367341995, + "learning_rate": 7.913818722139672e-08, + "loss": 0.213, + "step": 7070 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.21544510126113892, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.3176, + "eval_samples_per_second": 87.036, + "eval_steps_per_second": 10.952, + "step": 7077 + }, + { + "epoch": 21.008902077151337, + "grad_norm": 9.77711296081543, + "learning_rate": 7.910846953937592e-08, + "loss": 0.0797, + "step": 7080 + }, + { + "epoch": 21.038575667655785, + "grad_norm": 7.832613945007324, + "learning_rate": 7.907875185735512e-08, + "loss": 0.1257, + "step": 7090 + }, + { + "epoch": 21.068249258160236, + "grad_norm": 23.90308952331543, + "learning_rate": 7.904903417533431e-08, + "loss": 0.072, + "step": 7100 + }, + { + "epoch": 21.097922848664687, + "grad_norm": 17.68047523498535, + "learning_rate": 7.901931649331352e-08, + "loss": 0.1055, + "step": 7110 + }, + { + "epoch": 21.12759643916914, + "grad_norm": 1.3229273557662964, + "learning_rate": 7.898959881129272e-08, + "loss": 0.1191, + "step": 7120 + }, + { + "epoch": 21.15727002967359, + "grad_norm": 8.096397399902344, + "learning_rate": 7.895988112927191e-08, + "loss": 0.0322, + "step": 7130 + }, + { + "epoch": 21.18694362017804, + "grad_norm": 0.09091472625732422, + "learning_rate": 7.89301634472511e-08, + "loss": 0.0968, + "step": 7140 + }, + { + "epoch": 21.216617210682493, + "grad_norm": 10.851255416870117, + "learning_rate": 7.89004457652303e-08, + "loss": 0.2561, + "step": 7150 + }, + { + "epoch": 21.246290801186944, + "grad_norm": 2.228346347808838, + "learning_rate": 7.88707280832095e-08, + "loss": 0.1518, + "step": 7160 + }, + { + "epoch": 21.275964391691396, + "grad_norm": 40.2528190612793, + "learning_rate": 7.88410104011887e-08, + "loss": 0.2303, + "step": 7170 + }, + { + "epoch": 21.305637982195847, + "grad_norm": 18.78288459777832, + "learning_rate": 7.88112927191679e-08, + "loss": 0.1587, + "step": 7180 + }, + { + "epoch": 21.335311572700295, + "grad_norm": 8.22624397277832, + "learning_rate": 7.87815750371471e-08, + "loss": 0.1364, + "step": 7190 + }, + { + "epoch": 21.364985163204746, + "grad_norm": 0.46912339329719543, + "learning_rate": 7.875185735512629e-08, + "loss": 0.0223, + "step": 7200 + }, + { + "epoch": 21.394658753709198, + "grad_norm": 0.19523240625858307, + "learning_rate": 7.872213967310549e-08, + "loss": 0.2131, + "step": 7210 + }, + { + "epoch": 21.42433234421365, + "grad_norm": 32.3909797668457, + "learning_rate": 7.869242199108468e-08, + "loss": 0.1881, + "step": 7220 + }, + { + "epoch": 21.4540059347181, + "grad_norm": 6.417776584625244, + "learning_rate": 7.866270430906389e-08, + "loss": 0.0892, + "step": 7230 + }, + { + "epoch": 21.483679525222552, + "grad_norm": 13.58862590789795, + "learning_rate": 7.863298662704309e-08, + "loss": 0.1784, + "step": 7240 + }, + { + "epoch": 21.513353115727003, + "grad_norm": 2.662782907485962, + "learning_rate": 7.860326894502228e-08, + "loss": 0.2357, + "step": 7250 + }, + { + "epoch": 21.543026706231455, + "grad_norm": 0.7837857604026794, + "learning_rate": 7.857355126300148e-08, + "loss": 0.0688, + "step": 7260 + }, + { + "epoch": 21.572700296735906, + "grad_norm": 0.5658806562423706, + "learning_rate": 7.854383358098067e-08, + "loss": 0.0718, + "step": 7270 + }, + { + "epoch": 21.602373887240358, + "grad_norm": 22.6914119720459, + "learning_rate": 7.851411589895987e-08, + "loss": 0.171, + "step": 7280 + }, + { + "epoch": 21.632047477744806, + "grad_norm": 1.4794481992721558, + "learning_rate": 7.848439821693908e-08, + "loss": 0.2164, + "step": 7290 + }, + { + "epoch": 21.661721068249257, + "grad_norm": 0.27148309350013733, + "learning_rate": 7.845468053491827e-08, + "loss": 0.1706, + "step": 7300 + }, + { + "epoch": 21.69139465875371, + "grad_norm": 22.60017967224121, + "learning_rate": 7.842496285289747e-08, + "loss": 0.1365, + "step": 7310 + }, + { + "epoch": 21.72106824925816, + "grad_norm": 7.992103576660156, + "learning_rate": 7.839524517087666e-08, + "loss": 0.1382, + "step": 7320 + }, + { + "epoch": 21.75074183976261, + "grad_norm": 14.140698432922363, + "learning_rate": 7.836552748885586e-08, + "loss": 0.0955, + "step": 7330 + }, + { + "epoch": 21.780415430267063, + "grad_norm": 0.1804761439561844, + "learning_rate": 7.833580980683506e-08, + "loss": 0.021, + "step": 7340 + }, + { + "epoch": 21.810089020771514, + "grad_norm": 12.367620468139648, + "learning_rate": 7.830609212481426e-08, + "loss": 0.3405, + "step": 7350 + }, + { + "epoch": 21.839762611275965, + "grad_norm": 0.16004210710525513, + "learning_rate": 7.827637444279346e-08, + "loss": 0.1944, + "step": 7360 + }, + { + "epoch": 21.869436201780417, + "grad_norm": 14.879359245300293, + "learning_rate": 7.824665676077266e-08, + "loss": 0.2076, + "step": 7370 + }, + { + "epoch": 21.899109792284868, + "grad_norm": 11.99825382232666, + "learning_rate": 7.821693907875185e-08, + "loss": 0.2485, + "step": 7380 + }, + { + "epoch": 21.928783382789316, + "grad_norm": 0.034673284739255905, + "learning_rate": 7.818722139673105e-08, + "loss": 0.0315, + "step": 7390 + }, + { + "epoch": 21.958456973293767, + "grad_norm": 1.7360169887542725, + "learning_rate": 7.815750371471026e-08, + "loss": 0.1894, + "step": 7400 + }, + { + "epoch": 21.98813056379822, + "grad_norm": 17.615190505981445, + "learning_rate": 7.812778603268945e-08, + "loss": 0.4666, + "step": 7410 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.2166227102279663, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.3898, + "eval_samples_per_second": 86.431, + "eval_steps_per_second": 10.876, + "step": 7414 + }, + { + "epoch": 22.01780415430267, + "grad_norm": 3.369729995727539, + "learning_rate": 7.809806835066865e-08, + "loss": 0.1822, + "step": 7420 + }, + { + "epoch": 22.04747774480712, + "grad_norm": 0.11173631995916367, + "learning_rate": 7.806835066864784e-08, + "loss": 0.0368, + "step": 7430 + }, + { + "epoch": 22.077151335311573, + "grad_norm": 5.791134834289551, + "learning_rate": 7.803863298662704e-08, + "loss": 0.0734, + "step": 7440 + }, + { + "epoch": 22.106824925816024, + "grad_norm": 0.7364012598991394, + "learning_rate": 7.800891530460623e-08, + "loss": 0.1594, + "step": 7450 + }, + { + "epoch": 22.136498516320476, + "grad_norm": 0.15562565624713898, + "learning_rate": 7.797919762258544e-08, + "loss": 0.1762, + "step": 7460 + }, + { + "epoch": 22.166172106824927, + "grad_norm": 1.6088746786117554, + "learning_rate": 7.794947994056464e-08, + "loss": 0.1668, + "step": 7470 + }, + { + "epoch": 22.19584569732938, + "grad_norm": 0.2038792222738266, + "learning_rate": 7.791976225854383e-08, + "loss": 0.049, + "step": 7480 + }, + { + "epoch": 22.225519287833826, + "grad_norm": 1.7628040313720703, + "learning_rate": 7.789004457652303e-08, + "loss": 0.0399, + "step": 7490 + }, + { + "epoch": 22.255192878338278, + "grad_norm": 0.5219783782958984, + "learning_rate": 7.786032689450222e-08, + "loss": 0.2474, + "step": 7500 + }, + { + "epoch": 22.28486646884273, + "grad_norm": 16.272167205810547, + "learning_rate": 7.783060921248142e-08, + "loss": 0.1907, + "step": 7510 + }, + { + "epoch": 22.31454005934718, + "grad_norm": 7.069812774658203, + "learning_rate": 7.780089153046063e-08, + "loss": 0.1918, + "step": 7520 + }, + { + "epoch": 22.344213649851632, + "grad_norm": 0.12088112533092499, + "learning_rate": 7.777117384843982e-08, + "loss": 0.0441, + "step": 7530 + }, + { + "epoch": 22.373887240356083, + "grad_norm": 1.548032522201538, + "learning_rate": 7.774145616641902e-08, + "loss": 0.1088, + "step": 7540 + }, + { + "epoch": 22.403560830860535, + "grad_norm": 0.31446629762649536, + "learning_rate": 7.771173848439821e-08, + "loss": 0.0712, + "step": 7550 + }, + { + "epoch": 22.433234421364986, + "grad_norm": 30.818662643432617, + "learning_rate": 7.768202080237741e-08, + "loss": 0.136, + "step": 7560 + }, + { + "epoch": 22.462908011869438, + "grad_norm": 0.17103247344493866, + "learning_rate": 7.76523031203566e-08, + "loss": 0.0491, + "step": 7570 + }, + { + "epoch": 22.49258160237389, + "grad_norm": 0.23832190036773682, + "learning_rate": 7.762258543833581e-08, + "loss": 0.1185, + "step": 7580 + }, + { + "epoch": 22.522255192878337, + "grad_norm": 2.1837639808654785, + "learning_rate": 7.759286775631501e-08, + "loss": 0.1478, + "step": 7590 + }, + { + "epoch": 22.551928783382788, + "grad_norm": 0.637860894203186, + "learning_rate": 7.75631500742942e-08, + "loss": 0.1804, + "step": 7600 + }, + { + "epoch": 22.58160237388724, + "grad_norm": 0.15079322457313538, + "learning_rate": 7.75334323922734e-08, + "loss": 0.1034, + "step": 7610 + }, + { + "epoch": 22.61127596439169, + "grad_norm": 0.013541119173169136, + "learning_rate": 7.75037147102526e-08, + "loss": 0.1192, + "step": 7620 + }, + { + "epoch": 22.640949554896142, + "grad_norm": 0.08770602941513062, + "learning_rate": 7.747399702823179e-08, + "loss": 0.0655, + "step": 7630 + }, + { + "epoch": 22.670623145400594, + "grad_norm": 15.624839782714844, + "learning_rate": 7.7444279346211e-08, + "loss": 0.2308, + "step": 7640 + }, + { + "epoch": 22.700296735905045, + "grad_norm": 3.1651968955993652, + "learning_rate": 7.74145616641902e-08, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 22.729970326409497, + "grad_norm": 0.19387738406658173, + "learning_rate": 7.738484398216939e-08, + "loss": 0.0676, + "step": 7660 + }, + { + "epoch": 22.759643916913948, + "grad_norm": 1.3919932842254639, + "learning_rate": 7.735512630014859e-08, + "loss": 0.3443, + "step": 7670 + }, + { + "epoch": 22.7893175074184, + "grad_norm": 0.03307194262742996, + "learning_rate": 7.732540861812778e-08, + "loss": 0.0849, + "step": 7680 + }, + { + "epoch": 22.818991097922847, + "grad_norm": 20.423669815063477, + "learning_rate": 7.729569093610698e-08, + "loss": 0.1711, + "step": 7690 + }, + { + "epoch": 22.8486646884273, + "grad_norm": 0.7301819324493408, + "learning_rate": 7.726597325408619e-08, + "loss": 0.298, + "step": 7700 + }, + { + "epoch": 22.87833827893175, + "grad_norm": 3.43853759765625, + "learning_rate": 7.723625557206538e-08, + "loss": 0.0616, + "step": 7710 + }, + { + "epoch": 22.9080118694362, + "grad_norm": 0.23189309239387512, + "learning_rate": 7.720653789004458e-08, + "loss": 0.1445, + "step": 7720 + }, + { + "epoch": 22.937685459940653, + "grad_norm": 8.23648452758789, + "learning_rate": 7.717682020802377e-08, + "loss": 0.1949, + "step": 7730 + }, + { + "epoch": 22.967359050445104, + "grad_norm": 15.345218658447266, + "learning_rate": 7.714710252600297e-08, + "loss": 0.2239, + "step": 7740 + }, + { + "epoch": 22.997032640949556, + "grad_norm": 0.04854168742895126, + "learning_rate": 7.711738484398216e-08, + "loss": 0.054, + "step": 7750 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.9320712694877505, + "eval_loss": 0.2184199094772339, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.3956, + "eval_samples_per_second": 86.383, + "eval_steps_per_second": 10.87, + "step": 7751 + }, + { + "epoch": 23.026706231454007, + "grad_norm": 31.06021499633789, + "learning_rate": 7.708766716196137e-08, + "loss": 0.0817, + "step": 7760 + }, + { + "epoch": 23.05637982195846, + "grad_norm": 16.49560546875, + "learning_rate": 7.705794947994057e-08, + "loss": 0.1127, + "step": 7770 + }, + { + "epoch": 23.08605341246291, + "grad_norm": 9.370063781738281, + "learning_rate": 7.702823179791976e-08, + "loss": 0.2238, + "step": 7780 + }, + { + "epoch": 23.115727002967358, + "grad_norm": 6.6405348777771, + "learning_rate": 7.699851411589896e-08, + "loss": 0.0553, + "step": 7790 + }, + { + "epoch": 23.14540059347181, + "grad_norm": 0.6340488195419312, + "learning_rate": 7.696879643387816e-08, + "loss": 0.0938, + "step": 7800 + }, + { + "epoch": 23.17507418397626, + "grad_norm": 0.053858377039432526, + "learning_rate": 7.693907875185735e-08, + "loss": 0.0206, + "step": 7810 + }, + { + "epoch": 23.204747774480712, + "grad_norm": 15.105128288269043, + "learning_rate": 7.690936106983656e-08, + "loss": 0.2336, + "step": 7820 + }, + { + "epoch": 23.234421364985163, + "grad_norm": 13.181090354919434, + "learning_rate": 7.687964338781575e-08, + "loss": 0.1612, + "step": 7830 + }, + { + "epoch": 23.264094955489615, + "grad_norm": 0.4163687527179718, + "learning_rate": 7.684992570579495e-08, + "loss": 0.1362, + "step": 7840 + }, + { + "epoch": 23.293768545994066, + "grad_norm": 7.15186071395874, + "learning_rate": 7.682020802377415e-08, + "loss": 0.328, + "step": 7850 + }, + { + "epoch": 23.323442136498517, + "grad_norm": 7.377724647521973, + "learning_rate": 7.679049034175334e-08, + "loss": 0.1855, + "step": 7860 + }, + { + "epoch": 23.35311572700297, + "grad_norm": 17.20374298095703, + "learning_rate": 7.676077265973254e-08, + "loss": 0.127, + "step": 7870 + }, + { + "epoch": 23.382789317507417, + "grad_norm": 22.368724822998047, + "learning_rate": 7.673105497771175e-08, + "loss": 0.2862, + "step": 7880 + }, + { + "epoch": 23.412462908011868, + "grad_norm": 25.022489547729492, + "learning_rate": 7.670133729569094e-08, + "loss": 0.1322, + "step": 7890 + }, + { + "epoch": 23.44213649851632, + "grad_norm": 0.058406565338373184, + "learning_rate": 7.667161961367014e-08, + "loss": 0.0545, + "step": 7900 + }, + { + "epoch": 23.47181008902077, + "grad_norm": 21.061307907104492, + "learning_rate": 7.664190193164933e-08, + "loss": 0.3947, + "step": 7910 + }, + { + "epoch": 23.501483679525222, + "grad_norm": 0.32571738958358765, + "learning_rate": 7.661218424962853e-08, + "loss": 0.0499, + "step": 7920 + }, + { + "epoch": 23.531157270029674, + "grad_norm": 8.551515579223633, + "learning_rate": 7.658246656760772e-08, + "loss": 0.1077, + "step": 7930 + }, + { + "epoch": 23.560830860534125, + "grad_norm": 0.1982126384973526, + "learning_rate": 7.655274888558692e-08, + "loss": 0.1631, + "step": 7940 + }, + { + "epoch": 23.590504451038576, + "grad_norm": 0.5774928331375122, + "learning_rate": 7.652303120356611e-08, + "loss": 0.0995, + "step": 7950 + }, + { + "epoch": 23.620178041543028, + "grad_norm": 19.000473022460938, + "learning_rate": 7.649331352154531e-08, + "loss": 0.1014, + "step": 7960 + }, + { + "epoch": 23.64985163204748, + "grad_norm": 0.4441705644130707, + "learning_rate": 7.64635958395245e-08, + "loss": 0.2247, + "step": 7970 + }, + { + "epoch": 23.67952522255193, + "grad_norm": 21.43285369873047, + "learning_rate": 7.64338781575037e-08, + "loss": 0.2134, + "step": 7980 + }, + { + "epoch": 23.70919881305638, + "grad_norm": 0.6455886363983154, + "learning_rate": 7.640416047548291e-08, + "loss": 0.1867, + "step": 7990 + }, + { + "epoch": 23.73887240356083, + "grad_norm": 15.808426856994629, + "learning_rate": 7.63744427934621e-08, + "loss": 0.168, + "step": 8000 + }, + { + "epoch": 23.76854599406528, + "grad_norm": 0.0720570981502533, + "learning_rate": 7.63447251114413e-08, + "loss": 0.0912, + "step": 8010 + }, + { + "epoch": 23.798219584569733, + "grad_norm": 21.297039031982422, + "learning_rate": 7.63150074294205e-08, + "loss": 0.1421, + "step": 8020 + }, + { + "epoch": 23.827893175074184, + "grad_norm": 0.7415997385978699, + "learning_rate": 7.628528974739969e-08, + "loss": 0.0303, + "step": 8030 + }, + { + "epoch": 23.857566765578635, + "grad_norm": 6.589923858642578, + "learning_rate": 7.625557206537889e-08, + "loss": 0.1587, + "step": 8040 + }, + { + "epoch": 23.887240356083087, + "grad_norm": 0.14318309724330902, + "learning_rate": 7.62258543833581e-08, + "loss": 0.1156, + "step": 8050 + }, + { + "epoch": 23.916913946587538, + "grad_norm": 7.181532382965088, + "learning_rate": 7.619613670133729e-08, + "loss": 0.0363, + "step": 8060 + }, + { + "epoch": 23.94658753709199, + "grad_norm": 0.3349094092845917, + "learning_rate": 7.616641901931649e-08, + "loss": 0.1824, + "step": 8070 + }, + { + "epoch": 23.976261127596437, + "grad_norm": 0.8572707772254944, + "learning_rate": 7.613670133729568e-08, + "loss": 0.0712, + "step": 8080 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.2181248962879181, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.6789, + "eval_samples_per_second": 84.091, + "eval_steps_per_second": 10.582, + "step": 8088 + }, + { + "epoch": 24.00593471810089, + "grad_norm": 3.7651467323303223, + "learning_rate": 7.610698365527488e-08, + "loss": 0.1264, + "step": 8090 + }, + { + "epoch": 24.03560830860534, + "grad_norm": 1.0303977727890015, + "learning_rate": 7.607726597325407e-08, + "loss": 0.1779, + "step": 8100 + }, + { + "epoch": 24.06528189910979, + "grad_norm": 3.843385934829712, + "learning_rate": 7.604754829123328e-08, + "loss": 0.2984, + "step": 8110 + }, + { + "epoch": 24.094955489614243, + "grad_norm": 0.4151290953159332, + "learning_rate": 7.601783060921248e-08, + "loss": 0.1933, + "step": 8120 + }, + { + "epoch": 24.124629080118694, + "grad_norm": 25.558252334594727, + "learning_rate": 7.598811292719167e-08, + "loss": 0.1066, + "step": 8130 + }, + { + "epoch": 24.154302670623146, + "grad_norm": 10.488713264465332, + "learning_rate": 7.595839524517087e-08, + "loss": 0.0298, + "step": 8140 + }, + { + "epoch": 24.183976261127597, + "grad_norm": 0.037295371294021606, + "learning_rate": 7.592867756315006e-08, + "loss": 0.0884, + "step": 8150 + }, + { + "epoch": 24.21364985163205, + "grad_norm": 9.546892166137695, + "learning_rate": 7.589895988112926e-08, + "loss": 0.0358, + "step": 8160 + }, + { + "epoch": 24.2433234421365, + "grad_norm": 5.137731552124023, + "learning_rate": 7.586924219910847e-08, + "loss": 0.091, + "step": 8170 + }, + { + "epoch": 24.272997032640948, + "grad_norm": 7.309725284576416, + "learning_rate": 7.583952451708766e-08, + "loss": 0.1195, + "step": 8180 + }, + { + "epoch": 24.3026706231454, + "grad_norm": 23.79894256591797, + "learning_rate": 7.580980683506686e-08, + "loss": 0.1142, + "step": 8190 + }, + { + "epoch": 24.33234421364985, + "grad_norm": 3.179438591003418, + "learning_rate": 7.578008915304605e-08, + "loss": 0.0253, + "step": 8200 + }, + { + "epoch": 24.362017804154302, + "grad_norm": 30.06998062133789, + "learning_rate": 7.575037147102525e-08, + "loss": 0.1117, + "step": 8210 + }, + { + "epoch": 24.391691394658753, + "grad_norm": 0.7242507338523865, + "learning_rate": 7.572065378900445e-08, + "loss": 0.1078, + "step": 8220 + }, + { + "epoch": 24.421364985163205, + "grad_norm": 36.69660949707031, + "learning_rate": 7.569093610698365e-08, + "loss": 0.1731, + "step": 8230 + }, + { + "epoch": 24.451038575667656, + "grad_norm": 0.04959619417786598, + "learning_rate": 7.566121842496285e-08, + "loss": 0.0625, + "step": 8240 + }, + { + "epoch": 24.480712166172108, + "grad_norm": 18.893564224243164, + "learning_rate": 7.563150074294205e-08, + "loss": 0.1158, + "step": 8250 + }, + { + "epoch": 24.51038575667656, + "grad_norm": 0.09951876103878021, + "learning_rate": 7.560178306092124e-08, + "loss": 0.1546, + "step": 8260 + }, + { + "epoch": 24.54005934718101, + "grad_norm": 0.7610961198806763, + "learning_rate": 7.557206537890044e-08, + "loss": 0.0607, + "step": 8270 + }, + { + "epoch": 24.56973293768546, + "grad_norm": 11.04271125793457, + "learning_rate": 7.554234769687963e-08, + "loss": 0.3257, + "step": 8280 + }, + { + "epoch": 24.59940652818991, + "grad_norm": 4.930974006652832, + "learning_rate": 7.551263001485884e-08, + "loss": 0.129, + "step": 8290 + }, + { + "epoch": 24.62908011869436, + "grad_norm": 4.987751007080078, + "learning_rate": 7.548291233283804e-08, + "loss": 0.4655, + "step": 8300 + }, + { + "epoch": 24.658753709198812, + "grad_norm": 0.2271241545677185, + "learning_rate": 7.545319465081723e-08, + "loss": 0.0334, + "step": 8310 + }, + { + "epoch": 24.688427299703264, + "grad_norm": 0.7351827621459961, + "learning_rate": 7.542347696879643e-08, + "loss": 0.0458, + "step": 8320 + }, + { + "epoch": 24.718100890207715, + "grad_norm": 25.842424392700195, + "learning_rate": 7.539375928677562e-08, + "loss": 0.1056, + "step": 8330 + }, + { + "epoch": 24.747774480712167, + "grad_norm": 0.10422148555517197, + "learning_rate": 7.536404160475482e-08, + "loss": 0.1368, + "step": 8340 + }, + { + "epoch": 24.777448071216618, + "grad_norm": 7.36377477645874, + "learning_rate": 7.533432392273403e-08, + "loss": 0.1635, + "step": 8350 + }, + { + "epoch": 24.80712166172107, + "grad_norm": 0.12825711071491241, + "learning_rate": 7.530460624071322e-08, + "loss": 0.0774, + "step": 8360 + }, + { + "epoch": 24.83679525222552, + "grad_norm": 18.120410919189453, + "learning_rate": 7.527488855869242e-08, + "loss": 0.2354, + "step": 8370 + }, + { + "epoch": 24.86646884272997, + "grad_norm": 19.350793838500977, + "learning_rate": 7.524517087667161e-08, + "loss": 0.1627, + "step": 8380 + }, + { + "epoch": 24.89614243323442, + "grad_norm": 17.472026824951172, + "learning_rate": 7.521545319465081e-08, + "loss": 0.3656, + "step": 8390 + }, + { + "epoch": 24.92581602373887, + "grad_norm": 0.7609086036682129, + "learning_rate": 7.518573551263e-08, + "loss": 0.0231, + "step": 8400 + }, + { + "epoch": 24.955489614243323, + "grad_norm": 1.1282098293304443, + "learning_rate": 7.515601783060921e-08, + "loss": 0.1523, + "step": 8410 + }, + { + "epoch": 24.985163204747774, + "grad_norm": 13.908411026000977, + "learning_rate": 7.512630014858841e-08, + "loss": 0.2754, + "step": 8420 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.933184855233853, + "eval_loss": 0.2190481722354889, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.835, + "eval_samples_per_second": 82.88, + "eval_steps_per_second": 10.429, + "step": 8425 + }, + { + "epoch": 25.014836795252226, + "grad_norm": 5.711366653442383, + "learning_rate": 7.50965824665676e-08, + "loss": 0.1734, + "step": 8430 + }, + { + "epoch": 25.044510385756677, + "grad_norm": 0.3822547495365143, + "learning_rate": 7.50668647845468e-08, + "loss": 0.1361, + "step": 8440 + }, + { + "epoch": 25.07418397626113, + "grad_norm": 0.2058834582567215, + "learning_rate": 7.5037147102526e-08, + "loss": 0.2004, + "step": 8450 + }, + { + "epoch": 25.10385756676558, + "grad_norm": 2.1028804779052734, + "learning_rate": 7.500742942050519e-08, + "loss": 0.1498, + "step": 8460 + }, + { + "epoch": 25.13353115727003, + "grad_norm": 16.730194091796875, + "learning_rate": 7.49777117384844e-08, + "loss": 0.0908, + "step": 8470 + }, + { + "epoch": 25.16320474777448, + "grad_norm": 0.409800261259079, + "learning_rate": 7.49479940564636e-08, + "loss": 0.2379, + "step": 8480 + }, + { + "epoch": 25.19287833827893, + "grad_norm": 1.7818834781646729, + "learning_rate": 7.491827637444279e-08, + "loss": 0.0823, + "step": 8490 + }, + { + "epoch": 25.222551928783382, + "grad_norm": 0.2828046381473541, + "learning_rate": 7.488855869242199e-08, + "loss": 0.158, + "step": 8500 + }, + { + "epoch": 25.252225519287833, + "grad_norm": 21.76198387145996, + "learning_rate": 7.485884101040118e-08, + "loss": 0.107, + "step": 8510 + }, + { + "epoch": 25.281899109792285, + "grad_norm": 0.30373415350914, + "learning_rate": 7.482912332838038e-08, + "loss": 0.0288, + "step": 8520 + }, + { + "epoch": 25.311572700296736, + "grad_norm": 20.19512367248535, + "learning_rate": 7.479940564635959e-08, + "loss": 0.1132, + "step": 8530 + }, + { + "epoch": 25.341246290801188, + "grad_norm": 0.44731882214546204, + "learning_rate": 7.476968796433878e-08, + "loss": 0.0836, + "step": 8540 + }, + { + "epoch": 25.37091988130564, + "grad_norm": 0.21948304772377014, + "learning_rate": 7.473997028231798e-08, + "loss": 0.1791, + "step": 8550 + }, + { + "epoch": 25.40059347181009, + "grad_norm": 20.41098976135254, + "learning_rate": 7.471025260029717e-08, + "loss": 0.243, + "step": 8560 + }, + { + "epoch": 25.43026706231454, + "grad_norm": 1.281446099281311, + "learning_rate": 7.468053491827637e-08, + "loss": 0.053, + "step": 8570 + }, + { + "epoch": 25.45994065281899, + "grad_norm": 3.091603994369507, + "learning_rate": 7.465081723625556e-08, + "loss": 0.1909, + "step": 8580 + }, + { + "epoch": 25.48961424332344, + "grad_norm": 0.3982032835483551, + "learning_rate": 7.462109955423477e-08, + "loss": 0.1277, + "step": 8590 + }, + { + "epoch": 25.519287833827892, + "grad_norm": 6.3654961585998535, + "learning_rate": 7.459138187221397e-08, + "loss": 0.1418, + "step": 8600 + }, + { + "epoch": 25.548961424332344, + "grad_norm": 0.42458590865135193, + "learning_rate": 7.456166419019316e-08, + "loss": 0.217, + "step": 8610 + }, + { + "epoch": 25.578635014836795, + "grad_norm": 18.829214096069336, + "learning_rate": 7.453194650817236e-08, + "loss": 0.2655, + "step": 8620 + }, + { + "epoch": 25.608308605341247, + "grad_norm": 20.27936363220215, + "learning_rate": 7.450222882615155e-08, + "loss": 0.1534, + "step": 8630 + }, + { + "epoch": 25.637982195845698, + "grad_norm": 0.1091645136475563, + "learning_rate": 7.447251114413075e-08, + "loss": 0.0836, + "step": 8640 + }, + { + "epoch": 25.66765578635015, + "grad_norm": 0.5916191339492798, + "learning_rate": 7.444279346210996e-08, + "loss": 0.1793, + "step": 8650 + }, + { + "epoch": 25.6973293768546, + "grad_norm": 0.5302268266677856, + "learning_rate": 7.441307578008915e-08, + "loss": 0.0868, + "step": 8660 + }, + { + "epoch": 25.727002967359052, + "grad_norm": 0.41038036346435547, + "learning_rate": 7.438335809806835e-08, + "loss": 0.1914, + "step": 8670 + }, + { + "epoch": 25.7566765578635, + "grad_norm": 0.06608781963586807, + "learning_rate": 7.435364041604755e-08, + "loss": 0.1953, + "step": 8680 + }, + { + "epoch": 25.78635014836795, + "grad_norm": 3.98883056640625, + "learning_rate": 7.432392273402674e-08, + "loss": 0.1034, + "step": 8690 + }, + { + "epoch": 25.816023738872403, + "grad_norm": 0.3108919560909271, + "learning_rate": 7.429420505200594e-08, + "loss": 0.0575, + "step": 8700 + }, + { + "epoch": 25.845697329376854, + "grad_norm": 1.9638001918792725, + "learning_rate": 7.426448736998514e-08, + "loss": 0.0234, + "step": 8710 + }, + { + "epoch": 25.875370919881306, + "grad_norm": 0.07665849477052689, + "learning_rate": 7.423476968796434e-08, + "loss": 0.1103, + "step": 8720 + }, + { + "epoch": 25.905044510385757, + "grad_norm": 4.438623428344727, + "learning_rate": 7.420505200594354e-08, + "loss": 0.1555, + "step": 8730 + }, + { + "epoch": 25.93471810089021, + "grad_norm": 6.291286468505859, + "learning_rate": 7.417533432392273e-08, + "loss": 0.1619, + "step": 8740 + }, + { + "epoch": 25.96439169139466, + "grad_norm": 18.084665298461914, + "learning_rate": 7.414561664190193e-08, + "loss": 0.1522, + "step": 8750 + }, + { + "epoch": 25.99406528189911, + "grad_norm": 0.6373274326324463, + "learning_rate": 7.411589895988112e-08, + "loss": 0.1645, + "step": 8760 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.21708592772483826, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3839, + "eval_samples_per_second": 140.666, + "eval_steps_per_second": 17.701, + "step": 8762 + }, + { + "epoch": 26.023738872403563, + "grad_norm": 1.0443450212478638, + "learning_rate": 7.408618127786033e-08, + "loss": 0.1359, + "step": 8770 + }, + { + "epoch": 26.05341246290801, + "grad_norm": 0.18819975852966309, + "learning_rate": 7.405646359583953e-08, + "loss": 0.1136, + "step": 8780 + }, + { + "epoch": 26.083086053412462, + "grad_norm": 3.1687231063842773, + "learning_rate": 7.402674591381872e-08, + "loss": 0.0756, + "step": 8790 + }, + { + "epoch": 26.112759643916913, + "grad_norm": 12.076043128967285, + "learning_rate": 7.399702823179792e-08, + "loss": 0.1203, + "step": 8800 + }, + { + "epoch": 26.142433234421365, + "grad_norm": 24.848833084106445, + "learning_rate": 7.396731054977711e-08, + "loss": 0.2468, + "step": 8810 + }, + { + "epoch": 26.172106824925816, + "grad_norm": 0.08002249151468277, + "learning_rate": 7.393759286775632e-08, + "loss": 0.0972, + "step": 8820 + }, + { + "epoch": 26.201780415430267, + "grad_norm": 0.09806425869464874, + "learning_rate": 7.390787518573552e-08, + "loss": 0.0473, + "step": 8830 + }, + { + "epoch": 26.23145400593472, + "grad_norm": 12.987876892089844, + "learning_rate": 7.387815750371471e-08, + "loss": 0.2019, + "step": 8840 + }, + { + "epoch": 26.26112759643917, + "grad_norm": 0.6641417741775513, + "learning_rate": 7.384843982169391e-08, + "loss": 0.1813, + "step": 8850 + }, + { + "epoch": 26.29080118694362, + "grad_norm": 0.041195347905159, + "learning_rate": 7.38187221396731e-08, + "loss": 0.0282, + "step": 8860 + }, + { + "epoch": 26.320474777448073, + "grad_norm": 0.06364777684211731, + "learning_rate": 7.37890044576523e-08, + "loss": 0.0534, + "step": 8870 + }, + { + "epoch": 26.35014836795252, + "grad_norm": 14.903504371643066, + "learning_rate": 7.375928677563151e-08, + "loss": 0.1387, + "step": 8880 + }, + { + "epoch": 26.379821958456972, + "grad_norm": 0.36547672748565674, + "learning_rate": 7.37295690936107e-08, + "loss": 0.0495, + "step": 8890 + }, + { + "epoch": 26.409495548961424, + "grad_norm": 0.10220501571893692, + "learning_rate": 7.36998514115899e-08, + "loss": 0.1856, + "step": 8900 + }, + { + "epoch": 26.439169139465875, + "grad_norm": 0.8060777187347412, + "learning_rate": 7.36701337295691e-08, + "loss": 0.0692, + "step": 8910 + }, + { + "epoch": 26.468842729970326, + "grad_norm": 0.38477852940559387, + "learning_rate": 7.364041604754829e-08, + "loss": 0.0782, + "step": 8920 + }, + { + "epoch": 26.498516320474778, + "grad_norm": 5.296771049499512, + "learning_rate": 7.361069836552749e-08, + "loss": 0.134, + "step": 8930 + }, + { + "epoch": 26.52818991097923, + "grad_norm": 1.2627395391464233, + "learning_rate": 7.35809806835067e-08, + "loss": 0.1488, + "step": 8940 + }, + { + "epoch": 26.55786350148368, + "grad_norm": 12.677186012268066, + "learning_rate": 7.355126300148589e-08, + "loss": 0.342, + "step": 8950 + }, + { + "epoch": 26.587537091988132, + "grad_norm": 11.11467170715332, + "learning_rate": 7.352154531946509e-08, + "loss": 0.1215, + "step": 8960 + }, + { + "epoch": 26.617210682492583, + "grad_norm": 1.3812549114227295, + "learning_rate": 7.349182763744428e-08, + "loss": 0.1538, + "step": 8970 + }, + { + "epoch": 26.64688427299703, + "grad_norm": 1.626254677772522, + "learning_rate": 7.346210995542348e-08, + "loss": 0.0461, + "step": 8980 + }, + { + "epoch": 26.676557863501483, + "grad_norm": 0.14709161221981049, + "learning_rate": 7.343239227340267e-08, + "loss": 0.2656, + "step": 8990 + }, + { + "epoch": 26.706231454005934, + "grad_norm": 2.08564829826355, + "learning_rate": 7.340267459138187e-08, + "loss": 0.0091, + "step": 9000 + }, + { + "epoch": 26.735905044510385, + "grad_norm": 19.191457748413086, + "learning_rate": 7.337295690936106e-08, + "loss": 0.0663, + "step": 9010 + }, + { + "epoch": 26.765578635014837, + "grad_norm": 28.690948486328125, + "learning_rate": 7.334323922734026e-08, + "loss": 0.1794, + "step": 9020 + }, + { + "epoch": 26.795252225519288, + "grad_norm": 0.3211633563041687, + "learning_rate": 7.331352154531945e-08, + "loss": 0.1237, + "step": 9030 + }, + { + "epoch": 26.82492581602374, + "grad_norm": 1.2389260530471802, + "learning_rate": 7.328380386329865e-08, + "loss": 0.152, + "step": 9040 + }, + { + "epoch": 26.85459940652819, + "grad_norm": 0.13224166631698608, + "learning_rate": 7.325408618127786e-08, + "loss": 0.0762, + "step": 9050 + }, + { + "epoch": 26.884272997032642, + "grad_norm": 11.876014709472656, + "learning_rate": 7.322436849925705e-08, + "loss": 0.1475, + "step": 9060 + }, + { + "epoch": 26.91394658753709, + "grad_norm": 19.438047409057617, + "learning_rate": 7.319465081723625e-08, + "loss": 0.1947, + "step": 9070 + }, + { + "epoch": 26.94362017804154, + "grad_norm": 1.162598729133606, + "learning_rate": 7.316493313521544e-08, + "loss": 0.2134, + "step": 9080 + }, + { + "epoch": 26.973293768545993, + "grad_norm": 20.363035202026367, + "learning_rate": 7.313521545319464e-08, + "loss": 0.1364, + "step": 9090 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.21612796187400818, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3595, + "eval_samples_per_second": 141.207, + "eval_steps_per_second": 17.769, + "step": 9099 + }, + { + "epoch": 27.002967359050444, + "grad_norm": 0.1376214474439621, + "learning_rate": 7.310549777117384e-08, + "loss": 0.285, + "step": 9100 + }, + { + "epoch": 27.032640949554896, + "grad_norm": 2.602182149887085, + "learning_rate": 7.307578008915304e-08, + "loss": 0.1447, + "step": 9110 + }, + { + "epoch": 27.062314540059347, + "grad_norm": 19.86675262451172, + "learning_rate": 7.304606240713224e-08, + "loss": 0.2715, + "step": 9120 + }, + { + "epoch": 27.0919881305638, + "grad_norm": 0.16791827976703644, + "learning_rate": 7.301634472511144e-08, + "loss": 0.1011, + "step": 9130 + }, + { + "epoch": 27.12166172106825, + "grad_norm": 0.2328627109527588, + "learning_rate": 7.298662704309063e-08, + "loss": 0.077, + "step": 9140 + }, + { + "epoch": 27.1513353115727, + "grad_norm": 4.887328624725342, + "learning_rate": 7.295690936106983e-08, + "loss": 0.1583, + "step": 9150 + }, + { + "epoch": 27.181008902077153, + "grad_norm": 0.04897181689739227, + "learning_rate": 7.292719167904902e-08, + "loss": 0.1969, + "step": 9160 + }, + { + "epoch": 27.2106824925816, + "grad_norm": 0.08680576086044312, + "learning_rate": 7.289747399702823e-08, + "loss": 0.1188, + "step": 9170 + }, + { + "epoch": 27.240356083086052, + "grad_norm": 5.811917304992676, + "learning_rate": 7.286775631500743e-08, + "loss": 0.1765, + "step": 9180 + }, + { + "epoch": 27.270029673590503, + "grad_norm": 0.1709374636411667, + "learning_rate": 7.283803863298662e-08, + "loss": 0.1134, + "step": 9190 + }, + { + "epoch": 27.299703264094955, + "grad_norm": 13.11281967163086, + "learning_rate": 7.280832095096582e-08, + "loss": 0.2027, + "step": 9200 + }, + { + "epoch": 27.329376854599406, + "grad_norm": 13.69390869140625, + "learning_rate": 7.277860326894501e-08, + "loss": 0.1617, + "step": 9210 + }, + { + "epoch": 27.359050445103858, + "grad_norm": 16.85497283935547, + "learning_rate": 7.274888558692421e-08, + "loss": 0.3095, + "step": 9220 + }, + { + "epoch": 27.38872403560831, + "grad_norm": 18.251523971557617, + "learning_rate": 7.271916790490342e-08, + "loss": 0.0917, + "step": 9230 + }, + { + "epoch": 27.41839762611276, + "grad_norm": 4.406983852386475, + "learning_rate": 7.268945022288261e-08, + "loss": 0.1343, + "step": 9240 + }, + { + "epoch": 27.448071216617212, + "grad_norm": 14.804757118225098, + "learning_rate": 7.265973254086181e-08, + "loss": 0.1488, + "step": 9250 + }, + { + "epoch": 27.477744807121663, + "grad_norm": 0.039188265800476074, + "learning_rate": 7.2630014858841e-08, + "loss": 0.0108, + "step": 9260 + }, + { + "epoch": 27.50741839762611, + "grad_norm": 6.918982028961182, + "learning_rate": 7.26002971768202e-08, + "loss": 0.1224, + "step": 9270 + }, + { + "epoch": 27.537091988130562, + "grad_norm": 0.0877053365111351, + "learning_rate": 7.25705794947994e-08, + "loss": 0.0458, + "step": 9280 + }, + { + "epoch": 27.566765578635014, + "grad_norm": 0.5383042097091675, + "learning_rate": 7.25408618127786e-08, + "loss": 0.1847, + "step": 9290 + }, + { + "epoch": 27.596439169139465, + "grad_norm": 0.7012510299682617, + "learning_rate": 7.25111441307578e-08, + "loss": 0.2749, + "step": 9300 + }, + { + "epoch": 27.626112759643917, + "grad_norm": 18.36239242553711, + "learning_rate": 7.2481426448737e-08, + "loss": 0.2238, + "step": 9310 + }, + { + "epoch": 27.655786350148368, + "grad_norm": 0.14469711482524872, + "learning_rate": 7.245170876671619e-08, + "loss": 0.0319, + "step": 9320 + }, + { + "epoch": 27.68545994065282, + "grad_norm": 0.517315685749054, + "learning_rate": 7.242199108469539e-08, + "loss": 0.2179, + "step": 9330 + }, + { + "epoch": 27.71513353115727, + "grad_norm": 0.8728196620941162, + "learning_rate": 7.239227340267458e-08, + "loss": 0.0431, + "step": 9340 + }, + { + "epoch": 27.744807121661722, + "grad_norm": 15.73184585571289, + "learning_rate": 7.236255572065379e-08, + "loss": 0.212, + "step": 9350 + }, + { + "epoch": 27.774480712166174, + "grad_norm": 22.249256134033203, + "learning_rate": 7.233283803863299e-08, + "loss": 0.1816, + "step": 9360 + }, + { + "epoch": 27.80415430267062, + "grad_norm": 0.14774277806282043, + "learning_rate": 7.230312035661218e-08, + "loss": 0.0941, + "step": 9370 + }, + { + "epoch": 27.833827893175073, + "grad_norm": 6.761119365692139, + "learning_rate": 7.227340267459138e-08, + "loss": 0.1391, + "step": 9380 + }, + { + "epoch": 27.863501483679524, + "grad_norm": 0.27072492241859436, + "learning_rate": 7.224368499257057e-08, + "loss": 0.2325, + "step": 9390 + }, + { + "epoch": 27.893175074183976, + "grad_norm": 0.34211641550064087, + "learning_rate": 7.221396731054977e-08, + "loss": 0.1456, + "step": 9400 + }, + { + "epoch": 27.922848664688427, + "grad_norm": 2.435272693634033, + "learning_rate": 7.218424962852898e-08, + "loss": 0.076, + "step": 9410 + }, + { + "epoch": 27.95252225519288, + "grad_norm": 18.25292205810547, + "learning_rate": 7.215453194650817e-08, + "loss": 0.0473, + "step": 9420 + }, + { + "epoch": 27.98219584569733, + "grad_norm": 9.378666877746582, + "learning_rate": 7.212481426448737e-08, + "loss": 0.0864, + "step": 9430 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.213875412940979, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4048, + "eval_samples_per_second": 140.208, + "eval_steps_per_second": 17.643, + "step": 9436 + }, + { + "epoch": 28.01186943620178, + "grad_norm": 0.06616491079330444, + "learning_rate": 7.209509658246656e-08, + "loss": 0.0873, + "step": 9440 + }, + { + "epoch": 28.041543026706233, + "grad_norm": 35.77646255493164, + "learning_rate": 7.206537890044576e-08, + "loss": 0.21, + "step": 9450 + }, + { + "epoch": 28.071216617210684, + "grad_norm": 0.6772125959396362, + "learning_rate": 7.203566121842495e-08, + "loss": 0.1501, + "step": 9460 + }, + { + "epoch": 28.100890207715132, + "grad_norm": 0.3155253529548645, + "learning_rate": 7.200594353640416e-08, + "loss": 0.0243, + "step": 9470 + }, + { + "epoch": 28.130563798219583, + "grad_norm": 2.0341475009918213, + "learning_rate": 7.197622585438336e-08, + "loss": 0.1403, + "step": 9480 + }, + { + "epoch": 28.160237388724035, + "grad_norm": 3.7446210384368896, + "learning_rate": 7.194650817236255e-08, + "loss": 0.3227, + "step": 9490 + }, + { + "epoch": 28.189910979228486, + "grad_norm": 50.41856002807617, + "learning_rate": 7.191679049034175e-08, + "loss": 0.1571, + "step": 9500 + }, + { + "epoch": 28.219584569732937, + "grad_norm": 12.540358543395996, + "learning_rate": 7.188707280832094e-08, + "loss": 0.1572, + "step": 9510 + }, + { + "epoch": 28.24925816023739, + "grad_norm": 6.59840726852417, + "learning_rate": 7.185735512630014e-08, + "loss": 0.104, + "step": 9520 + }, + { + "epoch": 28.27893175074184, + "grad_norm": 10.150513648986816, + "learning_rate": 7.182763744427935e-08, + "loss": 0.081, + "step": 9530 + }, + { + "epoch": 28.30860534124629, + "grad_norm": 24.812952041625977, + "learning_rate": 7.179791976225854e-08, + "loss": 0.0862, + "step": 9540 + }, + { + "epoch": 28.338278931750743, + "grad_norm": 10.284539222717285, + "learning_rate": 7.176820208023774e-08, + "loss": 0.2563, + "step": 9550 + }, + { + "epoch": 28.367952522255194, + "grad_norm": 0.39233270287513733, + "learning_rate": 7.173848439821694e-08, + "loss": 0.0997, + "step": 9560 + }, + { + "epoch": 28.397626112759642, + "grad_norm": 0.9233057498931885, + "learning_rate": 7.170876671619613e-08, + "loss": 0.141, + "step": 9570 + }, + { + "epoch": 28.427299703264094, + "grad_norm": 5.379008769989014, + "learning_rate": 7.167904903417533e-08, + "loss": 0.0525, + "step": 9580 + }, + { + "epoch": 28.456973293768545, + "grad_norm": 0.2966007590293884, + "learning_rate": 7.164933135215453e-08, + "loss": 0.0404, + "step": 9590 + }, + { + "epoch": 28.486646884272997, + "grad_norm": 0.3356594145298004, + "learning_rate": 7.161961367013373e-08, + "loss": 0.0967, + "step": 9600 + }, + { + "epoch": 28.516320474777448, + "grad_norm": 10.152678489685059, + "learning_rate": 7.158989598811293e-08, + "loss": 0.2239, + "step": 9610 + }, + { + "epoch": 28.5459940652819, + "grad_norm": 10.219745635986328, + "learning_rate": 7.156017830609212e-08, + "loss": 0.1268, + "step": 9620 + }, + { + "epoch": 28.57566765578635, + "grad_norm": 0.20077843964099884, + "learning_rate": 7.153046062407132e-08, + "loss": 0.1123, + "step": 9630 + }, + { + "epoch": 28.605341246290802, + "grad_norm": 0.318429559469223, + "learning_rate": 7.150074294205051e-08, + "loss": 0.1392, + "step": 9640 + }, + { + "epoch": 28.635014836795254, + "grad_norm": 2.5006840229034424, + "learning_rate": 7.147102526002972e-08, + "loss": 0.0334, + "step": 9650 + }, + { + "epoch": 28.664688427299705, + "grad_norm": 10.184967994689941, + "learning_rate": 7.144130757800892e-08, + "loss": 0.0606, + "step": 9660 + }, + { + "epoch": 28.694362017804153, + "grad_norm": 0.3113705813884735, + "learning_rate": 7.141158989598811e-08, + "loss": 0.1335, + "step": 9670 + }, + { + "epoch": 28.724035608308604, + "grad_norm": 0.09793965518474579, + "learning_rate": 7.138187221396731e-08, + "loss": 0.222, + "step": 9680 + }, + { + "epoch": 28.753709198813056, + "grad_norm": 14.0927734375, + "learning_rate": 7.13521545319465e-08, + "loss": 0.1713, + "step": 9690 + }, + { + "epoch": 28.783382789317507, + "grad_norm": 11.582552909851074, + "learning_rate": 7.13224368499257e-08, + "loss": 0.1192, + "step": 9700 + }, + { + "epoch": 28.81305637982196, + "grad_norm": 0.4885496497154236, + "learning_rate": 7.129271916790491e-08, + "loss": 0.1136, + "step": 9710 + }, + { + "epoch": 28.84272997032641, + "grad_norm": 20.653060913085938, + "learning_rate": 7.12630014858841e-08, + "loss": 0.1361, + "step": 9720 + }, + { + "epoch": 28.87240356083086, + "grad_norm": 0.15911254286766052, + "learning_rate": 7.12332838038633e-08, + "loss": 0.1119, + "step": 9730 + }, + { + "epoch": 28.902077151335313, + "grad_norm": 17.15968132019043, + "learning_rate": 7.12035661218425e-08, + "loss": 0.1988, + "step": 9740 + }, + { + "epoch": 28.931750741839764, + "grad_norm": 9.770752906799316, + "learning_rate": 7.117384843982169e-08, + "loss": 0.1614, + "step": 9750 + }, + { + "epoch": 28.961424332344215, + "grad_norm": 0.5269460678100586, + "learning_rate": 7.114413075780089e-08, + "loss": 0.2585, + "step": 9760 + }, + { + "epoch": 28.991097922848663, + "grad_norm": 7.4492573738098145, + "learning_rate": 7.11144130757801e-08, + "loss": 0.26, + "step": 9770 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.2143992781639099, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3789, + "eval_samples_per_second": 140.777, + "eval_steps_per_second": 17.715, + "step": 9773 + }, + { + "epoch": 29.020771513353115, + "grad_norm": 35.07952880859375, + "learning_rate": 7.108469539375929e-08, + "loss": 0.2681, + "step": 9780 + }, + { + "epoch": 29.050445103857566, + "grad_norm": 2.5816049575805664, + "learning_rate": 7.105497771173848e-08, + "loss": 0.0171, + "step": 9790 + }, + { + "epoch": 29.080118694362017, + "grad_norm": 1.2838541269302368, + "learning_rate": 7.102526002971768e-08, + "loss": 0.0273, + "step": 9800 + }, + { + "epoch": 29.10979228486647, + "grad_norm": 23.768911361694336, + "learning_rate": 7.099554234769688e-08, + "loss": 0.0473, + "step": 9810 + }, + { + "epoch": 29.13946587537092, + "grad_norm": 0.20700640976428986, + "learning_rate": 7.096582466567607e-08, + "loss": 0.1786, + "step": 9820 + }, + { + "epoch": 29.16913946587537, + "grad_norm": 0.22553694248199463, + "learning_rate": 7.093610698365528e-08, + "loss": 0.318, + "step": 9830 + }, + { + "epoch": 29.198813056379823, + "grad_norm": 18.547142028808594, + "learning_rate": 7.090638930163448e-08, + "loss": 0.1096, + "step": 9840 + }, + { + "epoch": 29.228486646884274, + "grad_norm": 5.8549675941467285, + "learning_rate": 7.087667161961367e-08, + "loss": 0.2535, + "step": 9850 + }, + { + "epoch": 29.258160237388726, + "grad_norm": 4.791098117828369, + "learning_rate": 7.084695393759287e-08, + "loss": 0.1674, + "step": 9860 + }, + { + "epoch": 29.287833827893174, + "grad_norm": 1.0138534307479858, + "learning_rate": 7.081723625557206e-08, + "loss": 0.0432, + "step": 9870 + }, + { + "epoch": 29.317507418397625, + "grad_norm": 0.142008438706398, + "learning_rate": 7.078751857355126e-08, + "loss": 0.2331, + "step": 9880 + }, + { + "epoch": 29.347181008902076, + "grad_norm": 0.07170213013887405, + "learning_rate": 7.075780089153047e-08, + "loss": 0.1234, + "step": 9890 + }, + { + "epoch": 29.376854599406528, + "grad_norm": 2.4687163829803467, + "learning_rate": 7.072808320950966e-08, + "loss": 0.0855, + "step": 9900 + }, + { + "epoch": 29.40652818991098, + "grad_norm": 1.3472635746002197, + "learning_rate": 7.069836552748886e-08, + "loss": 0.0895, + "step": 9910 + }, + { + "epoch": 29.43620178041543, + "grad_norm": 0.036670003086328506, + "learning_rate": 7.066864784546805e-08, + "loss": 0.1002, + "step": 9920 + }, + { + "epoch": 29.465875370919882, + "grad_norm": 4.962029457092285, + "learning_rate": 7.063893016344725e-08, + "loss": 0.1256, + "step": 9930 + }, + { + "epoch": 29.495548961424333, + "grad_norm": 0.1418740451335907, + "learning_rate": 7.060921248142644e-08, + "loss": 0.0722, + "step": 9940 + }, + { + "epoch": 29.525222551928785, + "grad_norm": 12.74774169921875, + "learning_rate": 7.057949479940565e-08, + "loss": 0.0446, + "step": 9950 + }, + { + "epoch": 29.554896142433236, + "grad_norm": 0.5510703325271606, + "learning_rate": 7.054977711738485e-08, + "loss": 0.2182, + "step": 9960 + }, + { + "epoch": 29.584569732937684, + "grad_norm": 5.627580642700195, + "learning_rate": 7.052005943536404e-08, + "loss": 0.1954, + "step": 9970 + }, + { + "epoch": 29.614243323442135, + "grad_norm": 23.736068725585938, + "learning_rate": 7.049034175334324e-08, + "loss": 0.1077, + "step": 9980 + }, + { + "epoch": 29.643916913946587, + "grad_norm": 1.1469058990478516, + "learning_rate": 7.046062407132243e-08, + "loss": 0.1585, + "step": 9990 + }, + { + "epoch": 29.673590504451038, + "grad_norm": 1.3916414976119995, + "learning_rate": 7.043090638930163e-08, + "loss": 0.1552, + "step": 10000 + }, + { + "epoch": 29.70326409495549, + "grad_norm": 0.07030478864908218, + "learning_rate": 7.040118870728084e-08, + "loss": 0.047, + "step": 10010 + }, + { + "epoch": 29.73293768545994, + "grad_norm": 14.721003532409668, + "learning_rate": 7.037147102526003e-08, + "loss": 0.0943, + "step": 10020 + }, + { + "epoch": 29.762611275964392, + "grad_norm": 33.61404037475586, + "learning_rate": 7.034175334323923e-08, + "loss": 0.2432, + "step": 10030 + }, + { + "epoch": 29.792284866468844, + "grad_norm": 0.7879377007484436, + "learning_rate": 7.031203566121841e-08, + "loss": 0.1431, + "step": 10040 + }, + { + "epoch": 29.821958456973295, + "grad_norm": 0.045274194329977036, + "learning_rate": 7.028231797919761e-08, + "loss": 0.2347, + "step": 10050 + }, + { + "epoch": 29.851632047477743, + "grad_norm": 2.9704360961914062, + "learning_rate": 7.025260029717682e-08, + "loss": 0.1449, + "step": 10060 + }, + { + "epoch": 29.881305637982194, + "grad_norm": 35.38002395629883, + "learning_rate": 7.022288261515601e-08, + "loss": 0.1022, + "step": 10070 + }, + { + "epoch": 29.910979228486646, + "grad_norm": 20.898221969604492, + "learning_rate": 7.019316493313521e-08, + "loss": 0.1312, + "step": 10080 + }, + { + "epoch": 29.940652818991097, + "grad_norm": 4.645596504211426, + "learning_rate": 7.01634472511144e-08, + "loss": 0.0422, + "step": 10090 + }, + { + "epoch": 29.97032640949555, + "grad_norm": 4.1949992179870605, + "learning_rate": 7.01337295690936e-08, + "loss": 0.1197, + "step": 10100 + }, + { + "epoch": 30.0, + "grad_norm": 0.07645130157470703, + "learning_rate": 7.01040118870728e-08, + "loss": 0.1031, + "step": 10110 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.2152058184146881, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3847, + "eval_samples_per_second": 140.649, + "eval_steps_per_second": 17.699, + "step": 10110 + }, + { + "epoch": 30.02967359050445, + "grad_norm": 18.407175064086914, + "learning_rate": 7.0074294205052e-08, + "loss": 0.1606, + "step": 10120 + }, + { + "epoch": 30.059347181008903, + "grad_norm": 32.93033218383789, + "learning_rate": 7.00445765230312e-08, + "loss": 0.1364, + "step": 10130 + }, + { + "epoch": 30.089020771513354, + "grad_norm": 8.53589153289795, + "learning_rate": 7.00148588410104e-08, + "loss": 0.2594, + "step": 10140 + }, + { + "epoch": 30.118694362017806, + "grad_norm": 0.39872369170188904, + "learning_rate": 6.998514115898959e-08, + "loss": 0.0469, + "step": 10150 + }, + { + "epoch": 30.148367952522253, + "grad_norm": 6.917888641357422, + "learning_rate": 6.995542347696878e-08, + "loss": 0.0789, + "step": 10160 + }, + { + "epoch": 30.178041543026705, + "grad_norm": 5.317733287811279, + "learning_rate": 6.992570579494798e-08, + "loss": 0.1094, + "step": 10170 + }, + { + "epoch": 30.207715133531156, + "grad_norm": 0.1805935651063919, + "learning_rate": 6.989598811292719e-08, + "loss": 0.0132, + "step": 10180 + }, + { + "epoch": 30.237388724035608, + "grad_norm": 0.26664167642593384, + "learning_rate": 6.986627043090638e-08, + "loss": 0.1198, + "step": 10190 + }, + { + "epoch": 30.26706231454006, + "grad_norm": 18.26850128173828, + "learning_rate": 6.983655274888558e-08, + "loss": 0.1894, + "step": 10200 + }, + { + "epoch": 30.29673590504451, + "grad_norm": 0.15071265399456024, + "learning_rate": 6.980683506686478e-08, + "loss": 0.175, + "step": 10210 + }, + { + "epoch": 30.326409495548962, + "grad_norm": 4.030440330505371, + "learning_rate": 6.977711738484397e-08, + "loss": 0.1898, + "step": 10220 + }, + { + "epoch": 30.356083086053413, + "grad_norm": 0.7381828427314758, + "learning_rate": 6.974739970282317e-08, + "loss": 0.224, + "step": 10230 + }, + { + "epoch": 30.385756676557865, + "grad_norm": 0.43963128328323364, + "learning_rate": 6.971768202080238e-08, + "loss": 0.089, + "step": 10240 + }, + { + "epoch": 30.415430267062316, + "grad_norm": 0.04306118190288544, + "learning_rate": 6.968796433878157e-08, + "loss": 0.1112, + "step": 10250 + }, + { + "epoch": 30.445103857566764, + "grad_norm": 22.21747398376465, + "learning_rate": 6.965824665676077e-08, + "loss": 0.2383, + "step": 10260 + }, + { + "epoch": 30.474777448071215, + "grad_norm": 0.10677751153707504, + "learning_rate": 6.962852897473996e-08, + "loss": 0.1435, + "step": 10270 + }, + { + "epoch": 30.504451038575667, + "grad_norm": 0.21080735325813293, + "learning_rate": 6.959881129271916e-08, + "loss": 0.0619, + "step": 10280 + }, + { + "epoch": 30.534124629080118, + "grad_norm": 0.6340047717094421, + "learning_rate": 6.956909361069835e-08, + "loss": 0.0299, + "step": 10290 + }, + { + "epoch": 30.56379821958457, + "grad_norm": 0.2504396438598633, + "learning_rate": 6.953937592867756e-08, + "loss": 0.0297, + "step": 10300 + }, + { + "epoch": 30.59347181008902, + "grad_norm": 11.46859073638916, + "learning_rate": 6.950965824665676e-08, + "loss": 0.1488, + "step": 10310 + }, + { + "epoch": 30.623145400593472, + "grad_norm": 1.7051576375961304, + "learning_rate": 6.947994056463595e-08, + "loss": 0.153, + "step": 10320 + }, + { + "epoch": 30.652818991097924, + "grad_norm": 0.4951653778553009, + "learning_rate": 6.945022288261515e-08, + "loss": 0.2455, + "step": 10330 + }, + { + "epoch": 30.682492581602375, + "grad_norm": 0.10081147402524948, + "learning_rate": 6.942050520059434e-08, + "loss": 0.1339, + "step": 10340 + }, + { + "epoch": 30.712166172106826, + "grad_norm": 18.03048324584961, + "learning_rate": 6.939078751857355e-08, + "loss": 0.1108, + "step": 10350 + }, + { + "epoch": 30.741839762611274, + "grad_norm": 25.381195068359375, + "learning_rate": 6.936106983655275e-08, + "loss": 0.3413, + "step": 10360 + }, + { + "epoch": 30.771513353115726, + "grad_norm": 0.1660955846309662, + "learning_rate": 6.933135215453194e-08, + "loss": 0.2847, + "step": 10370 + }, + { + "epoch": 30.801186943620177, + "grad_norm": 1.8214088678359985, + "learning_rate": 6.930163447251114e-08, + "loss": 0.1403, + "step": 10380 + }, + { + "epoch": 30.83086053412463, + "grad_norm": 1.269321084022522, + "learning_rate": 6.927191679049033e-08, + "loss": 0.1153, + "step": 10390 + }, + { + "epoch": 30.86053412462908, + "grad_norm": 0.10031243413686752, + "learning_rate": 6.924219910846953e-08, + "loss": 0.0866, + "step": 10400 + }, + { + "epoch": 30.89020771513353, + "grad_norm": 0.14195209741592407, + "learning_rate": 6.921248142644874e-08, + "loss": 0.0715, + "step": 10410 + }, + { + "epoch": 30.919881305637983, + "grad_norm": 0.09193539619445801, + "learning_rate": 6.918276374442793e-08, + "loss": 0.0732, + "step": 10420 + }, + { + "epoch": 30.949554896142434, + "grad_norm": 0.2509472370147705, + "learning_rate": 6.915304606240713e-08, + "loss": 0.0402, + "step": 10430 + }, + { + "epoch": 30.979228486646885, + "grad_norm": 5.327921390533447, + "learning_rate": 6.912332838038633e-08, + "loss": 0.0613, + "step": 10440 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.21542876958847046, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.411, + "eval_samples_per_second": 140.072, + "eval_steps_per_second": 17.626, + "step": 10447 + }, + { + "epoch": 31.008902077151337, + "grad_norm": 2.1399450302124023, + "learning_rate": 6.909361069836552e-08, + "loss": 0.0529, + "step": 10450 + }, + { + "epoch": 31.038575667655785, + "grad_norm": 26.31960678100586, + "learning_rate": 6.906389301634472e-08, + "loss": 0.3618, + "step": 10460 + }, + { + "epoch": 31.068249258160236, + "grad_norm": 0.4055666923522949, + "learning_rate": 6.903417533432392e-08, + "loss": 0.2399, + "step": 10470 + }, + { + "epoch": 31.097922848664687, + "grad_norm": 12.070892333984375, + "learning_rate": 6.900445765230312e-08, + "loss": 0.152, + "step": 10480 + }, + { + "epoch": 31.12759643916914, + "grad_norm": 19.371234893798828, + "learning_rate": 6.897473997028232e-08, + "loss": 0.0609, + "step": 10490 + }, + { + "epoch": 31.15727002967359, + "grad_norm": 8.843703269958496, + "learning_rate": 6.894502228826151e-08, + "loss": 0.2641, + "step": 10500 + }, + { + "epoch": 31.18694362017804, + "grad_norm": 10.414791107177734, + "learning_rate": 6.891530460624071e-08, + "loss": 0.1953, + "step": 10510 + }, + { + "epoch": 31.216617210682493, + "grad_norm": 0.9000236392021179, + "learning_rate": 6.88855869242199e-08, + "loss": 0.1927, + "step": 10520 + }, + { + "epoch": 31.246290801186944, + "grad_norm": 17.683856964111328, + "learning_rate": 6.885586924219911e-08, + "loss": 0.0798, + "step": 10530 + }, + { + "epoch": 31.275964391691396, + "grad_norm": 0.4843658208847046, + "learning_rate": 6.882615156017831e-08, + "loss": 0.0835, + "step": 10540 + }, + { + "epoch": 31.305637982195847, + "grad_norm": 0.030627114698290825, + "learning_rate": 6.87964338781575e-08, + "loss": 0.1786, + "step": 10550 + }, + { + "epoch": 31.335311572700295, + "grad_norm": 0.02275431901216507, + "learning_rate": 6.87667161961367e-08, + "loss": 0.0715, + "step": 10560 + }, + { + "epoch": 31.364985163204746, + "grad_norm": 1.8264684677124023, + "learning_rate": 6.873699851411589e-08, + "loss": 0.0936, + "step": 10570 + }, + { + "epoch": 31.394658753709198, + "grad_norm": 0.4593985974788666, + "learning_rate": 6.870728083209509e-08, + "loss": 0.2097, + "step": 10580 + }, + { + "epoch": 31.42433234421365, + "grad_norm": 0.04579648748040199, + "learning_rate": 6.86775631500743e-08, + "loss": 0.0538, + "step": 10590 + }, + { + "epoch": 31.4540059347181, + "grad_norm": 5.442652702331543, + "learning_rate": 6.864784546805349e-08, + "loss": 0.0384, + "step": 10600 + }, + { + "epoch": 31.483679525222552, + "grad_norm": 24.255887985229492, + "learning_rate": 6.861812778603269e-08, + "loss": 0.1753, + "step": 10610 + }, + { + "epoch": 31.513353115727003, + "grad_norm": 9.804685592651367, + "learning_rate": 6.858841010401188e-08, + "loss": 0.0415, + "step": 10620 + }, + { + "epoch": 31.543026706231455, + "grad_norm": 22.275009155273438, + "learning_rate": 6.855869242199108e-08, + "loss": 0.0805, + "step": 10630 + }, + { + "epoch": 31.572700296735906, + "grad_norm": 3.58140230178833, + "learning_rate": 6.852897473997028e-08, + "loss": 0.0583, + "step": 10640 + }, + { + "epoch": 31.602373887240358, + "grad_norm": 1.1739048957824707, + "learning_rate": 6.849925705794948e-08, + "loss": 0.2018, + "step": 10650 + }, + { + "epoch": 31.632047477744806, + "grad_norm": 0.9201865196228027, + "learning_rate": 6.846953937592868e-08, + "loss": 0.1539, + "step": 10660 + }, + { + "epoch": 31.661721068249257, + "grad_norm": 1.2981352806091309, + "learning_rate": 6.843982169390787e-08, + "loss": 0.0506, + "step": 10670 + }, + { + "epoch": 31.69139465875371, + "grad_norm": 0.5527468323707581, + "learning_rate": 6.841010401188707e-08, + "loss": 0.0962, + "step": 10680 + }, + { + "epoch": 31.72106824925816, + "grad_norm": 0.8009069561958313, + "learning_rate": 6.838038632986627e-08, + "loss": 0.0537, + "step": 10690 + }, + { + "epoch": 31.75074183976261, + "grad_norm": 2.2491776943206787, + "learning_rate": 6.835066864784546e-08, + "loss": 0.2179, + "step": 10700 + }, + { + "epoch": 31.780415430267063, + "grad_norm": 2.4115114212036133, + "learning_rate": 6.832095096582467e-08, + "loss": 0.1705, + "step": 10710 + }, + { + "epoch": 31.810089020771514, + "grad_norm": 18.621286392211914, + "learning_rate": 6.829123328380387e-08, + "loss": 0.115, + "step": 10720 + }, + { + "epoch": 31.839762611275965, + "grad_norm": 6.847456932067871, + "learning_rate": 6.826151560178306e-08, + "loss": 0.132, + "step": 10730 + }, + { + "epoch": 31.869436201780417, + "grad_norm": 1.9993690252304077, + "learning_rate": 6.823179791976226e-08, + "loss": 0.0522, + "step": 10740 + }, + { + "epoch": 31.899109792284868, + "grad_norm": 13.016792297363281, + "learning_rate": 6.820208023774145e-08, + "loss": 0.1078, + "step": 10750 + }, + { + "epoch": 31.928783382789316, + "grad_norm": 0.5611691474914551, + "learning_rate": 6.817236255572065e-08, + "loss": 0.201, + "step": 10760 + }, + { + "epoch": 31.958456973293767, + "grad_norm": 7.149682521820068, + "learning_rate": 6.814264487369986e-08, + "loss": 0.083, + "step": 10770 + }, + { + "epoch": 31.98813056379822, + "grad_norm": 0.4160104990005493, + "learning_rate": 6.811292719167905e-08, + "loss": 0.2465, + "step": 10780 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.21565334498882294, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4286, + "eval_samples_per_second": 139.689, + "eval_steps_per_second": 17.578, + "step": 10784 + }, + { + "epoch": 32.017804154302674, + "grad_norm": 14.835113525390625, + "learning_rate": 6.808320950965825e-08, + "loss": 0.2548, + "step": 10790 + }, + { + "epoch": 32.047477744807125, + "grad_norm": 0.09177793562412262, + "learning_rate": 6.805349182763744e-08, + "loss": 0.1093, + "step": 10800 + }, + { + "epoch": 32.07715133531157, + "grad_norm": 14.508550643920898, + "learning_rate": 6.802377414561664e-08, + "loss": 0.1074, + "step": 10810 + }, + { + "epoch": 32.10682492581602, + "grad_norm": 0.14697951078414917, + "learning_rate": 6.799405646359583e-08, + "loss": 0.1838, + "step": 10820 + }, + { + "epoch": 32.13649851632047, + "grad_norm": 9.639199256896973, + "learning_rate": 6.796433878157504e-08, + "loss": 0.0747, + "step": 10830 + }, + { + "epoch": 32.166172106824924, + "grad_norm": 3.3744521141052246, + "learning_rate": 6.793462109955424e-08, + "loss": 0.1067, + "step": 10840 + }, + { + "epoch": 32.195845697329375, + "grad_norm": 0.42372533679008484, + "learning_rate": 6.790490341753343e-08, + "loss": 0.1806, + "step": 10850 + }, + { + "epoch": 32.225519287833826, + "grad_norm": 23.723491668701172, + "learning_rate": 6.787518573551263e-08, + "loss": 0.1568, + "step": 10860 + }, + { + "epoch": 32.25519287833828, + "grad_norm": 0.09757594019174576, + "learning_rate": 6.784546805349182e-08, + "loss": 0.1336, + "step": 10870 + }, + { + "epoch": 32.28486646884273, + "grad_norm": 0.7805241346359253, + "learning_rate": 6.781575037147102e-08, + "loss": 0.0215, + "step": 10880 + }, + { + "epoch": 32.31454005934718, + "grad_norm": 0.33914613723754883, + "learning_rate": 6.778603268945023e-08, + "loss": 0.2067, + "step": 10890 + }, + { + "epoch": 32.34421364985163, + "grad_norm": 0.19971874356269836, + "learning_rate": 6.775631500742942e-08, + "loss": 0.2108, + "step": 10900 + }, + { + "epoch": 32.37388724035608, + "grad_norm": 0.07293481379747391, + "learning_rate": 6.772659732540862e-08, + "loss": 0.1853, + "step": 10910 + }, + { + "epoch": 32.403560830860535, + "grad_norm": 11.6432466506958, + "learning_rate": 6.769687964338782e-08, + "loss": 0.1689, + "step": 10920 + }, + { + "epoch": 32.433234421364986, + "grad_norm": 0.10581853985786438, + "learning_rate": 6.766716196136701e-08, + "loss": 0.0624, + "step": 10930 + }, + { + "epoch": 32.46290801186944, + "grad_norm": 0.0651131346821785, + "learning_rate": 6.76374442793462e-08, + "loss": 0.0431, + "step": 10940 + }, + { + "epoch": 32.49258160237389, + "grad_norm": 36.12128448486328, + "learning_rate": 6.760772659732542e-08, + "loss": 0.1154, + "step": 10950 + }, + { + "epoch": 32.52225519287834, + "grad_norm": 2.7105534076690674, + "learning_rate": 6.757800891530461e-08, + "loss": 0.0516, + "step": 10960 + }, + { + "epoch": 32.55192878338279, + "grad_norm": 3.6497366428375244, + "learning_rate": 6.75482912332838e-08, + "loss": 0.104, + "step": 10970 + }, + { + "epoch": 32.58160237388724, + "grad_norm": 13.271821975708008, + "learning_rate": 6.7518573551263e-08, + "loss": 0.1147, + "step": 10980 + }, + { + "epoch": 32.611275964391695, + "grad_norm": 15.474771499633789, + "learning_rate": 6.74888558692422e-08, + "loss": 0.1753, + "step": 10990 + }, + { + "epoch": 32.640949554896146, + "grad_norm": 12.432657241821289, + "learning_rate": 6.745913818722139e-08, + "loss": 0.1554, + "step": 11000 + }, + { + "epoch": 32.67062314540059, + "grad_norm": 3.970877170562744, + "learning_rate": 6.74294205052006e-08, + "loss": 0.0769, + "step": 11010 + }, + { + "epoch": 32.70029673590504, + "grad_norm": 1.6898128986358643, + "learning_rate": 6.73997028231798e-08, + "loss": 0.0421, + "step": 11020 + }, + { + "epoch": 32.72997032640949, + "grad_norm": 0.2796095311641693, + "learning_rate": 6.736998514115899e-08, + "loss": 0.0938, + "step": 11030 + }, + { + "epoch": 32.759643916913944, + "grad_norm": 2.4856910705566406, + "learning_rate": 6.734026745913819e-08, + "loss": 0.1956, + "step": 11040 + }, + { + "epoch": 32.789317507418396, + "grad_norm": 0.07821241766214371, + "learning_rate": 6.731054977711738e-08, + "loss": 0.13, + "step": 11050 + }, + { + "epoch": 32.81899109792285, + "grad_norm": 0.2647961378097534, + "learning_rate": 6.728083209509658e-08, + "loss": 0.0354, + "step": 11060 + }, + { + "epoch": 32.8486646884273, + "grad_norm": 15.240720748901367, + "learning_rate": 6.725111441307579e-08, + "loss": 0.0915, + "step": 11070 + }, + { + "epoch": 32.87833827893175, + "grad_norm": 17.700641632080078, + "learning_rate": 6.722139673105498e-08, + "loss": 0.1672, + "step": 11080 + }, + { + "epoch": 32.9080118694362, + "grad_norm": 14.141181945800781, + "learning_rate": 6.719167904903418e-08, + "loss": 0.1348, + "step": 11090 + }, + { + "epoch": 32.93768545994065, + "grad_norm": 29.565324783325195, + "learning_rate": 6.716196136701336e-08, + "loss": 0.3313, + "step": 11100 + }, + { + "epoch": 32.967359050445104, + "grad_norm": 0.2029770463705063, + "learning_rate": 6.713224368499256e-08, + "loss": 0.1429, + "step": 11110 + }, + { + "epoch": 32.997032640949556, + "grad_norm": 12.855354309082031, + "learning_rate": 6.710252600297177e-08, + "loss": 0.1864, + "step": 11120 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.2169022560119629, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3739, + "eval_samples_per_second": 140.887, + "eval_steps_per_second": 17.729, + "step": 11121 + }, + { + "epoch": 33.02670623145401, + "grad_norm": 0.2877938151359558, + "learning_rate": 6.707280832095096e-08, + "loss": 0.3319, + "step": 11130 + }, + { + "epoch": 33.05637982195846, + "grad_norm": 46.61880874633789, + "learning_rate": 6.704309063893016e-08, + "loss": 0.1644, + "step": 11140 + }, + { + "epoch": 33.08605341246291, + "grad_norm": 11.891273498535156, + "learning_rate": 6.701337295690935e-08, + "loss": 0.1252, + "step": 11150 + }, + { + "epoch": 33.11572700296736, + "grad_norm": 15.003673553466797, + "learning_rate": 6.698365527488855e-08, + "loss": 0.1203, + "step": 11160 + }, + { + "epoch": 33.14540059347181, + "grad_norm": 9.080204963684082, + "learning_rate": 6.695393759286774e-08, + "loss": 0.1557, + "step": 11170 + }, + { + "epoch": 33.175074183976264, + "grad_norm": 0.7183218598365784, + "learning_rate": 6.692421991084695e-08, + "loss": 0.2822, + "step": 11180 + }, + { + "epoch": 33.204747774480715, + "grad_norm": 5.9455342292785645, + "learning_rate": 6.689450222882615e-08, + "loss": 0.0264, + "step": 11190 + }, + { + "epoch": 33.23442136498517, + "grad_norm": 11.834450721740723, + "learning_rate": 6.686478454680534e-08, + "loss": 0.1438, + "step": 11200 + }, + { + "epoch": 33.26409495548961, + "grad_norm": 1.645504355430603, + "learning_rate": 6.683506686478454e-08, + "loss": 0.0297, + "step": 11210 + }, + { + "epoch": 33.29376854599406, + "grad_norm": 25.478788375854492, + "learning_rate": 6.680534918276373e-08, + "loss": 0.1121, + "step": 11220 + }, + { + "epoch": 33.323442136498514, + "grad_norm": 1.1987559795379639, + "learning_rate": 6.677563150074293e-08, + "loss": 0.0628, + "step": 11230 + }, + { + "epoch": 33.353115727002965, + "grad_norm": 0.05146918073296547, + "learning_rate": 6.674591381872214e-08, + "loss": 0.1223, + "step": 11240 + }, + { + "epoch": 33.38278931750742, + "grad_norm": 20.845888137817383, + "learning_rate": 6.671619613670133e-08, + "loss": 0.2609, + "step": 11250 + }, + { + "epoch": 33.41246290801187, + "grad_norm": 0.6910379528999329, + "learning_rate": 6.668647845468053e-08, + "loss": 0.1932, + "step": 11260 + }, + { + "epoch": 33.44213649851632, + "grad_norm": 2.6223971843719482, + "learning_rate": 6.665676077265972e-08, + "loss": 0.0267, + "step": 11270 + }, + { + "epoch": 33.47181008902077, + "grad_norm": 2.652859687805176, + "learning_rate": 6.662704309063892e-08, + "loss": 0.2461, + "step": 11280 + }, + { + "epoch": 33.50148367952522, + "grad_norm": 0.8244590163230896, + "learning_rate": 6.659732540861812e-08, + "loss": 0.0482, + "step": 11290 + }, + { + "epoch": 33.531157270029674, + "grad_norm": 12.9890775680542, + "learning_rate": 6.656760772659732e-08, + "loss": 0.1251, + "step": 11300 + }, + { + "epoch": 33.560830860534125, + "grad_norm": 0.044464677572250366, + "learning_rate": 6.653789004457652e-08, + "loss": 0.1, + "step": 11310 + }, + { + "epoch": 33.590504451038576, + "grad_norm": 0.34065109491348267, + "learning_rate": 6.650817236255572e-08, + "loss": 0.0113, + "step": 11320 + }, + { + "epoch": 33.62017804154303, + "grad_norm": 0.365622341632843, + "learning_rate": 6.647845468053491e-08, + "loss": 0.3178, + "step": 11330 + }, + { + "epoch": 33.64985163204748, + "grad_norm": 0.31258487701416016, + "learning_rate": 6.64487369985141e-08, + "loss": 0.1976, + "step": 11340 + }, + { + "epoch": 33.67952522255193, + "grad_norm": 8.225257873535156, + "learning_rate": 6.64190193164933e-08, + "loss": 0.1972, + "step": 11350 + }, + { + "epoch": 33.70919881305638, + "grad_norm": 0.36410441994667053, + "learning_rate": 6.638930163447251e-08, + "loss": 0.0178, + "step": 11360 + }, + { + "epoch": 33.73887240356083, + "grad_norm": 3.727329730987549, + "learning_rate": 6.63595839524517e-08, + "loss": 0.0739, + "step": 11370 + }, + { + "epoch": 33.768545994065285, + "grad_norm": 17.45574188232422, + "learning_rate": 6.63298662704309e-08, + "loss": 0.0331, + "step": 11380 + }, + { + "epoch": 33.798219584569736, + "grad_norm": 3.8069872856140137, + "learning_rate": 6.63001485884101e-08, + "loss": 0.0929, + "step": 11390 + }, + { + "epoch": 33.82789317507418, + "grad_norm": 15.318708419799805, + "learning_rate": 6.627043090638929e-08, + "loss": 0.1776, + "step": 11400 + }, + { + "epoch": 33.85756676557863, + "grad_norm": 0.7170237302780151, + "learning_rate": 6.624071322436849e-08, + "loss": 0.2378, + "step": 11410 + }, + { + "epoch": 33.88724035608308, + "grad_norm": 0.11917033046483994, + "learning_rate": 6.62109955423477e-08, + "loss": 0.0571, + "step": 11420 + }, + { + "epoch": 33.916913946587535, + "grad_norm": 2.264817953109741, + "learning_rate": 6.618127786032689e-08, + "loss": 0.1845, + "step": 11430 + }, + { + "epoch": 33.946587537091986, + "grad_norm": 21.306093215942383, + "learning_rate": 6.615156017830609e-08, + "loss": 0.1026, + "step": 11440 + }, + { + "epoch": 33.97626112759644, + "grad_norm": 0.15265806019306183, + "learning_rate": 6.612184249628528e-08, + "loss": 0.0801, + "step": 11450 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.21683864295482635, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4548, + "eval_samples_per_second": 139.122, + "eval_steps_per_second": 17.506, + "step": 11458 + }, + { + "epoch": 34.00593471810089, + "grad_norm": 0.16715925931930542, + "learning_rate": 6.609212481426448e-08, + "loss": 0.1415, + "step": 11460 + }, + { + "epoch": 34.03560830860534, + "grad_norm": 3.2949681282043457, + "learning_rate": 6.606240713224367e-08, + "loss": 0.1001, + "step": 11470 + }, + { + "epoch": 34.06528189910979, + "grad_norm": 1.0207877159118652, + "learning_rate": 6.603268945022288e-08, + "loss": 0.1254, + "step": 11480 + }, + { + "epoch": 34.09495548961424, + "grad_norm": 0.586432695388794, + "learning_rate": 6.600297176820208e-08, + "loss": 0.02, + "step": 11490 + }, + { + "epoch": 34.124629080118694, + "grad_norm": 2.111340284347534, + "learning_rate": 6.597325408618127e-08, + "loss": 0.1796, + "step": 11500 + }, + { + "epoch": 34.154302670623146, + "grad_norm": 0.8200799226760864, + "learning_rate": 6.594353640416047e-08, + "loss": 0.0086, + "step": 11510 + }, + { + "epoch": 34.1839762611276, + "grad_norm": 5.858191967010498, + "learning_rate": 6.591381872213967e-08, + "loss": 0.0369, + "step": 11520 + }, + { + "epoch": 34.21364985163205, + "grad_norm": 22.88271141052246, + "learning_rate": 6.588410104011886e-08, + "loss": 0.1083, + "step": 11530 + }, + { + "epoch": 34.2433234421365, + "grad_norm": 5.103590488433838, + "learning_rate": 6.585438335809807e-08, + "loss": 0.0291, + "step": 11540 + }, + { + "epoch": 34.27299703264095, + "grad_norm": 0.22018510103225708, + "learning_rate": 6.582466567607726e-08, + "loss": 0.0623, + "step": 11550 + }, + { + "epoch": 34.3026706231454, + "grad_norm": 35.32367706298828, + "learning_rate": 6.579494799405646e-08, + "loss": 0.0878, + "step": 11560 + }, + { + "epoch": 34.332344213649854, + "grad_norm": 28.14548683166504, + "learning_rate": 6.576523031203566e-08, + "loss": 0.0704, + "step": 11570 + }, + { + "epoch": 34.362017804154306, + "grad_norm": 0.592730700969696, + "learning_rate": 6.573551263001485e-08, + "loss": 0.1003, + "step": 11580 + }, + { + "epoch": 34.39169139465876, + "grad_norm": 16.464004516601562, + "learning_rate": 6.570579494799405e-08, + "loss": 0.1123, + "step": 11590 + }, + { + "epoch": 34.4213649851632, + "grad_norm": 2.835178852081299, + "learning_rate": 6.567607726597326e-08, + "loss": 0.0876, + "step": 11600 + }, + { + "epoch": 34.45103857566765, + "grad_norm": 40.948143005371094, + "learning_rate": 6.564635958395245e-08, + "loss": 0.1868, + "step": 11610 + }, + { + "epoch": 34.480712166172104, + "grad_norm": 12.819151878356934, + "learning_rate": 6.561664190193165e-08, + "loss": 0.0751, + "step": 11620 + }, + { + "epoch": 34.510385756676556, + "grad_norm": 16.01125144958496, + "learning_rate": 6.558692421991084e-08, + "loss": 0.1098, + "step": 11630 + }, + { + "epoch": 34.54005934718101, + "grad_norm": 16.244918823242188, + "learning_rate": 6.555720653789004e-08, + "loss": 0.1466, + "step": 11640 + }, + { + "epoch": 34.56973293768546, + "grad_norm": 9.997687339782715, + "learning_rate": 6.552748885586923e-08, + "loss": 0.2007, + "step": 11650 + }, + { + "epoch": 34.59940652818991, + "grad_norm": 0.41997459530830383, + "learning_rate": 6.549777117384844e-08, + "loss": 0.0791, + "step": 11660 + }, + { + "epoch": 34.62908011869436, + "grad_norm": 1.4188556671142578, + "learning_rate": 6.546805349182764e-08, + "loss": 0.1031, + "step": 11670 + }, + { + "epoch": 34.65875370919881, + "grad_norm": 23.641469955444336, + "learning_rate": 6.543833580980683e-08, + "loss": 0.3617, + "step": 11680 + }, + { + "epoch": 34.688427299703264, + "grad_norm": 4.543605804443359, + "learning_rate": 6.540861812778603e-08, + "loss": 0.2071, + "step": 11690 + }, + { + "epoch": 34.718100890207715, + "grad_norm": 2.1229164600372314, + "learning_rate": 6.537890044576522e-08, + "loss": 0.0307, + "step": 11700 + }, + { + "epoch": 34.74777448071217, + "grad_norm": 5.018290042877197, + "learning_rate": 6.534918276374442e-08, + "loss": 0.0109, + "step": 11710 + }, + { + "epoch": 34.77744807121662, + "grad_norm": 1.5329301357269287, + "learning_rate": 6.531946508172363e-08, + "loss": 0.0462, + "step": 11720 + }, + { + "epoch": 34.80712166172107, + "grad_norm": 21.17132568359375, + "learning_rate": 6.528974739970282e-08, + "loss": 0.0186, + "step": 11730 + }, + { + "epoch": 34.83679525222552, + "grad_norm": 0.6158196330070496, + "learning_rate": 6.526002971768202e-08, + "loss": 0.1362, + "step": 11740 + }, + { + "epoch": 34.86646884272997, + "grad_norm": 14.483859062194824, + "learning_rate": 6.523031203566121e-08, + "loss": 0.1754, + "step": 11750 + }, + { + "epoch": 34.896142433234424, + "grad_norm": 0.14224819839000702, + "learning_rate": 6.520059435364041e-08, + "loss": 0.1176, + "step": 11760 + }, + { + "epoch": 34.925816023738875, + "grad_norm": 5.2746148109436035, + "learning_rate": 6.517087667161962e-08, + "loss": 0.3131, + "step": 11770 + }, + { + "epoch": 34.95548961424333, + "grad_norm": 7.889867305755615, + "learning_rate": 6.514115898959881e-08, + "loss": 0.0545, + "step": 11780 + }, + { + "epoch": 34.98516320474778, + "grad_norm": 0.9784253239631653, + "learning_rate": 6.511144130757801e-08, + "loss": 0.0318, + "step": 11790 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.9342984409799554, + "eval_loss": 0.21539244055747986, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4255, + "eval_samples_per_second": 139.755, + "eval_steps_per_second": 17.586, + "step": 11795 + }, + { + "epoch": 35.01483679525222, + "grad_norm": 9.25466251373291, + "learning_rate": 6.50817236255572e-08, + "loss": 0.338, + "step": 11800 + }, + { + "epoch": 35.04451038575667, + "grad_norm": 17.401199340820312, + "learning_rate": 6.50520059435364e-08, + "loss": 0.0852, + "step": 11810 + }, + { + "epoch": 35.074183976261125, + "grad_norm": 7.955389022827148, + "learning_rate": 6.50222882615156e-08, + "loss": 0.127, + "step": 11820 + }, + { + "epoch": 35.103857566765576, + "grad_norm": 8.46923828125, + "learning_rate": 6.49925705794948e-08, + "loss": 0.1402, + "step": 11830 + }, + { + "epoch": 35.13353115727003, + "grad_norm": 30.20545196533203, + "learning_rate": 6.4962852897474e-08, + "loss": 0.2422, + "step": 11840 + }, + { + "epoch": 35.16320474777448, + "grad_norm": 0.08352695405483246, + "learning_rate": 6.49331352154532e-08, + "loss": 0.0747, + "step": 11850 + }, + { + "epoch": 35.19287833827893, + "grad_norm": 0.6065623760223389, + "learning_rate": 6.490341753343239e-08, + "loss": 0.0721, + "step": 11860 + }, + { + "epoch": 35.22255192878338, + "grad_norm": 0.2181304693222046, + "learning_rate": 6.487369985141159e-08, + "loss": 0.1896, + "step": 11870 + }, + { + "epoch": 35.25222551928783, + "grad_norm": 21.025768280029297, + "learning_rate": 6.484398216939078e-08, + "loss": 0.1352, + "step": 11880 + }, + { + "epoch": 35.281899109792285, + "grad_norm": 34.33517074584961, + "learning_rate": 6.481426448736999e-08, + "loss": 0.1422, + "step": 11890 + }, + { + "epoch": 35.311572700296736, + "grad_norm": 32.51298141479492, + "learning_rate": 6.478454680534919e-08, + "loss": 0.1901, + "step": 11900 + }, + { + "epoch": 35.34124629080119, + "grad_norm": 0.12153037637472153, + "learning_rate": 6.475482912332838e-08, + "loss": 0.0971, + "step": 11910 + }, + { + "epoch": 35.37091988130564, + "grad_norm": 5.756996154785156, + "learning_rate": 6.472511144130758e-08, + "loss": 0.0669, + "step": 11920 + }, + { + "epoch": 35.40059347181009, + "grad_norm": 11.262227058410645, + "learning_rate": 6.469539375928677e-08, + "loss": 0.1179, + "step": 11930 + }, + { + "epoch": 35.43026706231454, + "grad_norm": 2.530487060546875, + "learning_rate": 6.466567607726597e-08, + "loss": 0.137, + "step": 11940 + }, + { + "epoch": 35.45994065281899, + "grad_norm": 1.3342033624649048, + "learning_rate": 6.463595839524518e-08, + "loss": 0.0403, + "step": 11950 + }, + { + "epoch": 35.489614243323444, + "grad_norm": 0.08854734897613525, + "learning_rate": 6.460624071322437e-08, + "loss": 0.2118, + "step": 11960 + }, + { + "epoch": 35.519287833827896, + "grad_norm": 3.9277520179748535, + "learning_rate": 6.457652303120357e-08, + "loss": 0.1711, + "step": 11970 + }, + { + "epoch": 35.54896142433235, + "grad_norm": 2.9391472339630127, + "learning_rate": 6.454680534918276e-08, + "loss": 0.0936, + "step": 11980 + }, + { + "epoch": 35.5786350148368, + "grad_norm": 1.100213646888733, + "learning_rate": 6.451708766716196e-08, + "loss": 0.0685, + "step": 11990 + }, + { + "epoch": 35.60830860534124, + "grad_norm": 0.3320392966270447, + "learning_rate": 6.448736998514116e-08, + "loss": 0.1258, + "step": 12000 + }, + { + "epoch": 35.637982195845694, + "grad_norm": 0.035895995795726776, + "learning_rate": 6.445765230312036e-08, + "loss": 0.1663, + "step": 12010 + }, + { + "epoch": 35.667655786350146, + "grad_norm": 5.921454906463623, + "learning_rate": 6.442793462109956e-08, + "loss": 0.0697, + "step": 12020 + }, + { + "epoch": 35.6973293768546, + "grad_norm": 19.610403060913086, + "learning_rate": 6.439821693907876e-08, + "loss": 0.3198, + "step": 12030 + }, + { + "epoch": 35.72700296735905, + "grad_norm": 0.6008217334747314, + "learning_rate": 6.436849925705795e-08, + "loss": 0.1298, + "step": 12040 + }, + { + "epoch": 35.7566765578635, + "grad_norm": 0.0337984599173069, + "learning_rate": 6.433878157503715e-08, + "loss": 0.0448, + "step": 12050 + }, + { + "epoch": 35.78635014836795, + "grad_norm": 0.9008129239082336, + "learning_rate": 6.430906389301634e-08, + "loss": 0.1551, + "step": 12060 + }, + { + "epoch": 35.8160237388724, + "grad_norm": 24.367717742919922, + "learning_rate": 6.427934621099555e-08, + "loss": 0.0979, + "step": 12070 + }, + { + "epoch": 35.845697329376854, + "grad_norm": 0.4326264560222626, + "learning_rate": 6.424962852897475e-08, + "loss": 0.1007, + "step": 12080 + }, + { + "epoch": 35.875370919881306, + "grad_norm": 1.6984367370605469, + "learning_rate": 6.421991084695394e-08, + "loss": 0.0464, + "step": 12090 + }, + { + "epoch": 35.90504451038576, + "grad_norm": 9.139761924743652, + "learning_rate": 6.419019316493314e-08, + "loss": 0.2867, + "step": 12100 + }, + { + "epoch": 35.93471810089021, + "grad_norm": 22.040620803833008, + "learning_rate": 6.416047548291233e-08, + "loss": 0.1915, + "step": 12110 + }, + { + "epoch": 35.96439169139466, + "grad_norm": 0.13384293019771576, + "learning_rate": 6.413075780089153e-08, + "loss": 0.0697, + "step": 12120 + }, + { + "epoch": 35.99406528189911, + "grad_norm": 0.649105966091156, + "learning_rate": 6.410104011887074e-08, + "loss": 0.1244, + "step": 12130 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.21495579183101654, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3922, + "eval_samples_per_second": 140.483, + "eval_steps_per_second": 17.678, + "step": 12132 + }, + { + "epoch": 36.02373887240356, + "grad_norm": 1.6163402795791626, + "learning_rate": 6.407132243684993e-08, + "loss": 0.1706, + "step": 12140 + }, + { + "epoch": 36.053412462908014, + "grad_norm": 15.416520118713379, + "learning_rate": 6.404160475482911e-08, + "loss": 0.2097, + "step": 12150 + }, + { + "epoch": 36.083086053412465, + "grad_norm": 19.945707321166992, + "learning_rate": 6.401188707280831e-08, + "loss": 0.2492, + "step": 12160 + }, + { + "epoch": 36.11275964391692, + "grad_norm": 2.2224879264831543, + "learning_rate": 6.39821693907875e-08, + "loss": 0.2526, + "step": 12170 + }, + { + "epoch": 36.14243323442137, + "grad_norm": 4.408667087554932, + "learning_rate": 6.395245170876671e-08, + "loss": 0.0962, + "step": 12180 + }, + { + "epoch": 36.17210682492582, + "grad_norm": 0.7182313203811646, + "learning_rate": 6.392273402674591e-08, + "loss": 0.097, + "step": 12190 + }, + { + "epoch": 36.201780415430264, + "grad_norm": 1.497220754623413, + "learning_rate": 6.38930163447251e-08, + "loss": 0.25, + "step": 12200 + }, + { + "epoch": 36.231454005934715, + "grad_norm": 31.34697723388672, + "learning_rate": 6.38632986627043e-08, + "loss": 0.111, + "step": 12210 + }, + { + "epoch": 36.26112759643917, + "grad_norm": 3.6075263023376465, + "learning_rate": 6.38335809806835e-08, + "loss": 0.087, + "step": 12220 + }, + { + "epoch": 36.29080118694362, + "grad_norm": 30.367908477783203, + "learning_rate": 6.380386329866269e-08, + "loss": 0.2929, + "step": 12230 + }, + { + "epoch": 36.32047477744807, + "grad_norm": 5.103381156921387, + "learning_rate": 6.37741456166419e-08, + "loss": 0.0966, + "step": 12240 + }, + { + "epoch": 36.35014836795252, + "grad_norm": 0.9199526309967041, + "learning_rate": 6.37444279346211e-08, + "loss": 0.0275, + "step": 12250 + }, + { + "epoch": 36.37982195845697, + "grad_norm": 25.720836639404297, + "learning_rate": 6.371471025260029e-08, + "loss": 0.22, + "step": 12260 + }, + { + "epoch": 36.409495548961424, + "grad_norm": 0.05689868703484535, + "learning_rate": 6.368499257057949e-08, + "loss": 0.1554, + "step": 12270 + }, + { + "epoch": 36.439169139465875, + "grad_norm": 0.6062152981758118, + "learning_rate": 6.365527488855868e-08, + "loss": 0.1042, + "step": 12280 + }, + { + "epoch": 36.468842729970326, + "grad_norm": 0.11303484439849854, + "learning_rate": 6.362555720653788e-08, + "loss": 0.0382, + "step": 12290 + }, + { + "epoch": 36.49851632047478, + "grad_norm": 0.1379605084657669, + "learning_rate": 6.359583952451709e-08, + "loss": 0.0988, + "step": 12300 + }, + { + "epoch": 36.52818991097923, + "grad_norm": 0.2162492275238037, + "learning_rate": 6.356612184249628e-08, + "loss": 0.0409, + "step": 12310 + }, + { + "epoch": 36.55786350148368, + "grad_norm": 17.757644653320312, + "learning_rate": 6.353640416047548e-08, + "loss": 0.1823, + "step": 12320 + }, + { + "epoch": 36.58753709198813, + "grad_norm": 0.8914327621459961, + "learning_rate": 6.350668647845467e-08, + "loss": 0.3189, + "step": 12330 + }, + { + "epoch": 36.61721068249258, + "grad_norm": 10.96381950378418, + "learning_rate": 6.347696879643387e-08, + "loss": 0.0625, + "step": 12340 + }, + { + "epoch": 36.646884272997035, + "grad_norm": 18.950212478637695, + "learning_rate": 6.344725111441306e-08, + "loss": 0.1771, + "step": 12350 + }, + { + "epoch": 36.676557863501486, + "grad_norm": 12.005958557128906, + "learning_rate": 6.341753343239227e-08, + "loss": 0.0714, + "step": 12360 + }, + { + "epoch": 36.70623145400594, + "grad_norm": 0.5899513363838196, + "learning_rate": 6.338781575037147e-08, + "loss": 0.213, + "step": 12370 + }, + { + "epoch": 36.73590504451039, + "grad_norm": 1.4470969438552856, + "learning_rate": 6.335809806835066e-08, + "loss": 0.0794, + "step": 12380 + }, + { + "epoch": 36.76557863501483, + "grad_norm": 1.377669334411621, + "learning_rate": 6.332838038632986e-08, + "loss": 0.0823, + "step": 12390 + }, + { + "epoch": 36.795252225519285, + "grad_norm": 5.422667026519775, + "learning_rate": 6.329866270430906e-08, + "loss": 0.1833, + "step": 12400 + }, + { + "epoch": 36.824925816023736, + "grad_norm": 5.177680015563965, + "learning_rate": 6.326894502228825e-08, + "loss": 0.1044, + "step": 12410 + }, + { + "epoch": 36.85459940652819, + "grad_norm": 0.6981093287467957, + "learning_rate": 6.323922734026746e-08, + "loss": 0.0599, + "step": 12420 + }, + { + "epoch": 36.88427299703264, + "grad_norm": 4.665019512176514, + "learning_rate": 6.320950965824665e-08, + "loss": 0.078, + "step": 12430 + }, + { + "epoch": 36.91394658753709, + "grad_norm": 0.12102646380662918, + "learning_rate": 6.317979197622585e-08, + "loss": 0.0901, + "step": 12440 + }, + { + "epoch": 36.94362017804154, + "grad_norm": 6.363934516906738, + "learning_rate": 6.315007429420505e-08, + "loss": 0.237, + "step": 12450 + }, + { + "epoch": 36.97329376854599, + "grad_norm": 0.1967965066432953, + "learning_rate": 6.312035661218424e-08, + "loss": 0.2186, + "step": 12460 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.21330556273460388, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3914, + "eval_samples_per_second": 140.501, + "eval_steps_per_second": 17.68, + "step": 12469 + }, + { + "epoch": 37.002967359050444, + "grad_norm": 25.94045066833496, + "learning_rate": 6.309063893016344e-08, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 37.032640949554896, + "grad_norm": 0.5640539526939392, + "learning_rate": 6.306092124814265e-08, + "loss": 0.0479, + "step": 12480 + }, + { + "epoch": 37.06231454005935, + "grad_norm": 14.364142417907715, + "learning_rate": 6.303120356612184e-08, + "loss": 0.0314, + "step": 12490 + }, + { + "epoch": 37.0919881305638, + "grad_norm": 26.387121200561523, + "learning_rate": 6.300148588410104e-08, + "loss": 0.1699, + "step": 12500 + }, + { + "epoch": 37.12166172106825, + "grad_norm": 12.893073081970215, + "learning_rate": 6.297176820208023e-08, + "loss": 0.1505, + "step": 12510 + }, + { + "epoch": 37.1513353115727, + "grad_norm": 28.494434356689453, + "learning_rate": 6.294205052005943e-08, + "loss": 0.2861, + "step": 12520 + }, + { + "epoch": 37.18100890207715, + "grad_norm": 0.058965012431144714, + "learning_rate": 6.291233283803862e-08, + "loss": 0.1924, + "step": 12530 + }, + { + "epoch": 37.210682492581604, + "grad_norm": 0.2849510908126831, + "learning_rate": 6.288261515601783e-08, + "loss": 0.0713, + "step": 12540 + }, + { + "epoch": 37.240356083086056, + "grad_norm": 3.319641590118408, + "learning_rate": 6.285289747399703e-08, + "loss": 0.081, + "step": 12550 + }, + { + "epoch": 37.27002967359051, + "grad_norm": 8.695155143737793, + "learning_rate": 6.282317979197622e-08, + "loss": 0.192, + "step": 12560 + }, + { + "epoch": 37.29970326409496, + "grad_norm": 13.08652114868164, + "learning_rate": 6.279346210995542e-08, + "loss": 0.1585, + "step": 12570 + }, + { + "epoch": 37.32937685459941, + "grad_norm": 0.8089465498924255, + "learning_rate": 6.276374442793461e-08, + "loss": 0.1223, + "step": 12580 + }, + { + "epoch": 37.359050445103854, + "grad_norm": 22.945438385009766, + "learning_rate": 6.273402674591381e-08, + "loss": 0.107, + "step": 12590 + }, + { + "epoch": 37.388724035608305, + "grad_norm": 0.2552558481693268, + "learning_rate": 6.270430906389302e-08, + "loss": 0.15, + "step": 12600 + }, + { + "epoch": 37.41839762611276, + "grad_norm": 1.4434412717819214, + "learning_rate": 6.267459138187221e-08, + "loss": 0.0554, + "step": 12610 + }, + { + "epoch": 37.44807121661721, + "grad_norm": 25.277193069458008, + "learning_rate": 6.264487369985141e-08, + "loss": 0.0332, + "step": 12620 + }, + { + "epoch": 37.47774480712166, + "grad_norm": 7.935237407684326, + "learning_rate": 6.26151560178306e-08, + "loss": 0.1073, + "step": 12630 + }, + { + "epoch": 37.50741839762611, + "grad_norm": 0.38012707233428955, + "learning_rate": 6.25854383358098e-08, + "loss": 0.0769, + "step": 12640 + }, + { + "epoch": 37.53709198813056, + "grad_norm": 16.772241592407227, + "learning_rate": 6.2555720653789e-08, + "loss": 0.1382, + "step": 12650 + }, + { + "epoch": 37.566765578635014, + "grad_norm": 19.272356033325195, + "learning_rate": 6.25260029717682e-08, + "loss": 0.0366, + "step": 12660 + }, + { + "epoch": 37.596439169139465, + "grad_norm": 0.1118217259645462, + "learning_rate": 6.24962852897474e-08, + "loss": 0.1741, + "step": 12670 + }, + { + "epoch": 37.62611275964392, + "grad_norm": 21.22469139099121, + "learning_rate": 6.24665676077266e-08, + "loss": 0.0923, + "step": 12680 + }, + { + "epoch": 37.65578635014837, + "grad_norm": 0.32484909892082214, + "learning_rate": 6.243684992570579e-08, + "loss": 0.0686, + "step": 12690 + }, + { + "epoch": 37.68545994065282, + "grad_norm": 0.19637347757816315, + "learning_rate": 6.240713224368499e-08, + "loss": 0.1893, + "step": 12700 + }, + { + "epoch": 37.71513353115727, + "grad_norm": 0.11365742981433868, + "learning_rate": 6.237741456166418e-08, + "loss": 0.1134, + "step": 12710 + }, + { + "epoch": 37.74480712166172, + "grad_norm": 26.72102928161621, + "learning_rate": 6.234769687964339e-08, + "loss": 0.1606, + "step": 12720 + }, + { + "epoch": 37.774480712166174, + "grad_norm": 0.22417041659355164, + "learning_rate": 6.231797919762259e-08, + "loss": 0.0473, + "step": 12730 + }, + { + "epoch": 37.804154302670625, + "grad_norm": 3.707141637802124, + "learning_rate": 6.228826151560178e-08, + "loss": 0.0897, + "step": 12740 + }, + { + "epoch": 37.833827893175076, + "grad_norm": 0.3214769959449768, + "learning_rate": 6.225854383358098e-08, + "loss": 0.2802, + "step": 12750 + }, + { + "epoch": 37.86350148367953, + "grad_norm": 2.4424564838409424, + "learning_rate": 6.222882615156017e-08, + "loss": 0.2048, + "step": 12760 + }, + { + "epoch": 37.89317507418398, + "grad_norm": 0.21107935905456543, + "learning_rate": 6.219910846953937e-08, + "loss": 0.141, + "step": 12770 + }, + { + "epoch": 37.92284866468843, + "grad_norm": 0.10310006141662598, + "learning_rate": 6.216939078751858e-08, + "loss": 0.1143, + "step": 12780 + }, + { + "epoch": 37.952522255192875, + "grad_norm": 0.6239740252494812, + "learning_rate": 6.213967310549777e-08, + "loss": 0.1378, + "step": 12790 + }, + { + "epoch": 37.982195845697326, + "grad_norm": 0.13136813044548035, + "learning_rate": 6.210995542347697e-08, + "loss": 0.0558, + "step": 12800 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.21442252397537231, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4042, + "eval_samples_per_second": 140.221, + "eval_steps_per_second": 17.645, + "step": 12806 + }, + { + "epoch": 38.01186943620178, + "grad_norm": 0.12485942244529724, + "learning_rate": 6.208023774145616e-08, + "loss": 0.0306, + "step": 12810 + }, + { + "epoch": 38.04154302670623, + "grad_norm": 8.10720443725586, + "learning_rate": 6.205052005943536e-08, + "loss": 0.1007, + "step": 12820 + }, + { + "epoch": 38.07121661721068, + "grad_norm": 0.04081455618143082, + "learning_rate": 6.202080237741455e-08, + "loss": 0.1239, + "step": 12830 + }, + { + "epoch": 38.10089020771513, + "grad_norm": 0.23625168204307556, + "learning_rate": 6.199108469539376e-08, + "loss": 0.1687, + "step": 12840 + }, + { + "epoch": 38.13056379821958, + "grad_norm": 8.466800689697266, + "learning_rate": 6.196136701337296e-08, + "loss": 0.2164, + "step": 12850 + }, + { + "epoch": 38.160237388724035, + "grad_norm": 29.431215286254883, + "learning_rate": 6.193164933135215e-08, + "loss": 0.1339, + "step": 12860 + }, + { + "epoch": 38.189910979228486, + "grad_norm": 0.21640785038471222, + "learning_rate": 6.190193164933135e-08, + "loss": 0.0771, + "step": 12870 + }, + { + "epoch": 38.21958456973294, + "grad_norm": 0.16919468343257904, + "learning_rate": 6.187221396731055e-08, + "loss": 0.1919, + "step": 12880 + }, + { + "epoch": 38.24925816023739, + "grad_norm": 0.039273764938116074, + "learning_rate": 6.184249628528974e-08, + "loss": 0.0691, + "step": 12890 + }, + { + "epoch": 38.27893175074184, + "grad_norm": 0.04706452786922455, + "learning_rate": 6.181277860326895e-08, + "loss": 0.1735, + "step": 12900 + }, + { + "epoch": 38.30860534124629, + "grad_norm": 0.24082796275615692, + "learning_rate": 6.178306092124815e-08, + "loss": 0.1926, + "step": 12910 + }, + { + "epoch": 38.33827893175074, + "grad_norm": 0.903948962688446, + "learning_rate": 6.175334323922734e-08, + "loss": 0.0399, + "step": 12920 + }, + { + "epoch": 38.367952522255194, + "grad_norm": 10.357869148254395, + "learning_rate": 6.172362555720654e-08, + "loss": 0.0479, + "step": 12930 + }, + { + "epoch": 38.397626112759646, + "grad_norm": 0.38585174083709717, + "learning_rate": 6.169390787518573e-08, + "loss": 0.0887, + "step": 12940 + }, + { + "epoch": 38.4272997032641, + "grad_norm": 1.0532981157302856, + "learning_rate": 6.166419019316493e-08, + "loss": 0.0979, + "step": 12950 + }, + { + "epoch": 38.45697329376855, + "grad_norm": 9.199470520019531, + "learning_rate": 6.163447251114414e-08, + "loss": 0.0939, + "step": 12960 + }, + { + "epoch": 38.486646884273, + "grad_norm": 1.1750718355178833, + "learning_rate": 6.160475482912333e-08, + "loss": 0.0479, + "step": 12970 + }, + { + "epoch": 38.51632047477745, + "grad_norm": 0.33509141206741333, + "learning_rate": 6.157503714710253e-08, + "loss": 0.1741, + "step": 12980 + }, + { + "epoch": 38.545994065281896, + "grad_norm": 12.233824729919434, + "learning_rate": 6.154531946508172e-08, + "loss": 0.1886, + "step": 12990 + }, + { + "epoch": 38.57566765578635, + "grad_norm": 22.89804458618164, + "learning_rate": 6.151560178306092e-08, + "loss": 0.1397, + "step": 13000 + }, + { + "epoch": 38.6053412462908, + "grad_norm": 0.32731854915618896, + "learning_rate": 6.148588410104011e-08, + "loss": 0.1696, + "step": 13010 + }, + { + "epoch": 38.63501483679525, + "grad_norm": 0.017913781106472015, + "learning_rate": 6.145616641901932e-08, + "loss": 0.1133, + "step": 13020 + }, + { + "epoch": 38.6646884272997, + "grad_norm": 0.1449405401945114, + "learning_rate": 6.142644873699852e-08, + "loss": 0.2055, + "step": 13030 + }, + { + "epoch": 38.69436201780415, + "grad_norm": 0.0963745191693306, + "learning_rate": 6.139673105497771e-08, + "loss": 0.0846, + "step": 13040 + }, + { + "epoch": 38.724035608308604, + "grad_norm": 0.1028711125254631, + "learning_rate": 6.136701337295691e-08, + "loss": 0.2542, + "step": 13050 + }, + { + "epoch": 38.753709198813056, + "grad_norm": 14.612128257751465, + "learning_rate": 6.13372956909361e-08, + "loss": 0.2817, + "step": 13060 + }, + { + "epoch": 38.78338278931751, + "grad_norm": 20.479768753051758, + "learning_rate": 6.13075780089153e-08, + "loss": 0.1594, + "step": 13070 + }, + { + "epoch": 38.81305637982196, + "grad_norm": 2.4801437854766846, + "learning_rate": 6.127786032689451e-08, + "loss": 0.2349, + "step": 13080 + }, + { + "epoch": 38.84272997032641, + "grad_norm": 0.7339556813240051, + "learning_rate": 6.12481426448737e-08, + "loss": 0.0847, + "step": 13090 + }, + { + "epoch": 38.87240356083086, + "grad_norm": 0.027926743030548096, + "learning_rate": 6.12184249628529e-08, + "loss": 0.1254, + "step": 13100 + }, + { + "epoch": 38.90207715133531, + "grad_norm": 1.1706894636154175, + "learning_rate": 6.11887072808321e-08, + "loss": 0.105, + "step": 13110 + }, + { + "epoch": 38.931750741839764, + "grad_norm": 17.08843994140625, + "learning_rate": 6.115898959881129e-08, + "loss": 0.2058, + "step": 13120 + }, + { + "epoch": 38.961424332344215, + "grad_norm": 0.36468303203582764, + "learning_rate": 6.112927191679049e-08, + "loss": 0.1327, + "step": 13130 + }, + { + "epoch": 38.99109792284867, + "grad_norm": 22.28550910949707, + "learning_rate": 6.10995542347697e-08, + "loss": 0.1598, + "step": 13140 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.9398663697104677, + "eval_loss": 0.2147417962551117, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3891, + "eval_samples_per_second": 140.552, + "eval_steps_per_second": 17.686, + "step": 13143 + }, + { + "epoch": 39.02077151335312, + "grad_norm": 5.662286281585693, + "learning_rate": 6.106983655274889e-08, + "loss": 0.0606, + "step": 13150 + }, + { + "epoch": 39.05044510385757, + "grad_norm": 0.05046107992529869, + "learning_rate": 6.104011887072809e-08, + "loss": 0.0988, + "step": 13160 + }, + { + "epoch": 39.08011869436202, + "grad_norm": 0.25559741258621216, + "learning_rate": 6.101040118870728e-08, + "loss": 0.2696, + "step": 13170 + }, + { + "epoch": 39.10979228486647, + "grad_norm": 9.109511375427246, + "learning_rate": 6.098068350668648e-08, + "loss": 0.0197, + "step": 13180 + }, + { + "epoch": 39.13946587537092, + "grad_norm": 12.258157730102539, + "learning_rate": 6.095096582466567e-08, + "loss": 0.1078, + "step": 13190 + }, + { + "epoch": 39.16913946587537, + "grad_norm": 28.300128936767578, + "learning_rate": 6.092124814264487e-08, + "loss": 0.1205, + "step": 13200 + }, + { + "epoch": 39.19881305637982, + "grad_norm": 1.8155003786087036, + "learning_rate": 6.089153046062406e-08, + "loss": 0.0575, + "step": 13210 + }, + { + "epoch": 39.22848664688427, + "grad_norm": 1.0335309505462646, + "learning_rate": 6.086181277860326e-08, + "loss": 0.1906, + "step": 13220 + }, + { + "epoch": 39.25816023738872, + "grad_norm": 3.9303767681121826, + "learning_rate": 6.083209509658245e-08, + "loss": 0.0258, + "step": 13230 + }, + { + "epoch": 39.287833827893174, + "grad_norm": 2.9609358310699463, + "learning_rate": 6.080237741456166e-08, + "loss": 0.1645, + "step": 13240 + }, + { + "epoch": 39.317507418397625, + "grad_norm": 48.448463439941406, + "learning_rate": 6.077265973254086e-08, + "loss": 0.2804, + "step": 13250 + }, + { + "epoch": 39.347181008902076, + "grad_norm": 3.402069568634033, + "learning_rate": 6.074294205052005e-08, + "loss": 0.023, + "step": 13260 + }, + { + "epoch": 39.37685459940653, + "grad_norm": 12.537109375, + "learning_rate": 6.071322436849925e-08, + "loss": 0.132, + "step": 13270 + }, + { + "epoch": 39.40652818991098, + "grad_norm": 3.228895425796509, + "learning_rate": 6.068350668647845e-08, + "loss": 0.1172, + "step": 13280 + }, + { + "epoch": 39.43620178041543, + "grad_norm": 0.057582613080739975, + "learning_rate": 6.065378900445764e-08, + "loss": 0.1602, + "step": 13290 + }, + { + "epoch": 39.46587537091988, + "grad_norm": 0.1342124044895172, + "learning_rate": 6.062407132243685e-08, + "loss": 0.0808, + "step": 13300 + }, + { + "epoch": 39.49554896142433, + "grad_norm": 29.790531158447266, + "learning_rate": 6.059435364041604e-08, + "loss": 0.2845, + "step": 13310 + }, + { + "epoch": 39.525222551928785, + "grad_norm": 8.546236038208008, + "learning_rate": 6.056463595839524e-08, + "loss": 0.1547, + "step": 13320 + }, + { + "epoch": 39.554896142433236, + "grad_norm": 4.283113956451416, + "learning_rate": 6.053491827637444e-08, + "loss": 0.1797, + "step": 13330 + }, + { + "epoch": 39.58456973293769, + "grad_norm": 0.16012486815452576, + "learning_rate": 6.050520059435363e-08, + "loss": 0.1003, + "step": 13340 + }, + { + "epoch": 39.61424332344214, + "grad_norm": 36.8537712097168, + "learning_rate": 6.047548291233283e-08, + "loss": 0.2052, + "step": 13350 + }, + { + "epoch": 39.64391691394659, + "grad_norm": 0.18168111145496368, + "learning_rate": 6.044576523031204e-08, + "loss": 0.1747, + "step": 13360 + }, + { + "epoch": 39.67359050445104, + "grad_norm": 0.3722153902053833, + "learning_rate": 6.041604754829123e-08, + "loss": 0.0484, + "step": 13370 + }, + { + "epoch": 39.703264094955486, + "grad_norm": 0.08642571419477463, + "learning_rate": 6.038632986627043e-08, + "loss": 0.1004, + "step": 13380 + }, + { + "epoch": 39.73293768545994, + "grad_norm": 11.248104095458984, + "learning_rate": 6.035661218424962e-08, + "loss": 0.067, + "step": 13390 + }, + { + "epoch": 39.76261127596439, + "grad_norm": 4.771637439727783, + "learning_rate": 6.032689450222882e-08, + "loss": 0.1252, + "step": 13400 + }, + { + "epoch": 39.79228486646884, + "grad_norm": 3.365781784057617, + "learning_rate": 6.029717682020801e-08, + "loss": 0.1009, + "step": 13410 + }, + { + "epoch": 39.82195845697329, + "grad_norm": 0.6600086688995361, + "learning_rate": 6.026745913818722e-08, + "loss": 0.1886, + "step": 13420 + }, + { + "epoch": 39.85163204747774, + "grad_norm": 0.09436894953250885, + "learning_rate": 6.023774145616642e-08, + "loss": 0.1085, + "step": 13430 + }, + { + "epoch": 39.881305637982194, + "grad_norm": 1.3392868041992188, + "learning_rate": 6.020802377414561e-08, + "loss": 0.0666, + "step": 13440 + }, + { + "epoch": 39.910979228486646, + "grad_norm": 0.24763616919517517, + "learning_rate": 6.017830609212481e-08, + "loss": 0.0978, + "step": 13450 + }, + { + "epoch": 39.9406528189911, + "grad_norm": 1.9114867448806763, + "learning_rate": 6.0148588410104e-08, + "loss": 0.2681, + "step": 13460 + }, + { + "epoch": 39.97032640949555, + "grad_norm": 11.607654571533203, + "learning_rate": 6.01188707280832e-08, + "loss": 0.1651, + "step": 13470 + }, + { + "epoch": 40.0, + "grad_norm": 0.06624920666217804, + "learning_rate": 6.008915304606241e-08, + "loss": 0.1114, + "step": 13480 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.9398663697104677, + "eval_loss": 0.21430984139442444, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3763, + "eval_samples_per_second": 140.835, + "eval_steps_per_second": 17.722, + "step": 13480 + }, + { + "epoch": 40.02967359050445, + "grad_norm": 15.755172729492188, + "learning_rate": 6.00594353640416e-08, + "loss": 0.0649, + "step": 13490 + }, + { + "epoch": 40.0593471810089, + "grad_norm": 0.30547386407852173, + "learning_rate": 6.00297176820208e-08, + "loss": 0.0869, + "step": 13500 + }, + { + "epoch": 40.089020771513354, + "grad_norm": 0.6331618428230286, + "learning_rate": 6e-08, + "loss": 0.0981, + "step": 13510 + }, + { + "epoch": 40.118694362017806, + "grad_norm": 18.424678802490234, + "learning_rate": 5.997028231797919e-08, + "loss": 0.179, + "step": 13520 + }, + { + "epoch": 40.14836795252226, + "grad_norm": 41.89717483520508, + "learning_rate": 5.994056463595839e-08, + "loss": 0.1991, + "step": 13530 + }, + { + "epoch": 40.17804154302671, + "grad_norm": 16.253318786621094, + "learning_rate": 5.99108469539376e-08, + "loss": 0.0772, + "step": 13540 + }, + { + "epoch": 40.20771513353116, + "grad_norm": 21.82659339904785, + "learning_rate": 5.988112927191679e-08, + "loss": 0.0932, + "step": 13550 + }, + { + "epoch": 40.23738872403561, + "grad_norm": 0.3248034119606018, + "learning_rate": 5.985141158989599e-08, + "loss": 0.2369, + "step": 13560 + }, + { + "epoch": 40.26706231454006, + "grad_norm": 1.805308222770691, + "learning_rate": 5.982169390787518e-08, + "loss": 0.0814, + "step": 13570 + }, + { + "epoch": 40.29673590504451, + "grad_norm": 0.45236143469810486, + "learning_rate": 5.979197622585438e-08, + "loss": 0.1108, + "step": 13580 + }, + { + "epoch": 40.32640949554896, + "grad_norm": 0.5702680349349976, + "learning_rate": 5.976225854383357e-08, + "loss": 0.0499, + "step": 13590 + }, + { + "epoch": 40.35608308605341, + "grad_norm": 9.56475830078125, + "learning_rate": 5.973254086181278e-08, + "loss": 0.0674, + "step": 13600 + }, + { + "epoch": 40.38575667655786, + "grad_norm": 42.59004211425781, + "learning_rate": 5.970282317979198e-08, + "loss": 0.1887, + "step": 13610 + }, + { + "epoch": 40.41543026706231, + "grad_norm": 17.11397361755371, + "learning_rate": 5.967310549777117e-08, + "loss": 0.1334, + "step": 13620 + }, + { + "epoch": 40.445103857566764, + "grad_norm": 18.96884536743164, + "learning_rate": 5.964338781575037e-08, + "loss": 0.0782, + "step": 13630 + }, + { + "epoch": 40.474777448071215, + "grad_norm": 26.928346633911133, + "learning_rate": 5.961367013372956e-08, + "loss": 0.0953, + "step": 13640 + }, + { + "epoch": 40.50445103857567, + "grad_norm": 12.994820594787598, + "learning_rate": 5.9583952451708765e-08, + "loss": 0.2121, + "step": 13650 + }, + { + "epoch": 40.53412462908012, + "grad_norm": 0.5416554808616638, + "learning_rate": 5.955423476968796e-08, + "loss": 0.0823, + "step": 13660 + }, + { + "epoch": 40.56379821958457, + "grad_norm": 25.92374038696289, + "learning_rate": 5.9524517087667156e-08, + "loss": 0.1976, + "step": 13670 + }, + { + "epoch": 40.59347181008902, + "grad_norm": 0.20100322365760803, + "learning_rate": 5.949479940564636e-08, + "loss": 0.1145, + "step": 13680 + }, + { + "epoch": 40.62314540059347, + "grad_norm": 0.9218740463256836, + "learning_rate": 5.9465081723625553e-08, + "loss": 0.1578, + "step": 13690 + }, + { + "epoch": 40.652818991097924, + "grad_norm": 20.612337112426758, + "learning_rate": 5.943536404160475e-08, + "loss": 0.1882, + "step": 13700 + }, + { + "epoch": 40.682492581602375, + "grad_norm": 3.4071264266967773, + "learning_rate": 5.940564635958395e-08, + "loss": 0.1216, + "step": 13710 + }, + { + "epoch": 40.712166172106826, + "grad_norm": 1.6182498931884766, + "learning_rate": 5.9375928677563147e-08, + "loss": 0.1637, + "step": 13720 + }, + { + "epoch": 40.74183976261128, + "grad_norm": 0.4618373215198517, + "learning_rate": 5.934621099554234e-08, + "loss": 0.2095, + "step": 13730 + }, + { + "epoch": 40.77151335311573, + "grad_norm": 13.857564926147461, + "learning_rate": 5.9316493313521544e-08, + "loss": 0.0755, + "step": 13740 + }, + { + "epoch": 40.80118694362018, + "grad_norm": 0.21656440198421478, + "learning_rate": 5.928677563150074e-08, + "loss": 0.0123, + "step": 13750 + }, + { + "epoch": 40.83086053412463, + "grad_norm": 0.09302521497011185, + "learning_rate": 5.9257057949479935e-08, + "loss": 0.0433, + "step": 13760 + }, + { + "epoch": 40.86053412462908, + "grad_norm": 4.791576862335205, + "learning_rate": 5.922734026745914e-08, + "loss": 0.1798, + "step": 13770 + }, + { + "epoch": 40.89020771513353, + "grad_norm": 32.16770553588867, + "learning_rate": 5.919762258543833e-08, + "loss": 0.0489, + "step": 13780 + }, + { + "epoch": 40.91988130563798, + "grad_norm": 25.5787410736084, + "learning_rate": 5.916790490341753e-08, + "loss": 0.0984, + "step": 13790 + }, + { + "epoch": 40.94955489614243, + "grad_norm": 27.453475952148438, + "learning_rate": 5.913818722139673e-08, + "loss": 0.2755, + "step": 13800 + }, + { + "epoch": 40.97922848664688, + "grad_norm": 0.08432696014642715, + "learning_rate": 5.9108469539375926e-08, + "loss": 0.1611, + "step": 13810 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.216311514377594, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3922, + "eval_samples_per_second": 140.484, + "eval_steps_per_second": 17.678, + "step": 13817 + }, + { + "epoch": 41.00890207715133, + "grad_norm": 0.22628122568130493, + "learning_rate": 5.907875185735512e-08, + "loss": 0.1954, + "step": 13820 + }, + { + "epoch": 41.038575667655785, + "grad_norm": 35.84254837036133, + "learning_rate": 5.9049034175334324e-08, + "loss": 0.2745, + "step": 13830 + }, + { + "epoch": 41.068249258160236, + "grad_norm": 8.233789443969727, + "learning_rate": 5.901931649331352e-08, + "loss": 0.1309, + "step": 13840 + }, + { + "epoch": 41.09792284866469, + "grad_norm": 0.9112330079078674, + "learning_rate": 5.8989598811292715e-08, + "loss": 0.0766, + "step": 13850 + }, + { + "epoch": 41.12759643916914, + "grad_norm": 0.5012003183364868, + "learning_rate": 5.895988112927192e-08, + "loss": 0.1922, + "step": 13860 + }, + { + "epoch": 41.15727002967359, + "grad_norm": 0.22161629796028137, + "learning_rate": 5.893016344725111e-08, + "loss": 0.0371, + "step": 13870 + }, + { + "epoch": 41.18694362017804, + "grad_norm": 0.16659773886203766, + "learning_rate": 5.890044576523031e-08, + "loss": 0.1351, + "step": 13880 + }, + { + "epoch": 41.21661721068249, + "grad_norm": 10.003440856933594, + "learning_rate": 5.887072808320951e-08, + "loss": 0.1631, + "step": 13890 + }, + { + "epoch": 41.246290801186944, + "grad_norm": 0.1723717898130417, + "learning_rate": 5.8841010401188705e-08, + "loss": 0.0304, + "step": 13900 + }, + { + "epoch": 41.275964391691396, + "grad_norm": 2.9914841651916504, + "learning_rate": 5.88112927191679e-08, + "loss": 0.0988, + "step": 13910 + }, + { + "epoch": 41.30563798219585, + "grad_norm": 14.285638809204102, + "learning_rate": 5.87815750371471e-08, + "loss": 0.1115, + "step": 13920 + }, + { + "epoch": 41.3353115727003, + "grad_norm": 1.5906716585159302, + "learning_rate": 5.87518573551263e-08, + "loss": 0.333, + "step": 13930 + }, + { + "epoch": 41.36498516320475, + "grad_norm": 2.6748101711273193, + "learning_rate": 5.87221396731055e-08, + "loss": 0.1103, + "step": 13940 + }, + { + "epoch": 41.3946587537092, + "grad_norm": 0.26675471663475037, + "learning_rate": 5.8692421991084696e-08, + "loss": 0.2556, + "step": 13950 + }, + { + "epoch": 41.42433234421365, + "grad_norm": 0.46109727025032043, + "learning_rate": 5.866270430906389e-08, + "loss": 0.0222, + "step": 13960 + }, + { + "epoch": 41.454005934718104, + "grad_norm": 13.879071235656738, + "learning_rate": 5.8632986627043094e-08, + "loss": 0.3145, + "step": 13970 + }, + { + "epoch": 41.48367952522255, + "grad_norm": 1.0346206426620483, + "learning_rate": 5.860326894502229e-08, + "loss": 0.0964, + "step": 13980 + }, + { + "epoch": 41.513353115727, + "grad_norm": 24.201801300048828, + "learning_rate": 5.8573551263001485e-08, + "loss": 0.1004, + "step": 13990 + }, + { + "epoch": 41.54302670623145, + "grad_norm": 0.058345451951026917, + "learning_rate": 5.854383358098069e-08, + "loss": 0.1001, + "step": 14000 + }, + { + "epoch": 41.5727002967359, + "grad_norm": 0.18827471137046814, + "learning_rate": 5.851411589895988e-08, + "loss": 0.1733, + "step": 14010 + }, + { + "epoch": 41.602373887240354, + "grad_norm": 0.4404458701610565, + "learning_rate": 5.848439821693908e-08, + "loss": 0.0787, + "step": 14020 + }, + { + "epoch": 41.632047477744806, + "grad_norm": 1.5757932662963867, + "learning_rate": 5.845468053491828e-08, + "loss": 0.1517, + "step": 14030 + }, + { + "epoch": 41.66172106824926, + "grad_norm": 0.27554380893707275, + "learning_rate": 5.8424962852897476e-08, + "loss": 0.125, + "step": 14040 + }, + { + "epoch": 41.69139465875371, + "grad_norm": 10.882274627685547, + "learning_rate": 5.839524517087667e-08, + "loss": 0.1367, + "step": 14050 + }, + { + "epoch": 41.72106824925816, + "grad_norm": 0.18490727245807648, + "learning_rate": 5.836552748885587e-08, + "loss": 0.0577, + "step": 14060 + }, + { + "epoch": 41.75074183976261, + "grad_norm": 1.4265599250793457, + "learning_rate": 5.833580980683507e-08, + "loss": 0.1626, + "step": 14070 + }, + { + "epoch": 41.78041543026706, + "grad_norm": 3.4994375705718994, + "learning_rate": 5.8306092124814264e-08, + "loss": 0.1641, + "step": 14080 + }, + { + "epoch": 41.810089020771514, + "grad_norm": 0.13201963901519775, + "learning_rate": 5.8276374442793466e-08, + "loss": 0.1478, + "step": 14090 + }, + { + "epoch": 41.839762611275965, + "grad_norm": 30.411331176757812, + "learning_rate": 5.824665676077266e-08, + "loss": 0.0764, + "step": 14100 + }, + { + "epoch": 41.86943620178042, + "grad_norm": 0.3485267162322998, + "learning_rate": 5.821693907875186e-08, + "loss": 0.0654, + "step": 14110 + }, + { + "epoch": 41.89910979228487, + "grad_norm": 0.6999891400337219, + "learning_rate": 5.818722139673106e-08, + "loss": 0.0968, + "step": 14120 + }, + { + "epoch": 41.92878338278932, + "grad_norm": 0.6748274564743042, + "learning_rate": 5.8157503714710255e-08, + "loss": 0.1827, + "step": 14130 + }, + { + "epoch": 41.95845697329377, + "grad_norm": 23.37346649169922, + "learning_rate": 5.812778603268945e-08, + "loss": 0.0885, + "step": 14140 + }, + { + "epoch": 41.98813056379822, + "grad_norm": 13.14158821105957, + "learning_rate": 5.809806835066865e-08, + "loss": 0.1406, + "step": 14150 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21634596586227417, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3773, + "eval_samples_per_second": 140.812, + "eval_steps_per_second": 17.719, + "step": 14154 + }, + { + "epoch": 42.017804154302674, + "grad_norm": 3.2302095890045166, + "learning_rate": 5.806835066864785e-08, + "loss": 0.1363, + "step": 14160 + }, + { + "epoch": 42.047477744807125, + "grad_norm": 30.471839904785156, + "learning_rate": 5.8038632986627044e-08, + "loss": 0.0878, + "step": 14170 + }, + { + "epoch": 42.07715133531157, + "grad_norm": 14.864005088806152, + "learning_rate": 5.8008915304606246e-08, + "loss": 0.1479, + "step": 14180 + }, + { + "epoch": 42.10682492581602, + "grad_norm": 1.919960379600525, + "learning_rate": 5.797919762258544e-08, + "loss": 0.1329, + "step": 14190 + }, + { + "epoch": 42.13649851632047, + "grad_norm": 0.5917404890060425, + "learning_rate": 5.794947994056464e-08, + "loss": 0.1855, + "step": 14200 + }, + { + "epoch": 42.166172106824924, + "grad_norm": 9.250659942626953, + "learning_rate": 5.791976225854384e-08, + "loss": 0.3017, + "step": 14210 + }, + { + "epoch": 42.195845697329375, + "grad_norm": 0.04010183736681938, + "learning_rate": 5.7890044576523035e-08, + "loss": 0.0572, + "step": 14220 + }, + { + "epoch": 42.225519287833826, + "grad_norm": 0.5380052328109741, + "learning_rate": 5.786032689450223e-08, + "loss": 0.1351, + "step": 14230 + }, + { + "epoch": 42.25519287833828, + "grad_norm": 24.217151641845703, + "learning_rate": 5.783060921248143e-08, + "loss": 0.1364, + "step": 14240 + }, + { + "epoch": 42.28486646884273, + "grad_norm": 1.2853831052780151, + "learning_rate": 5.7800891530460614e-08, + "loss": 0.1792, + "step": 14250 + }, + { + "epoch": 42.31454005934718, + "grad_norm": 0.9396013021469116, + "learning_rate": 5.7771173848439817e-08, + "loss": 0.092, + "step": 14260 + }, + { + "epoch": 42.34421364985163, + "grad_norm": 43.835914611816406, + "learning_rate": 5.774145616641901e-08, + "loss": 0.0564, + "step": 14270 + }, + { + "epoch": 42.37388724035608, + "grad_norm": 1.277904748916626, + "learning_rate": 5.771173848439821e-08, + "loss": 0.0508, + "step": 14280 + }, + { + "epoch": 42.403560830860535, + "grad_norm": 24.073429107666016, + "learning_rate": 5.768202080237741e-08, + "loss": 0.1279, + "step": 14290 + }, + { + "epoch": 42.433234421364986, + "grad_norm": 0.2394813746213913, + "learning_rate": 5.7652303120356605e-08, + "loss": 0.0118, + "step": 14300 + }, + { + "epoch": 42.46290801186944, + "grad_norm": 2.153155565261841, + "learning_rate": 5.76225854383358e-08, + "loss": 0.2286, + "step": 14310 + }, + { + "epoch": 42.49258160237389, + "grad_norm": 4.6165008544921875, + "learning_rate": 5.7592867756315e-08, + "loss": 0.105, + "step": 14320 + }, + { + "epoch": 42.52225519287834, + "grad_norm": 19.356531143188477, + "learning_rate": 5.75631500742942e-08, + "loss": 0.2929, + "step": 14330 + }, + { + "epoch": 42.55192878338279, + "grad_norm": 0.05769629403948784, + "learning_rate": 5.7533432392273394e-08, + "loss": 0.1599, + "step": 14340 + }, + { + "epoch": 42.58160237388724, + "grad_norm": 35.07514572143555, + "learning_rate": 5.7503714710252596e-08, + "loss": 0.1758, + "step": 14350 + }, + { + "epoch": 42.611275964391695, + "grad_norm": 2.708401679992676, + "learning_rate": 5.747399702823179e-08, + "loss": 0.2006, + "step": 14360 + }, + { + "epoch": 42.640949554896146, + "grad_norm": 0.43432655930519104, + "learning_rate": 5.744427934621099e-08, + "loss": 0.0993, + "step": 14370 + }, + { + "epoch": 42.67062314540059, + "grad_norm": 2.252474308013916, + "learning_rate": 5.741456166419019e-08, + "loss": 0.1215, + "step": 14380 + }, + { + "epoch": 42.70029673590504, + "grad_norm": 0.22341325879096985, + "learning_rate": 5.7384843982169385e-08, + "loss": 0.082, + "step": 14390 + }, + { + "epoch": 42.72997032640949, + "grad_norm": 12.576260566711426, + "learning_rate": 5.735512630014858e-08, + "loss": 0.0825, + "step": 14400 + }, + { + "epoch": 42.759643916913944, + "grad_norm": 0.16938528418540955, + "learning_rate": 5.732540861812778e-08, + "loss": 0.1777, + "step": 14410 + }, + { + "epoch": 42.789317507418396, + "grad_norm": 0.254700630903244, + "learning_rate": 5.729569093610698e-08, + "loss": 0.2347, + "step": 14420 + }, + { + "epoch": 42.81899109792285, + "grad_norm": 12.05561637878418, + "learning_rate": 5.726597325408617e-08, + "loss": 0.0842, + "step": 14430 + }, + { + "epoch": 42.8486646884273, + "grad_norm": 9.257681846618652, + "learning_rate": 5.7236255572065375e-08, + "loss": 0.1039, + "step": 14440 + }, + { + "epoch": 42.87833827893175, + "grad_norm": 23.891191482543945, + "learning_rate": 5.720653789004457e-08, + "loss": 0.1481, + "step": 14450 + }, + { + "epoch": 42.9080118694362, + "grad_norm": 2.9251699447631836, + "learning_rate": 5.7176820208023766e-08, + "loss": 0.0963, + "step": 14460 + }, + { + "epoch": 42.93768545994065, + "grad_norm": 0.04088553786277771, + "learning_rate": 5.714710252600297e-08, + "loss": 0.0212, + "step": 14470 + }, + { + "epoch": 42.967359050445104, + "grad_norm": 24.534875869750977, + "learning_rate": 5.7117384843982164e-08, + "loss": 0.1956, + "step": 14480 + }, + { + "epoch": 42.997032640949556, + "grad_norm": 0.22329719364643097, + "learning_rate": 5.708766716196136e-08, + "loss": 0.1485, + "step": 14490 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21706107258796692, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.381, + "eval_samples_per_second": 140.729, + "eval_steps_per_second": 17.709, + "step": 14491 + }, + { + "epoch": 43.02670623145401, + "grad_norm": 14.792704582214355, + "learning_rate": 5.705794947994056e-08, + "loss": 0.1141, + "step": 14500 + }, + { + "epoch": 43.05637982195846, + "grad_norm": 5.351875305175781, + "learning_rate": 5.702823179791976e-08, + "loss": 0.2071, + "step": 14510 + }, + { + "epoch": 43.08605341246291, + "grad_norm": 0.3951844871044159, + "learning_rate": 5.699851411589895e-08, + "loss": 0.1123, + "step": 14520 + }, + { + "epoch": 43.11572700296736, + "grad_norm": 0.2710784375667572, + "learning_rate": 5.6968796433878155e-08, + "loss": 0.1893, + "step": 14530 + }, + { + "epoch": 43.14540059347181, + "grad_norm": 0.057233598083257675, + "learning_rate": 5.693907875185735e-08, + "loss": 0.0709, + "step": 14540 + }, + { + "epoch": 43.175074183976264, + "grad_norm": 2.0287387371063232, + "learning_rate": 5.6909361069836546e-08, + "loss": 0.0125, + "step": 14550 + }, + { + "epoch": 43.204747774480715, + "grad_norm": 0.02577878162264824, + "learning_rate": 5.687964338781575e-08, + "loss": 0.2059, + "step": 14560 + }, + { + "epoch": 43.23442136498517, + "grad_norm": 0.016067614778876305, + "learning_rate": 5.6849925705794943e-08, + "loss": 0.0192, + "step": 14570 + }, + { + "epoch": 43.26409495548961, + "grad_norm": 0.15090984106063843, + "learning_rate": 5.682020802377414e-08, + "loss": 0.0884, + "step": 14580 + }, + { + "epoch": 43.29376854599406, + "grad_norm": 0.46075817942619324, + "learning_rate": 5.679049034175334e-08, + "loss": 0.1241, + "step": 14590 + }, + { + "epoch": 43.323442136498514, + "grad_norm": 0.549602746963501, + "learning_rate": 5.6760772659732537e-08, + "loss": 0.192, + "step": 14600 + }, + { + "epoch": 43.353115727002965, + "grad_norm": 0.43751761317253113, + "learning_rate": 5.673105497771173e-08, + "loss": 0.1048, + "step": 14610 + }, + { + "epoch": 43.38278931750742, + "grad_norm": 0.051166146993637085, + "learning_rate": 5.6701337295690934e-08, + "loss": 0.0932, + "step": 14620 + }, + { + "epoch": 43.41246290801187, + "grad_norm": 1.6542890071868896, + "learning_rate": 5.667161961367013e-08, + "loss": 0.1767, + "step": 14630 + }, + { + "epoch": 43.44213649851632, + "grad_norm": 6.84234094619751, + "learning_rate": 5.6641901931649325e-08, + "loss": 0.1445, + "step": 14640 + }, + { + "epoch": 43.47181008902077, + "grad_norm": 0.2606462240219116, + "learning_rate": 5.661218424962853e-08, + "loss": 0.3228, + "step": 14650 + }, + { + "epoch": 43.50148367952522, + "grad_norm": 7.124850749969482, + "learning_rate": 5.658246656760772e-08, + "loss": 0.1818, + "step": 14660 + }, + { + "epoch": 43.531157270029674, + "grad_norm": 0.16096651554107666, + "learning_rate": 5.655274888558692e-08, + "loss": 0.1064, + "step": 14670 + }, + { + "epoch": 43.560830860534125, + "grad_norm": 0.061637360602617264, + "learning_rate": 5.652303120356612e-08, + "loss": 0.0396, + "step": 14680 + }, + { + "epoch": 43.590504451038576, + "grad_norm": 34.11690902709961, + "learning_rate": 5.6493313521545316e-08, + "loss": 0.3291, + "step": 14690 + }, + { + "epoch": 43.62017804154303, + "grad_norm": 20.85869026184082, + "learning_rate": 5.646359583952451e-08, + "loss": 0.0933, + "step": 14700 + }, + { + "epoch": 43.64985163204748, + "grad_norm": 19.4937744140625, + "learning_rate": 5.6433878157503714e-08, + "loss": 0.0833, + "step": 14710 + }, + { + "epoch": 43.67952522255193, + "grad_norm": 9.231477737426758, + "learning_rate": 5.640416047548291e-08, + "loss": 0.1218, + "step": 14720 + }, + { + "epoch": 43.70919881305638, + "grad_norm": 13.731213569641113, + "learning_rate": 5.6374442793462105e-08, + "loss": 0.0813, + "step": 14730 + }, + { + "epoch": 43.73887240356083, + "grad_norm": 8.603341102600098, + "learning_rate": 5.634472511144131e-08, + "loss": 0.0647, + "step": 14740 + }, + { + "epoch": 43.768545994065285, + "grad_norm": 23.207216262817383, + "learning_rate": 5.63150074294205e-08, + "loss": 0.1769, + "step": 14750 + }, + { + "epoch": 43.798219584569736, + "grad_norm": 0.04011416807770729, + "learning_rate": 5.62852897473997e-08, + "loss": 0.1842, + "step": 14760 + }, + { + "epoch": 43.82789317507418, + "grad_norm": 7.597365856170654, + "learning_rate": 5.62555720653789e-08, + "loss": 0.1456, + "step": 14770 + }, + { + "epoch": 43.85756676557863, + "grad_norm": 5.669887065887451, + "learning_rate": 5.6225854383358095e-08, + "loss": 0.044, + "step": 14780 + }, + { + "epoch": 43.88724035608308, + "grad_norm": 31.03801727294922, + "learning_rate": 5.619613670133729e-08, + "loss": 0.1224, + "step": 14790 + }, + { + "epoch": 43.916913946587535, + "grad_norm": 0.09371539950370789, + "learning_rate": 5.616641901931649e-08, + "loss": 0.0693, + "step": 14800 + }, + { + "epoch": 43.946587537091986, + "grad_norm": 0.4759451746940613, + "learning_rate": 5.613670133729569e-08, + "loss": 0.0964, + "step": 14810 + }, + { + "epoch": 43.97626112759644, + "grad_norm": 1.3949414491653442, + "learning_rate": 5.6106983655274884e-08, + "loss": 0.0669, + "step": 14820 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.216058611869812, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.413, + "eval_samples_per_second": 140.028, + "eval_steps_per_second": 17.62, + "step": 14828 + }, + { + "epoch": 44.00593471810089, + "grad_norm": 4.004786014556885, + "learning_rate": 5.6077265973254086e-08, + "loss": 0.1105, + "step": 14830 + }, + { + "epoch": 44.03560830860534, + "grad_norm": 0.05671630799770355, + "learning_rate": 5.604754829123328e-08, + "loss": 0.087, + "step": 14840 + }, + { + "epoch": 44.06528189910979, + "grad_norm": 12.49288272857666, + "learning_rate": 5.601783060921248e-08, + "loss": 0.2238, + "step": 14850 + }, + { + "epoch": 44.09495548961424, + "grad_norm": 23.95133399963379, + "learning_rate": 5.598811292719168e-08, + "loss": 0.0569, + "step": 14860 + }, + { + "epoch": 44.124629080118694, + "grad_norm": 4.707876205444336, + "learning_rate": 5.5958395245170875e-08, + "loss": 0.1306, + "step": 14870 + }, + { + "epoch": 44.154302670623146, + "grad_norm": 0.13932177424430847, + "learning_rate": 5.592867756315007e-08, + "loss": 0.0315, + "step": 14880 + }, + { + "epoch": 44.1839762611276, + "grad_norm": 0.17878668010234833, + "learning_rate": 5.589895988112927e-08, + "loss": 0.0895, + "step": 14890 + }, + { + "epoch": 44.21364985163205, + "grad_norm": 13.573840141296387, + "learning_rate": 5.586924219910847e-08, + "loss": 0.1463, + "step": 14900 + }, + { + "epoch": 44.2433234421365, + "grad_norm": 22.46234130859375, + "learning_rate": 5.5839524517087664e-08, + "loss": 0.2177, + "step": 14910 + }, + { + "epoch": 44.27299703264095, + "grad_norm": 0.560173511505127, + "learning_rate": 5.5809806835066866e-08, + "loss": 0.0914, + "step": 14920 + }, + { + "epoch": 44.3026706231454, + "grad_norm": 0.24218808114528656, + "learning_rate": 5.578008915304606e-08, + "loss": 0.066, + "step": 14930 + }, + { + "epoch": 44.332344213649854, + "grad_norm": 4.442209243774414, + "learning_rate": 5.5750371471025257e-08, + "loss": 0.0453, + "step": 14940 + }, + { + "epoch": 44.362017804154306, + "grad_norm": 4.684684753417969, + "learning_rate": 5.572065378900446e-08, + "loss": 0.1396, + "step": 14950 + }, + { + "epoch": 44.39169139465876, + "grad_norm": 0.14059008657932281, + "learning_rate": 5.5690936106983654e-08, + "loss": 0.139, + "step": 14960 + }, + { + "epoch": 44.4213649851632, + "grad_norm": 29.814241409301758, + "learning_rate": 5.566121842496285e-08, + "loss": 0.1804, + "step": 14970 + }, + { + "epoch": 44.45103857566765, + "grad_norm": 5.223556995391846, + "learning_rate": 5.563150074294205e-08, + "loss": 0.0942, + "step": 14980 + }, + { + "epoch": 44.480712166172104, + "grad_norm": 23.62541961669922, + "learning_rate": 5.560178306092125e-08, + "loss": 0.149, + "step": 14990 + }, + { + "epoch": 44.510385756676556, + "grad_norm": 19.68107032775879, + "learning_rate": 5.557206537890044e-08, + "loss": 0.1757, + "step": 15000 + }, + { + "epoch": 44.54005934718101, + "grad_norm": 1.0169438123703003, + "learning_rate": 5.5542347696879645e-08, + "loss": 0.0765, + "step": 15010 + }, + { + "epoch": 44.56973293768546, + "grad_norm": 1.3169103860855103, + "learning_rate": 5.551263001485884e-08, + "loss": 0.1808, + "step": 15020 + }, + { + "epoch": 44.59940652818991, + "grad_norm": 0.01617828756570816, + "learning_rate": 5.5482912332838036e-08, + "loss": 0.1456, + "step": 15030 + }, + { + "epoch": 44.62908011869436, + "grad_norm": 2.9004364013671875, + "learning_rate": 5.545319465081724e-08, + "loss": 0.1997, + "step": 15040 + }, + { + "epoch": 44.65875370919881, + "grad_norm": 5.0402045249938965, + "learning_rate": 5.5423476968796434e-08, + "loss": 0.1095, + "step": 15050 + }, + { + "epoch": 44.688427299703264, + "grad_norm": 1.3393409252166748, + "learning_rate": 5.539375928677563e-08, + "loss": 0.0792, + "step": 15060 + }, + { + "epoch": 44.718100890207715, + "grad_norm": 4.1023759841918945, + "learning_rate": 5.536404160475483e-08, + "loss": 0.1195, + "step": 15070 + }, + { + "epoch": 44.74777448071217, + "grad_norm": 0.07596477121114731, + "learning_rate": 5.533432392273403e-08, + "loss": 0.0751, + "step": 15080 + }, + { + "epoch": 44.77744807121662, + "grad_norm": 15.187714576721191, + "learning_rate": 5.530460624071322e-08, + "loss": 0.1395, + "step": 15090 + }, + { + "epoch": 44.80712166172107, + "grad_norm": 0.1191861480474472, + "learning_rate": 5.5274888558692425e-08, + "loss": 0.1737, + "step": 15100 + }, + { + "epoch": 44.83679525222552, + "grad_norm": 0.18649892508983612, + "learning_rate": 5.524517087667162e-08, + "loss": 0.2512, + "step": 15110 + }, + { + "epoch": 44.86646884272997, + "grad_norm": 1.3505641222000122, + "learning_rate": 5.5215453194650816e-08, + "loss": 0.0644, + "step": 15120 + }, + { + "epoch": 44.896142433234424, + "grad_norm": 0.44012022018432617, + "learning_rate": 5.518573551263002e-08, + "loss": 0.2919, + "step": 15130 + }, + { + "epoch": 44.925816023738875, + "grad_norm": 0.13727468252182007, + "learning_rate": 5.515601783060921e-08, + "loss": 0.068, + "step": 15140 + }, + { + "epoch": 44.95548961424333, + "grad_norm": 0.11818359792232513, + "learning_rate": 5.512630014858841e-08, + "loss": 0.073, + "step": 15150 + }, + { + "epoch": 44.98516320474778, + "grad_norm": 26.8052921295166, + "learning_rate": 5.509658246656761e-08, + "loss": 0.0896, + "step": 15160 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.21658813953399658, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3772, + "eval_samples_per_second": 140.813, + "eval_steps_per_second": 17.719, + "step": 15165 + }, + { + "epoch": 45.01483679525222, + "grad_norm": 1.3133299350738525, + "learning_rate": 5.5066864784546806e-08, + "loss": 0.0979, + "step": 15170 + }, + { + "epoch": 45.04451038575667, + "grad_norm": 1.8585262298583984, + "learning_rate": 5.5037147102526e-08, + "loss": 0.2128, + "step": 15180 + }, + { + "epoch": 45.074183976261125, + "grad_norm": 11.573284149169922, + "learning_rate": 5.5007429420505204e-08, + "loss": 0.1201, + "step": 15190 + }, + { + "epoch": 45.103857566765576, + "grad_norm": 0.46482500433921814, + "learning_rate": 5.49777117384844e-08, + "loss": 0.0468, + "step": 15200 + }, + { + "epoch": 45.13353115727003, + "grad_norm": 25.420026779174805, + "learning_rate": 5.4947994056463595e-08, + "loss": 0.0994, + "step": 15210 + }, + { + "epoch": 45.16320474777448, + "grad_norm": 11.030900955200195, + "learning_rate": 5.49182763744428e-08, + "loss": 0.1539, + "step": 15220 + }, + { + "epoch": 45.19287833827893, + "grad_norm": 5.034533500671387, + "learning_rate": 5.488855869242199e-08, + "loss": 0.1186, + "step": 15230 + }, + { + "epoch": 45.22255192878338, + "grad_norm": 0.08379362523555756, + "learning_rate": 5.485884101040119e-08, + "loss": 0.1182, + "step": 15240 + }, + { + "epoch": 45.25222551928783, + "grad_norm": 23.93302345275879, + "learning_rate": 5.482912332838039e-08, + "loss": 0.1338, + "step": 15250 + }, + { + "epoch": 45.281899109792285, + "grad_norm": 1.4666907787322998, + "learning_rate": 5.4799405646359586e-08, + "loss": 0.1646, + "step": 15260 + }, + { + "epoch": 45.311572700296736, + "grad_norm": 0.543218195438385, + "learning_rate": 5.476968796433878e-08, + "loss": 0.0659, + "step": 15270 + }, + { + "epoch": 45.34124629080119, + "grad_norm": 12.238872528076172, + "learning_rate": 5.4739970282317983e-08, + "loss": 0.1104, + "step": 15280 + }, + { + "epoch": 45.37091988130564, + "grad_norm": 0.13234202563762665, + "learning_rate": 5.471025260029718e-08, + "loss": 0.1349, + "step": 15290 + }, + { + "epoch": 45.40059347181009, + "grad_norm": 1.9932081699371338, + "learning_rate": 5.468053491827637e-08, + "loss": 0.1094, + "step": 15300 + }, + { + "epoch": 45.43026706231454, + "grad_norm": 13.328103065490723, + "learning_rate": 5.465081723625556e-08, + "loss": 0.2651, + "step": 15310 + }, + { + "epoch": 45.45994065281899, + "grad_norm": 8.435056686401367, + "learning_rate": 5.462109955423476e-08, + "loss": 0.1571, + "step": 15320 + }, + { + "epoch": 45.489614243323444, + "grad_norm": 25.721866607666016, + "learning_rate": 5.459138187221396e-08, + "loss": 0.1452, + "step": 15330 + }, + { + "epoch": 45.519287833827896, + "grad_norm": 0.3235155940055847, + "learning_rate": 5.4561664190193156e-08, + "loss": 0.0982, + "step": 15340 + }, + { + "epoch": 45.54896142433235, + "grad_norm": 2.9249820709228516, + "learning_rate": 5.453194650817235e-08, + "loss": 0.072, + "step": 15350 + }, + { + "epoch": 45.5786350148368, + "grad_norm": 0.10884355008602142, + "learning_rate": 5.4502228826151554e-08, + "loss": 0.0872, + "step": 15360 + }, + { + "epoch": 45.60830860534124, + "grad_norm": 0.6670544147491455, + "learning_rate": 5.447251114413075e-08, + "loss": 0.0511, + "step": 15370 + }, + { + "epoch": 45.637982195845694, + "grad_norm": 0.08741656690835953, + "learning_rate": 5.4442793462109945e-08, + "loss": 0.1391, + "step": 15380 + }, + { + "epoch": 45.667655786350146, + "grad_norm": 0.10427907109260559, + "learning_rate": 5.441307578008915e-08, + "loss": 0.0709, + "step": 15390 + }, + { + "epoch": 45.6973293768546, + "grad_norm": 0.3116246163845062, + "learning_rate": 5.438335809806834e-08, + "loss": 0.3235, + "step": 15400 + }, + { + "epoch": 45.72700296735905, + "grad_norm": 0.3999086022377014, + "learning_rate": 5.435364041604754e-08, + "loss": 0.084, + "step": 15410 + }, + { + "epoch": 45.7566765578635, + "grad_norm": 0.08946636319160461, + "learning_rate": 5.432392273402674e-08, + "loss": 0.1088, + "step": 15420 + }, + { + "epoch": 45.78635014836795, + "grad_norm": 0.2102331966161728, + "learning_rate": 5.4294205052005936e-08, + "loss": 0.1023, + "step": 15430 + }, + { + "epoch": 45.8160237388724, + "grad_norm": 33.68644714355469, + "learning_rate": 5.426448736998514e-08, + "loss": 0.1584, + "step": 15440 + }, + { + "epoch": 45.845697329376854, + "grad_norm": 0.13862526416778564, + "learning_rate": 5.4234769687964333e-08, + "loss": 0.0998, + "step": 15450 + }, + { + "epoch": 45.875370919881306, + "grad_norm": 0.2303808033466339, + "learning_rate": 5.420505200594353e-08, + "loss": 0.1496, + "step": 15460 + }, + { + "epoch": 45.90504451038576, + "grad_norm": 0.04839453846216202, + "learning_rate": 5.417533432392273e-08, + "loss": 0.1463, + "step": 15470 + }, + { + "epoch": 45.93471810089021, + "grad_norm": 35.69575500488281, + "learning_rate": 5.4145616641901927e-08, + "loss": 0.1516, + "step": 15480 + }, + { + "epoch": 45.96439169139466, + "grad_norm": 0.19682061672210693, + "learning_rate": 5.411589895988112e-08, + "loss": 0.1373, + "step": 15490 + }, + { + "epoch": 45.99406528189911, + "grad_norm": 2.0283966064453125, + "learning_rate": 5.4086181277860324e-08, + "loss": 0.0398, + "step": 15500 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21634621918201447, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4508, + "eval_samples_per_second": 139.208, + "eval_steps_per_second": 17.517, + "step": 15502 + }, + { + "epoch": 46.02373887240356, + "grad_norm": 22.162120819091797, + "learning_rate": 5.405646359583952e-08, + "loss": 0.1007, + "step": 15510 + }, + { + "epoch": 46.053412462908014, + "grad_norm": 10.726767539978027, + "learning_rate": 5.4026745913818715e-08, + "loss": 0.1115, + "step": 15520 + }, + { + "epoch": 46.083086053412465, + "grad_norm": 0.49519988894462585, + "learning_rate": 5.399702823179792e-08, + "loss": 0.0761, + "step": 15530 + }, + { + "epoch": 46.11275964391692, + "grad_norm": 19.520936965942383, + "learning_rate": 5.396731054977711e-08, + "loss": 0.116, + "step": 15540 + }, + { + "epoch": 46.14243323442137, + "grad_norm": 28.2784423828125, + "learning_rate": 5.393759286775631e-08, + "loss": 0.0651, + "step": 15550 + }, + { + "epoch": 46.17210682492582, + "grad_norm": 0.34402093291282654, + "learning_rate": 5.390787518573551e-08, + "loss": 0.2101, + "step": 15560 + }, + { + "epoch": 46.201780415430264, + "grad_norm": 0.35318100452423096, + "learning_rate": 5.3878157503714706e-08, + "loss": 0.0286, + "step": 15570 + }, + { + "epoch": 46.231454005934715, + "grad_norm": 1.078883409500122, + "learning_rate": 5.38484398216939e-08, + "loss": 0.0736, + "step": 15580 + }, + { + "epoch": 46.26112759643917, + "grad_norm": 4.65336799621582, + "learning_rate": 5.3818722139673104e-08, + "loss": 0.3495, + "step": 15590 + }, + { + "epoch": 46.29080118694362, + "grad_norm": 26.353235244750977, + "learning_rate": 5.37890044576523e-08, + "loss": 0.0711, + "step": 15600 + }, + { + "epoch": 46.32047477744807, + "grad_norm": 15.620075225830078, + "learning_rate": 5.3759286775631495e-08, + "loss": 0.1347, + "step": 15610 + }, + { + "epoch": 46.35014836795252, + "grad_norm": 18.730243682861328, + "learning_rate": 5.37295690936107e-08, + "loss": 0.138, + "step": 15620 + }, + { + "epoch": 46.37982195845697, + "grad_norm": 11.024364471435547, + "learning_rate": 5.369985141158989e-08, + "loss": 0.2893, + "step": 15630 + }, + { + "epoch": 46.409495548961424, + "grad_norm": 3.0710813999176025, + "learning_rate": 5.367013372956909e-08, + "loss": 0.1474, + "step": 15640 + }, + { + "epoch": 46.439169139465875, + "grad_norm": 0.09054834395647049, + "learning_rate": 5.364041604754829e-08, + "loss": 0.2096, + "step": 15650 + }, + { + "epoch": 46.468842729970326, + "grad_norm": 0.19153709709644318, + "learning_rate": 5.3610698365527485e-08, + "loss": 0.1976, + "step": 15660 + }, + { + "epoch": 46.49851632047478, + "grad_norm": 0.803075909614563, + "learning_rate": 5.358098068350668e-08, + "loss": 0.1346, + "step": 15670 + }, + { + "epoch": 46.52818991097923, + "grad_norm": 34.9867057800293, + "learning_rate": 5.355126300148588e-08, + "loss": 0.4965, + "step": 15680 + }, + { + "epoch": 46.55786350148368, + "grad_norm": 0.012762948870658875, + "learning_rate": 5.352154531946508e-08, + "loss": 0.2135, + "step": 15690 + }, + { + "epoch": 46.58753709198813, + "grad_norm": 1.1131112575531006, + "learning_rate": 5.3491827637444274e-08, + "loss": 0.147, + "step": 15700 + }, + { + "epoch": 46.61721068249258, + "grad_norm": 0.027392152696847916, + "learning_rate": 5.3462109955423476e-08, + "loss": 0.1336, + "step": 15710 + }, + { + "epoch": 46.646884272997035, + "grad_norm": 0.2855999171733856, + "learning_rate": 5.343239227340267e-08, + "loss": 0.0661, + "step": 15720 + }, + { + "epoch": 46.676557863501486, + "grad_norm": 6.921504974365234, + "learning_rate": 5.340267459138187e-08, + "loss": 0.1895, + "step": 15730 + }, + { + "epoch": 46.70623145400594, + "grad_norm": 0.11567987501621246, + "learning_rate": 5.337295690936107e-08, + "loss": 0.1225, + "step": 15740 + }, + { + "epoch": 46.73590504451039, + "grad_norm": 5.026867866516113, + "learning_rate": 5.3343239227340265e-08, + "loss": 0.0138, + "step": 15750 + }, + { + "epoch": 46.76557863501483, + "grad_norm": 0.13589195907115936, + "learning_rate": 5.331352154531946e-08, + "loss": 0.0345, + "step": 15760 + }, + { + "epoch": 46.795252225519285, + "grad_norm": 0.7843965291976929, + "learning_rate": 5.328380386329866e-08, + "loss": 0.1175, + "step": 15770 + }, + { + "epoch": 46.824925816023736, + "grad_norm": 16.163616180419922, + "learning_rate": 5.325408618127786e-08, + "loss": 0.1496, + "step": 15780 + }, + { + "epoch": 46.85459940652819, + "grad_norm": 17.038578033447266, + "learning_rate": 5.3224368499257054e-08, + "loss": 0.115, + "step": 15790 + }, + { + "epoch": 46.88427299703264, + "grad_norm": 10.422926902770996, + "learning_rate": 5.3194650817236256e-08, + "loss": 0.0476, + "step": 15800 + }, + { + "epoch": 46.91394658753709, + "grad_norm": 0.15780656039714813, + "learning_rate": 5.316493313521545e-08, + "loss": 0.0857, + "step": 15810 + }, + { + "epoch": 46.94362017804154, + "grad_norm": 0.5017779469490051, + "learning_rate": 5.3135215453194647e-08, + "loss": 0.144, + "step": 15820 + }, + { + "epoch": 46.97329376854599, + "grad_norm": 0.1333116739988327, + "learning_rate": 5.310549777117385e-08, + "loss": 0.2162, + "step": 15830 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21495358645915985, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4252, + "eval_samples_per_second": 139.762, + "eval_steps_per_second": 17.587, + "step": 15839 + }, + { + "epoch": 47.002967359050444, + "grad_norm": 0.4618281126022339, + "learning_rate": 5.3075780089153044e-08, + "loss": 0.1299, + "step": 15840 + }, + { + "epoch": 47.032640949554896, + "grad_norm": 23.447547912597656, + "learning_rate": 5.304606240713224e-08, + "loss": 0.0523, + "step": 15850 + }, + { + "epoch": 47.06231454005935, + "grad_norm": 2.924910545349121, + "learning_rate": 5.301634472511144e-08, + "loss": 0.0811, + "step": 15860 + }, + { + "epoch": 47.0919881305638, + "grad_norm": 21.168272018432617, + "learning_rate": 5.298662704309064e-08, + "loss": 0.0824, + "step": 15870 + }, + { + "epoch": 47.12166172106825, + "grad_norm": 24.46132469177246, + "learning_rate": 5.295690936106983e-08, + "loss": 0.2462, + "step": 15880 + }, + { + "epoch": 47.1513353115727, + "grad_norm": 1.214876413345337, + "learning_rate": 5.2927191679049035e-08, + "loss": 0.0404, + "step": 15890 + }, + { + "epoch": 47.18100890207715, + "grad_norm": 5.6905388832092285, + "learning_rate": 5.289747399702823e-08, + "loss": 0.1123, + "step": 15900 + }, + { + "epoch": 47.210682492581604, + "grad_norm": 0.12871259450912476, + "learning_rate": 5.2867756315007426e-08, + "loss": 0.2443, + "step": 15910 + }, + { + "epoch": 47.240356083086056, + "grad_norm": 11.176782608032227, + "learning_rate": 5.283803863298663e-08, + "loss": 0.1196, + "step": 15920 + }, + { + "epoch": 47.27002967359051, + "grad_norm": 17.62759780883789, + "learning_rate": 5.2808320950965824e-08, + "loss": 0.0623, + "step": 15930 + }, + { + "epoch": 47.29970326409496, + "grad_norm": 21.132585525512695, + "learning_rate": 5.277860326894502e-08, + "loss": 0.4206, + "step": 15940 + }, + { + "epoch": 47.32937685459941, + "grad_norm": 12.714109420776367, + "learning_rate": 5.274888558692422e-08, + "loss": 0.0575, + "step": 15950 + }, + { + "epoch": 47.359050445103854, + "grad_norm": 0.07116535305976868, + "learning_rate": 5.271916790490342e-08, + "loss": 0.0328, + "step": 15960 + }, + { + "epoch": 47.388724035608305, + "grad_norm": 0.8279727697372437, + "learning_rate": 5.268945022288261e-08, + "loss": 0.1738, + "step": 15970 + }, + { + "epoch": 47.41839762611276, + "grad_norm": 12.997517585754395, + "learning_rate": 5.2659732540861814e-08, + "loss": 0.079, + "step": 15980 + }, + { + "epoch": 47.44807121661721, + "grad_norm": 23.596513748168945, + "learning_rate": 5.263001485884101e-08, + "loss": 0.19, + "step": 15990 + }, + { + "epoch": 47.47774480712166, + "grad_norm": 0.9486403465270996, + "learning_rate": 5.2600297176820206e-08, + "loss": 0.0594, + "step": 16000 + }, + { + "epoch": 47.50741839762611, + "grad_norm": 12.788954734802246, + "learning_rate": 5.257057949479941e-08, + "loss": 0.2127, + "step": 16010 + }, + { + "epoch": 47.53709198813056, + "grad_norm": 3.6194963455200195, + "learning_rate": 5.25408618127786e-08, + "loss": 0.2362, + "step": 16020 + }, + { + "epoch": 47.566765578635014, + "grad_norm": 13.14094352722168, + "learning_rate": 5.25111441307578e-08, + "loss": 0.1929, + "step": 16030 + }, + { + "epoch": 47.596439169139465, + "grad_norm": 6.189344882965088, + "learning_rate": 5.2481426448737e-08, + "loss": 0.254, + "step": 16040 + }, + { + "epoch": 47.62611275964392, + "grad_norm": 0.47985193133354187, + "learning_rate": 5.2451708766716196e-08, + "loss": 0.1538, + "step": 16050 + }, + { + "epoch": 47.65578635014837, + "grad_norm": 16.396726608276367, + "learning_rate": 5.242199108469539e-08, + "loss": 0.0112, + "step": 16060 + }, + { + "epoch": 47.68545994065282, + "grad_norm": 0.11378441751003265, + "learning_rate": 5.2392273402674594e-08, + "loss": 0.1268, + "step": 16070 + }, + { + "epoch": 47.71513353115727, + "grad_norm": 0.18873144686222076, + "learning_rate": 5.236255572065379e-08, + "loss": 0.1332, + "step": 16080 + }, + { + "epoch": 47.74480712166172, + "grad_norm": 3.973259210586548, + "learning_rate": 5.2332838038632985e-08, + "loss": 0.0661, + "step": 16090 + }, + { + "epoch": 47.774480712166174, + "grad_norm": 1.4226669073104858, + "learning_rate": 5.230312035661219e-08, + "loss": 0.0889, + "step": 16100 + }, + { + "epoch": 47.804154302670625, + "grad_norm": 0.014157813973724842, + "learning_rate": 5.227340267459138e-08, + "loss": 0.29, + "step": 16110 + }, + { + "epoch": 47.833827893175076, + "grad_norm": 0.11147283762693405, + "learning_rate": 5.224368499257058e-08, + "loss": 0.0501, + "step": 16120 + }, + { + "epoch": 47.86350148367953, + "grad_norm": 0.024849580600857735, + "learning_rate": 5.221396731054978e-08, + "loss": 0.0686, + "step": 16130 + }, + { + "epoch": 47.89317507418398, + "grad_norm": 32.79576110839844, + "learning_rate": 5.2184249628528976e-08, + "loss": 0.1952, + "step": 16140 + }, + { + "epoch": 47.92284866468843, + "grad_norm": 6.132491111755371, + "learning_rate": 5.215453194650817e-08, + "loss": 0.2297, + "step": 16150 + }, + { + "epoch": 47.952522255192875, + "grad_norm": 5.660188674926758, + "learning_rate": 5.2124814264487373e-08, + "loss": 0.2296, + "step": 16160 + }, + { + "epoch": 47.982195845697326, + "grad_norm": 0.46877506375312805, + "learning_rate": 5.209509658246657e-08, + "loss": 0.1125, + "step": 16170 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.2152387946844101, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4097, + "eval_samples_per_second": 140.1, + "eval_steps_per_second": 17.63, + "step": 16176 + }, + { + "epoch": 48.01186943620178, + "grad_norm": 1.4412373304367065, + "learning_rate": 5.2065378900445764e-08, + "loss": 0.0527, + "step": 16180 + }, + { + "epoch": 48.04154302670623, + "grad_norm": 23.187875747680664, + "learning_rate": 5.2035661218424966e-08, + "loss": 0.1065, + "step": 16190 + }, + { + "epoch": 48.07121661721068, + "grad_norm": 3.96274733543396, + "learning_rate": 5.200594353640416e-08, + "loss": 0.0981, + "step": 16200 + }, + { + "epoch": 48.10089020771513, + "grad_norm": 26.154644012451172, + "learning_rate": 5.197622585438336e-08, + "loss": 0.1573, + "step": 16210 + }, + { + "epoch": 48.13056379821958, + "grad_norm": 0.03977694362401962, + "learning_rate": 5.194650817236256e-08, + "loss": 0.1114, + "step": 16220 + }, + { + "epoch": 48.160237388724035, + "grad_norm": 30.106239318847656, + "learning_rate": 5.1916790490341755e-08, + "loss": 0.2009, + "step": 16230 + }, + { + "epoch": 48.189910979228486, + "grad_norm": 2.2174997329711914, + "learning_rate": 5.188707280832095e-08, + "loss": 0.1021, + "step": 16240 + }, + { + "epoch": 48.21958456973294, + "grad_norm": 21.587839126586914, + "learning_rate": 5.185735512630015e-08, + "loss": 0.0768, + "step": 16250 + }, + { + "epoch": 48.24925816023739, + "grad_norm": 0.021659675985574722, + "learning_rate": 5.182763744427935e-08, + "loss": 0.0293, + "step": 16260 + }, + { + "epoch": 48.27893175074184, + "grad_norm": 11.553271293640137, + "learning_rate": 5.1797919762258544e-08, + "loss": 0.112, + "step": 16270 + }, + { + "epoch": 48.30860534124629, + "grad_norm": 1.6398833990097046, + "learning_rate": 5.1768202080237746e-08, + "loss": 0.198, + "step": 16280 + }, + { + "epoch": 48.33827893175074, + "grad_norm": 0.18100570142269135, + "learning_rate": 5.173848439821694e-08, + "loss": 0.0481, + "step": 16290 + }, + { + "epoch": 48.367952522255194, + "grad_norm": 1.2032248973846436, + "learning_rate": 5.170876671619614e-08, + "loss": 0.0454, + "step": 16300 + }, + { + "epoch": 48.397626112759646, + "grad_norm": 20.38185691833496, + "learning_rate": 5.167904903417534e-08, + "loss": 0.0608, + "step": 16310 + }, + { + "epoch": 48.4272997032641, + "grad_norm": 0.2795165479183197, + "learning_rate": 5.1649331352154535e-08, + "loss": 0.0893, + "step": 16320 + }, + { + "epoch": 48.45697329376855, + "grad_norm": 13.565400123596191, + "learning_rate": 5.161961367013373e-08, + "loss": 0.2159, + "step": 16330 + }, + { + "epoch": 48.486646884273, + "grad_norm": 0.6898295879364014, + "learning_rate": 5.158989598811293e-08, + "loss": 0.0856, + "step": 16340 + }, + { + "epoch": 48.51632047477745, + "grad_norm": 0.2843184769153595, + "learning_rate": 5.1560178306092114e-08, + "loss": 0.1411, + "step": 16350 + }, + { + "epoch": 48.545994065281896, + "grad_norm": 0.15256699919700623, + "learning_rate": 5.1530460624071317e-08, + "loss": 0.1102, + "step": 16360 + }, + { + "epoch": 48.57566765578635, + "grad_norm": 0.6682108640670776, + "learning_rate": 5.150074294205051e-08, + "loss": 0.0905, + "step": 16370 + }, + { + "epoch": 48.6053412462908, + "grad_norm": 0.3731585741043091, + "learning_rate": 5.147102526002971e-08, + "loss": 0.0792, + "step": 16380 + }, + { + "epoch": 48.63501483679525, + "grad_norm": 17.357967376708984, + "learning_rate": 5.144130757800891e-08, + "loss": 0.1973, + "step": 16390 + }, + { + "epoch": 48.6646884272997, + "grad_norm": 0.10166790336370468, + "learning_rate": 5.1411589895988105e-08, + "loss": 0.0505, + "step": 16400 + }, + { + "epoch": 48.69436201780415, + "grad_norm": 10.09204387664795, + "learning_rate": 5.13818722139673e-08, + "loss": 0.0801, + "step": 16410 + }, + { + "epoch": 48.724035608308604, + "grad_norm": 20.897958755493164, + "learning_rate": 5.13521545319465e-08, + "loss": 0.0998, + "step": 16420 + }, + { + "epoch": 48.753709198813056, + "grad_norm": 0.5123386979103088, + "learning_rate": 5.13224368499257e-08, + "loss": 0.1937, + "step": 16430 + }, + { + "epoch": 48.78338278931751, + "grad_norm": 23.21985626220703, + "learning_rate": 5.1292719167904894e-08, + "loss": 0.203, + "step": 16440 + }, + { + "epoch": 48.81305637982196, + "grad_norm": 0.6679745316505432, + "learning_rate": 5.1263001485884096e-08, + "loss": 0.2985, + "step": 16450 + }, + { + "epoch": 48.84272997032641, + "grad_norm": 0.14492544531822205, + "learning_rate": 5.123328380386329e-08, + "loss": 0.1142, + "step": 16460 + }, + { + "epoch": 48.87240356083086, + "grad_norm": 22.184814453125, + "learning_rate": 5.120356612184249e-08, + "loss": 0.0373, + "step": 16470 + }, + { + "epoch": 48.90207715133531, + "grad_norm": 0.9699323177337646, + "learning_rate": 5.117384843982169e-08, + "loss": 0.2115, + "step": 16480 + }, + { + "epoch": 48.931750741839764, + "grad_norm": 0.07130200415849686, + "learning_rate": 5.1144130757800885e-08, + "loss": 0.1396, + "step": 16490 + }, + { + "epoch": 48.961424332344215, + "grad_norm": 0.05964432656764984, + "learning_rate": 5.111441307578008e-08, + "loss": 0.0417, + "step": 16500 + }, + { + "epoch": 48.99109792284867, + "grad_norm": 19.631088256835938, + "learning_rate": 5.108469539375928e-08, + "loss": 0.2186, + "step": 16510 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.21550168097019196, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4444, + "eval_samples_per_second": 139.346, + "eval_steps_per_second": 17.535, + "step": 16513 + }, + { + "epoch": 49.02077151335312, + "grad_norm": 3.1347455978393555, + "learning_rate": 5.105497771173848e-08, + "loss": 0.1192, + "step": 16520 + }, + { + "epoch": 49.05044510385757, + "grad_norm": 0.014873781241476536, + "learning_rate": 5.102526002971767e-08, + "loss": 0.0988, + "step": 16530 + }, + { + "epoch": 49.08011869436202, + "grad_norm": 0.032999180257320404, + "learning_rate": 5.0995542347696875e-08, + "loss": 0.2089, + "step": 16540 + }, + { + "epoch": 49.10979228486647, + "grad_norm": 3.205204486846924, + "learning_rate": 5.096582466567607e-08, + "loss": 0.1139, + "step": 16550 + }, + { + "epoch": 49.13946587537092, + "grad_norm": 6.010907173156738, + "learning_rate": 5.0936106983655266e-08, + "loss": 0.1248, + "step": 16560 + }, + { + "epoch": 49.16913946587537, + "grad_norm": 4.526744842529297, + "learning_rate": 5.090638930163447e-08, + "loss": 0.0907, + "step": 16570 + }, + { + "epoch": 49.19881305637982, + "grad_norm": 5.962198257446289, + "learning_rate": 5.0876671619613664e-08, + "loss": 0.1307, + "step": 16580 + }, + { + "epoch": 49.22848664688427, + "grad_norm": 0.6678928732872009, + "learning_rate": 5.084695393759286e-08, + "loss": 0.054, + "step": 16590 + }, + { + "epoch": 49.25816023738872, + "grad_norm": 10.566190719604492, + "learning_rate": 5.081723625557206e-08, + "loss": 0.1322, + "step": 16600 + }, + { + "epoch": 49.287833827893174, + "grad_norm": 1.418585181236267, + "learning_rate": 5.078751857355126e-08, + "loss": 0.1893, + "step": 16610 + }, + { + "epoch": 49.317507418397625, + "grad_norm": 0.11207262426614761, + "learning_rate": 5.075780089153045e-08, + "loss": 0.2091, + "step": 16620 + }, + { + "epoch": 49.347181008902076, + "grad_norm": 0.2192656248807907, + "learning_rate": 5.0728083209509655e-08, + "loss": 0.0345, + "step": 16630 + }, + { + "epoch": 49.37685459940653, + "grad_norm": 0.3981482982635498, + "learning_rate": 5.069836552748885e-08, + "loss": 0.1417, + "step": 16640 + }, + { + "epoch": 49.40652818991098, + "grad_norm": 0.9440770149230957, + "learning_rate": 5.0668647845468046e-08, + "loss": 0.0113, + "step": 16650 + }, + { + "epoch": 49.43620178041543, + "grad_norm": 1.999841570854187, + "learning_rate": 5.063893016344725e-08, + "loss": 0.0467, + "step": 16660 + }, + { + "epoch": 49.46587537091988, + "grad_norm": 0.14728444814682007, + "learning_rate": 5.0609212481426444e-08, + "loss": 0.1394, + "step": 16670 + }, + { + "epoch": 49.49554896142433, + "grad_norm": 1.4735740423202515, + "learning_rate": 5.057949479940564e-08, + "loss": 0.1454, + "step": 16680 + }, + { + "epoch": 49.525222551928785, + "grad_norm": 2.3318047523498535, + "learning_rate": 5.054977711738484e-08, + "loss": 0.2064, + "step": 16690 + }, + { + "epoch": 49.554896142433236, + "grad_norm": 7.936349868774414, + "learning_rate": 5.0520059435364037e-08, + "loss": 0.0122, + "step": 16700 + }, + { + "epoch": 49.58456973293769, + "grad_norm": 1.2083709239959717, + "learning_rate": 5.049034175334323e-08, + "loss": 0.0432, + "step": 16710 + }, + { + "epoch": 49.61424332344214, + "grad_norm": 0.7646023035049438, + "learning_rate": 5.0460624071322434e-08, + "loss": 0.1193, + "step": 16720 + }, + { + "epoch": 49.64391691394659, + "grad_norm": 0.47022977471351624, + "learning_rate": 5.043090638930163e-08, + "loss": 0.145, + "step": 16730 + }, + { + "epoch": 49.67359050445104, + "grad_norm": 23.29509925842285, + "learning_rate": 5.0401188707280825e-08, + "loss": 0.0602, + "step": 16740 + }, + { + "epoch": 49.703264094955486, + "grad_norm": 8.746318817138672, + "learning_rate": 5.037147102526003e-08, + "loss": 0.0848, + "step": 16750 + }, + { + "epoch": 49.73293768545994, + "grad_norm": 0.23059499263763428, + "learning_rate": 5.034175334323922e-08, + "loss": 0.1202, + "step": 16760 + }, + { + "epoch": 49.76261127596439, + "grad_norm": 7.272520065307617, + "learning_rate": 5.031203566121842e-08, + "loss": 0.0887, + "step": 16770 + }, + { + "epoch": 49.79228486646884, + "grad_norm": 0.03718450665473938, + "learning_rate": 5.028231797919762e-08, + "loss": 0.0149, + "step": 16780 + }, + { + "epoch": 49.82195845697329, + "grad_norm": 0.08075323700904846, + "learning_rate": 5.0252600297176816e-08, + "loss": 0.1563, + "step": 16790 + }, + { + "epoch": 49.85163204747774, + "grad_norm": 1.1455299854278564, + "learning_rate": 5.022288261515601e-08, + "loss": 0.3049, + "step": 16800 + }, + { + "epoch": 49.881305637982194, + "grad_norm": 25.477001190185547, + "learning_rate": 5.0193164933135214e-08, + "loss": 0.0506, + "step": 16810 + }, + { + "epoch": 49.910979228486646, + "grad_norm": 0.21012915670871735, + "learning_rate": 5.016344725111441e-08, + "loss": 0.0196, + "step": 16820 + }, + { + "epoch": 49.9406528189911, + "grad_norm": 6.624281406402588, + "learning_rate": 5.0133729569093605e-08, + "loss": 0.1561, + "step": 16830 + }, + { + "epoch": 49.97032640949555, + "grad_norm": 5.852079391479492, + "learning_rate": 5.010401188707281e-08, + "loss": 0.2763, + "step": 16840 + }, + { + "epoch": 50.0, + "grad_norm": 32.61520767211914, + "learning_rate": 5.0074294205052e-08, + "loss": 0.1517, + "step": 16850 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21467024087905884, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3724, + "eval_samples_per_second": 140.92, + "eval_steps_per_second": 17.733, + "step": 16850 + }, + { + "epoch": 50.02967359050445, + "grad_norm": 0.8358116149902344, + "learning_rate": 5.00445765230312e-08, + "loss": 0.0189, + "step": 16860 + }, + { + "epoch": 50.0593471810089, + "grad_norm": 23.692110061645508, + "learning_rate": 5.00148588410104e-08, + "loss": 0.0784, + "step": 16870 + }, + { + "epoch": 50.089020771513354, + "grad_norm": 13.013814926147461, + "learning_rate": 4.9985141158989595e-08, + "loss": 0.2013, + "step": 16880 + }, + { + "epoch": 50.118694362017806, + "grad_norm": 26.055011749267578, + "learning_rate": 4.99554234769688e-08, + "loss": 0.1872, + "step": 16890 + }, + { + "epoch": 50.14836795252226, + "grad_norm": 7.37661075592041, + "learning_rate": 4.992570579494799e-08, + "loss": 0.0211, + "step": 16900 + }, + { + "epoch": 50.17804154302671, + "grad_norm": 0.046912938356399536, + "learning_rate": 4.989598811292719e-08, + "loss": 0.1109, + "step": 16910 + }, + { + "epoch": 50.20771513353116, + "grad_norm": 0.0522104911506176, + "learning_rate": 4.986627043090639e-08, + "loss": 0.0575, + "step": 16920 + }, + { + "epoch": 50.23738872403561, + "grad_norm": 4.3547444343566895, + "learning_rate": 4.9836552748885586e-08, + "loss": 0.0965, + "step": 16930 + }, + { + "epoch": 50.26706231454006, + "grad_norm": 3.168205738067627, + "learning_rate": 4.980683506686478e-08, + "loss": 0.0399, + "step": 16940 + }, + { + "epoch": 50.29673590504451, + "grad_norm": 17.390626907348633, + "learning_rate": 4.9777117384843984e-08, + "loss": 0.0712, + "step": 16950 + }, + { + "epoch": 50.32640949554896, + "grad_norm": 1.475823998451233, + "learning_rate": 4.974739970282318e-08, + "loss": 0.1721, + "step": 16960 + }, + { + "epoch": 50.35608308605341, + "grad_norm": 18.49626350402832, + "learning_rate": 4.9717682020802375e-08, + "loss": 0.1681, + "step": 16970 + }, + { + "epoch": 50.38575667655786, + "grad_norm": 0.029805028811097145, + "learning_rate": 4.968796433878158e-08, + "loss": 0.134, + "step": 16980 + }, + { + "epoch": 50.41543026706231, + "grad_norm": 7.499514102935791, + "learning_rate": 4.965824665676077e-08, + "loss": 0.2298, + "step": 16990 + }, + { + "epoch": 50.445103857566764, + "grad_norm": 0.10663921386003494, + "learning_rate": 4.962852897473997e-08, + "loss": 0.2824, + "step": 17000 + }, + { + "epoch": 50.474777448071215, + "grad_norm": 0.9048697352409363, + "learning_rate": 4.959881129271917e-08, + "loss": 0.1257, + "step": 17010 + }, + { + "epoch": 50.50445103857567, + "grad_norm": 1.0663954019546509, + "learning_rate": 4.9569093610698366e-08, + "loss": 0.1847, + "step": 17020 + }, + { + "epoch": 50.53412462908012, + "grad_norm": 0.7726712822914124, + "learning_rate": 4.953937592867756e-08, + "loss": 0.2582, + "step": 17030 + }, + { + "epoch": 50.56379821958457, + "grad_norm": 24.890085220336914, + "learning_rate": 4.9509658246656763e-08, + "loss": 0.1337, + "step": 17040 + }, + { + "epoch": 50.59347181008902, + "grad_norm": 10.706290245056152, + "learning_rate": 4.947994056463596e-08, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 50.62314540059347, + "grad_norm": 19.70595359802246, + "learning_rate": 4.9450222882615154e-08, + "loss": 0.1908, + "step": 17060 + }, + { + "epoch": 50.652818991097924, + "grad_norm": 0.3706263601779938, + "learning_rate": 4.9420505200594356e-08, + "loss": 0.178, + "step": 17070 + }, + { + "epoch": 50.682492581602375, + "grad_norm": 29.093524932861328, + "learning_rate": 4.939078751857355e-08, + "loss": 0.1124, + "step": 17080 + }, + { + "epoch": 50.712166172106826, + "grad_norm": 0.8723905086517334, + "learning_rate": 4.936106983655275e-08, + "loss": 0.0676, + "step": 17090 + }, + { + "epoch": 50.74183976261128, + "grad_norm": 2.0415871143341064, + "learning_rate": 4.933135215453195e-08, + "loss": 0.0374, + "step": 17100 + }, + { + "epoch": 50.77151335311573, + "grad_norm": 9.756440162658691, + "learning_rate": 4.9301634472511145e-08, + "loss": 0.0968, + "step": 17110 + }, + { + "epoch": 50.80118694362018, + "grad_norm": 1.1791406869888306, + "learning_rate": 4.927191679049034e-08, + "loss": 0.1105, + "step": 17120 + }, + { + "epoch": 50.83086053412463, + "grad_norm": 13.14554214477539, + "learning_rate": 4.924219910846954e-08, + "loss": 0.1913, + "step": 17130 + }, + { + "epoch": 50.86053412462908, + "grad_norm": 2.9980385303497314, + "learning_rate": 4.921248142644873e-08, + "loss": 0.2487, + "step": 17140 + }, + { + "epoch": 50.89020771513353, + "grad_norm": 20.3254451751709, + "learning_rate": 4.918276374442793e-08, + "loss": 0.0848, + "step": 17150 + }, + { + "epoch": 50.91988130563798, + "grad_norm": 1.5445979833602905, + "learning_rate": 4.915304606240713e-08, + "loss": 0.0407, + "step": 17160 + }, + { + "epoch": 50.94955489614243, + "grad_norm": 7.252246856689453, + "learning_rate": 4.9123328380386325e-08, + "loss": 0.0215, + "step": 17170 + }, + { + "epoch": 50.97922848664688, + "grad_norm": 0.12611857056617737, + "learning_rate": 4.909361069836552e-08, + "loss": 0.12, + "step": 17180 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21472297608852386, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3993, + "eval_samples_per_second": 140.328, + "eval_steps_per_second": 17.658, + "step": 17187 + }, + { + "epoch": 51.00890207715133, + "grad_norm": 10.329874038696289, + "learning_rate": 4.906389301634472e-08, + "loss": 0.0857, + "step": 17190 + }, + { + "epoch": 51.038575667655785, + "grad_norm": 23.54206657409668, + "learning_rate": 4.903417533432392e-08, + "loss": 0.1937, + "step": 17200 + }, + { + "epoch": 51.068249258160236, + "grad_norm": 0.08602197468280792, + "learning_rate": 4.9004457652303113e-08, + "loss": 0.2581, + "step": 17210 + }, + { + "epoch": 51.09792284866469, + "grad_norm": 13.050220489501953, + "learning_rate": 4.8974739970282316e-08, + "loss": 0.0918, + "step": 17220 + }, + { + "epoch": 51.12759643916914, + "grad_norm": 0.019280022010207176, + "learning_rate": 4.894502228826151e-08, + "loss": 0.0231, + "step": 17230 + }, + { + "epoch": 51.15727002967359, + "grad_norm": 0.9664857983589172, + "learning_rate": 4.8915304606240707e-08, + "loss": 0.0285, + "step": 17240 + }, + { + "epoch": 51.18694362017804, + "grad_norm": 0.14094841480255127, + "learning_rate": 4.888558692421991e-08, + "loss": 0.1066, + "step": 17250 + }, + { + "epoch": 51.21661721068249, + "grad_norm": 0.2799805998802185, + "learning_rate": 4.8855869242199104e-08, + "loss": 0.0778, + "step": 17260 + }, + { + "epoch": 51.246290801186944, + "grad_norm": 2.6953928470611572, + "learning_rate": 4.88261515601783e-08, + "loss": 0.0867, + "step": 17270 + }, + { + "epoch": 51.275964391691396, + "grad_norm": 5.554049015045166, + "learning_rate": 4.87964338781575e-08, + "loss": 0.1607, + "step": 17280 + }, + { + "epoch": 51.30563798219585, + "grad_norm": 0.9602094888687134, + "learning_rate": 4.87667161961367e-08, + "loss": 0.0845, + "step": 17290 + }, + { + "epoch": 51.3353115727003, + "grad_norm": 6.4245829582214355, + "learning_rate": 4.873699851411589e-08, + "loss": 0.1017, + "step": 17300 + }, + { + "epoch": 51.36498516320475, + "grad_norm": 0.16743648052215576, + "learning_rate": 4.8707280832095095e-08, + "loss": 0.0332, + "step": 17310 + }, + { + "epoch": 51.3946587537092, + "grad_norm": 1.1781718730926514, + "learning_rate": 4.867756315007429e-08, + "loss": 0.2762, + "step": 17320 + }, + { + "epoch": 51.42433234421365, + "grad_norm": 0.08699502050876617, + "learning_rate": 4.8647845468053486e-08, + "loss": 0.0189, + "step": 17330 + }, + { + "epoch": 51.454005934718104, + "grad_norm": 1.894694209098816, + "learning_rate": 4.861812778603269e-08, + "loss": 0.0461, + "step": 17340 + }, + { + "epoch": 51.48367952522255, + "grad_norm": 15.14892578125, + "learning_rate": 4.8588410104011884e-08, + "loss": 0.0742, + "step": 17350 + }, + { + "epoch": 51.513353115727, + "grad_norm": 1.1362648010253906, + "learning_rate": 4.855869242199108e-08, + "loss": 0.0569, + "step": 17360 + }, + { + "epoch": 51.54302670623145, + "grad_norm": 2.941601037979126, + "learning_rate": 4.852897473997028e-08, + "loss": 0.0677, + "step": 17370 + }, + { + "epoch": 51.5727002967359, + "grad_norm": 7.638850212097168, + "learning_rate": 4.849925705794948e-08, + "loss": 0.1884, + "step": 17380 + }, + { + "epoch": 51.602373887240354, + "grad_norm": 0.34345099329948425, + "learning_rate": 4.846953937592867e-08, + "loss": 0.096, + "step": 17390 + }, + { + "epoch": 51.632047477744806, + "grad_norm": 6.40854549407959, + "learning_rate": 4.8439821693907874e-08, + "loss": 0.1579, + "step": 17400 + }, + { + "epoch": 51.66172106824926, + "grad_norm": 7.019104957580566, + "learning_rate": 4.841010401188707e-08, + "loss": 0.1391, + "step": 17410 + }, + { + "epoch": 51.69139465875371, + "grad_norm": 0.1263841688632965, + "learning_rate": 4.8380386329866265e-08, + "loss": 0.1549, + "step": 17420 + }, + { + "epoch": 51.72106824925816, + "grad_norm": 0.7122487425804138, + "learning_rate": 4.835066864784547e-08, + "loss": 0.0134, + "step": 17430 + }, + { + "epoch": 51.75074183976261, + "grad_norm": 29.82644271850586, + "learning_rate": 4.832095096582466e-08, + "loss": 0.1825, + "step": 17440 + }, + { + "epoch": 51.78041543026706, + "grad_norm": 13.365633964538574, + "learning_rate": 4.829123328380386e-08, + "loss": 0.0626, + "step": 17450 + }, + { + "epoch": 51.810089020771514, + "grad_norm": 12.48543643951416, + "learning_rate": 4.826151560178306e-08, + "loss": 0.095, + "step": 17460 + }, + { + "epoch": 51.839762611275965, + "grad_norm": 24.76251983642578, + "learning_rate": 4.8231797919762256e-08, + "loss": 0.0481, + "step": 17470 + }, + { + "epoch": 51.86943620178042, + "grad_norm": 1.793227195739746, + "learning_rate": 4.820208023774145e-08, + "loss": 0.1517, + "step": 17480 + }, + { + "epoch": 51.89910979228487, + "grad_norm": 0.8583541512489319, + "learning_rate": 4.8172362555720654e-08, + "loss": 0.0804, + "step": 17490 + }, + { + "epoch": 51.92878338278932, + "grad_norm": 19.491729736328125, + "learning_rate": 4.814264487369985e-08, + "loss": 0.0841, + "step": 17500 + }, + { + "epoch": 51.95845697329377, + "grad_norm": 0.47182202339172363, + "learning_rate": 4.8112927191679045e-08, + "loss": 0.2723, + "step": 17510 + }, + { + "epoch": 51.98813056379822, + "grad_norm": 3.0573573112487793, + "learning_rate": 4.808320950965825e-08, + "loss": 0.1466, + "step": 17520 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21579141914844513, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3925, + "eval_samples_per_second": 140.476, + "eval_steps_per_second": 17.677, + "step": 17524 + }, + { + "epoch": 52.017804154302674, + "grad_norm": 6.013158798217773, + "learning_rate": 4.805349182763744e-08, + "loss": 0.1433, + "step": 17530 + }, + { + "epoch": 52.047477744807125, + "grad_norm": 2.589423894882202, + "learning_rate": 4.802377414561664e-08, + "loss": 0.2028, + "step": 17540 + }, + { + "epoch": 52.07715133531157, + "grad_norm": 1.173121690750122, + "learning_rate": 4.799405646359584e-08, + "loss": 0.0552, + "step": 17550 + }, + { + "epoch": 52.10682492581602, + "grad_norm": 9.217979431152344, + "learning_rate": 4.7964338781575036e-08, + "loss": 0.1322, + "step": 17560 + }, + { + "epoch": 52.13649851632047, + "grad_norm": 0.23394060134887695, + "learning_rate": 4.793462109955423e-08, + "loss": 0.2052, + "step": 17570 + }, + { + "epoch": 52.166172106824924, + "grad_norm": 1.2165412902832031, + "learning_rate": 4.790490341753343e-08, + "loss": 0.2851, + "step": 17580 + }, + { + "epoch": 52.195845697329375, + "grad_norm": 0.04559088125824928, + "learning_rate": 4.787518573551263e-08, + "loss": 0.1432, + "step": 17590 + }, + { + "epoch": 52.225519287833826, + "grad_norm": 0.23518282175064087, + "learning_rate": 4.784546805349183e-08, + "loss": 0.0573, + "step": 17600 + }, + { + "epoch": 52.25519287833828, + "grad_norm": 26.808229446411133, + "learning_rate": 4.7815750371471026e-08, + "loss": 0.1975, + "step": 17610 + }, + { + "epoch": 52.28486646884273, + "grad_norm": 0.1649000644683838, + "learning_rate": 4.778603268945022e-08, + "loss": 0.1042, + "step": 17620 + }, + { + "epoch": 52.31454005934718, + "grad_norm": 7.017621994018555, + "learning_rate": 4.7756315007429424e-08, + "loss": 0.125, + "step": 17630 + }, + { + "epoch": 52.34421364985163, + "grad_norm": 3.0812735557556152, + "learning_rate": 4.772659732540862e-08, + "loss": 0.2309, + "step": 17640 + }, + { + "epoch": 52.37388724035608, + "grad_norm": 19.32549476623535, + "learning_rate": 4.7696879643387815e-08, + "loss": 0.2046, + "step": 17650 + }, + { + "epoch": 52.403560830860535, + "grad_norm": 0.06391577422618866, + "learning_rate": 4.766716196136702e-08, + "loss": 0.1415, + "step": 17660 + }, + { + "epoch": 52.433234421364986, + "grad_norm": 17.47604751586914, + "learning_rate": 4.7637444279346206e-08, + "loss": 0.0471, + "step": 17670 + }, + { + "epoch": 52.46290801186944, + "grad_norm": 0.36303091049194336, + "learning_rate": 4.76077265973254e-08, + "loss": 0.0779, + "step": 17680 + }, + { + "epoch": 52.49258160237389, + "grad_norm": 0.425492525100708, + "learning_rate": 4.7578008915304604e-08, + "loss": 0.0241, + "step": 17690 + }, + { + "epoch": 52.52225519287834, + "grad_norm": 0.7421813011169434, + "learning_rate": 4.75482912332838e-08, + "loss": 0.1546, + "step": 17700 + }, + { + "epoch": 52.55192878338279, + "grad_norm": 0.7607003450393677, + "learning_rate": 4.7518573551262995e-08, + "loss": 0.086, + "step": 17710 + }, + { + "epoch": 52.58160237388724, + "grad_norm": 0.31621578335762024, + "learning_rate": 4.74888558692422e-08, + "loss": 0.0437, + "step": 17720 + }, + { + "epoch": 52.611275964391695, + "grad_norm": 0.6049529314041138, + "learning_rate": 4.745913818722139e-08, + "loss": 0.1329, + "step": 17730 + }, + { + "epoch": 52.640949554896146, + "grad_norm": 0.1794736087322235, + "learning_rate": 4.742942050520059e-08, + "loss": 0.1177, + "step": 17740 + }, + { + "epoch": 52.67062314540059, + "grad_norm": 0.620769202709198, + "learning_rate": 4.739970282317979e-08, + "loss": 0.0237, + "step": 17750 + }, + { + "epoch": 52.70029673590504, + "grad_norm": 0.6835030913352966, + "learning_rate": 4.7369985141158985e-08, + "loss": 0.1159, + "step": 17760 + }, + { + "epoch": 52.72997032640949, + "grad_norm": 20.63369369506836, + "learning_rate": 4.734026745913818e-08, + "loss": 0.2486, + "step": 17770 + }, + { + "epoch": 52.759643916913944, + "grad_norm": 6.866409778594971, + "learning_rate": 4.731054977711738e-08, + "loss": 0.138, + "step": 17780 + }, + { + "epoch": 52.789317507418396, + "grad_norm": 16.915674209594727, + "learning_rate": 4.728083209509658e-08, + "loss": 0.2144, + "step": 17790 + }, + { + "epoch": 52.81899109792285, + "grad_norm": 4.962776184082031, + "learning_rate": 4.7251114413075774e-08, + "loss": 0.0481, + "step": 17800 + }, + { + "epoch": 52.8486646884273, + "grad_norm": 7.785134792327881, + "learning_rate": 4.7221396731054976e-08, + "loss": 0.0539, + "step": 17810 + }, + { + "epoch": 52.87833827893175, + "grad_norm": 0.7961817383766174, + "learning_rate": 4.719167904903417e-08, + "loss": 0.1904, + "step": 17820 + }, + { + "epoch": 52.9080118694362, + "grad_norm": 1.23796546459198, + "learning_rate": 4.716196136701337e-08, + "loss": 0.1197, + "step": 17830 + }, + { + "epoch": 52.93768545994065, + "grad_norm": 22.088932037353516, + "learning_rate": 4.713224368499257e-08, + "loss": 0.1578, + "step": 17840 + }, + { + "epoch": 52.967359050445104, + "grad_norm": 0.34752142429351807, + "learning_rate": 4.7102526002971765e-08, + "loss": 0.0699, + "step": 17850 + }, + { + "epoch": 52.997032640949556, + "grad_norm": 0.24164463579654694, + "learning_rate": 4.707280832095096e-08, + "loss": 0.1885, + "step": 17860 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2150038480758667, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4259, + "eval_samples_per_second": 139.746, + "eval_steps_per_second": 17.585, + "step": 17861 + }, + { + "epoch": 53.02670623145401, + "grad_norm": 0.07548333704471588, + "learning_rate": 4.704309063893016e-08, + "loss": 0.1424, + "step": 17870 + }, + { + "epoch": 53.05637982195846, + "grad_norm": 1.8956226110458374, + "learning_rate": 4.701337295690936e-08, + "loss": 0.0858, + "step": 17880 + }, + { + "epoch": 53.08605341246291, + "grad_norm": 5.67612886428833, + "learning_rate": 4.6983655274888554e-08, + "loss": 0.0937, + "step": 17890 + }, + { + "epoch": 53.11572700296736, + "grad_norm": 18.611127853393555, + "learning_rate": 4.6953937592867756e-08, + "loss": 0.0886, + "step": 17900 + }, + { + "epoch": 53.14540059347181, + "grad_norm": 4.882875442504883, + "learning_rate": 4.692421991084695e-08, + "loss": 0.1246, + "step": 17910 + }, + { + "epoch": 53.175074183976264, + "grad_norm": 2.4630086421966553, + "learning_rate": 4.6894502228826147e-08, + "loss": 0.1196, + "step": 17920 + }, + { + "epoch": 53.204747774480715, + "grad_norm": 1.5022709369659424, + "learning_rate": 4.686478454680535e-08, + "loss": 0.1424, + "step": 17930 + }, + { + "epoch": 53.23442136498517, + "grad_norm": 2.6222424507141113, + "learning_rate": 4.6835066864784544e-08, + "loss": 0.1253, + "step": 17940 + }, + { + "epoch": 53.26409495548961, + "grad_norm": 24.638835906982422, + "learning_rate": 4.680534918276374e-08, + "loss": 0.1237, + "step": 17950 + }, + { + "epoch": 53.29376854599406, + "grad_norm": 15.805230140686035, + "learning_rate": 4.677563150074294e-08, + "loss": 0.1871, + "step": 17960 + }, + { + "epoch": 53.323442136498514, + "grad_norm": 0.09708958864212036, + "learning_rate": 4.674591381872214e-08, + "loss": 0.0679, + "step": 17970 + }, + { + "epoch": 53.353115727002965, + "grad_norm": 29.99150276184082, + "learning_rate": 4.671619613670133e-08, + "loss": 0.1009, + "step": 17980 + }, + { + "epoch": 53.38278931750742, + "grad_norm": 3.051279067993164, + "learning_rate": 4.6686478454680535e-08, + "loss": 0.1417, + "step": 17990 + }, + { + "epoch": 53.41246290801187, + "grad_norm": 0.09458828717470169, + "learning_rate": 4.665676077265973e-08, + "loss": 0.0557, + "step": 18000 + }, + { + "epoch": 53.44213649851632, + "grad_norm": 2.142488479614258, + "learning_rate": 4.6627043090638926e-08, + "loss": 0.0159, + "step": 18010 + }, + { + "epoch": 53.47181008902077, + "grad_norm": 27.72325325012207, + "learning_rate": 4.659732540861813e-08, + "loss": 0.0645, + "step": 18020 + }, + { + "epoch": 53.50148367952522, + "grad_norm": 6.143973350524902, + "learning_rate": 4.6567607726597324e-08, + "loss": 0.2689, + "step": 18030 + }, + { + "epoch": 53.531157270029674, + "grad_norm": 6.248129367828369, + "learning_rate": 4.653789004457652e-08, + "loss": 0.0869, + "step": 18040 + }, + { + "epoch": 53.560830860534125, + "grad_norm": 0.2991623878479004, + "learning_rate": 4.650817236255572e-08, + "loss": 0.0715, + "step": 18050 + }, + { + "epoch": 53.590504451038576, + "grad_norm": 0.13965006172657013, + "learning_rate": 4.647845468053492e-08, + "loss": 0.1029, + "step": 18060 + }, + { + "epoch": 53.62017804154303, + "grad_norm": 0.2683430016040802, + "learning_rate": 4.644873699851411e-08, + "loss": 0.1369, + "step": 18070 + }, + { + "epoch": 53.64985163204748, + "grad_norm": 0.30355045199394226, + "learning_rate": 4.6419019316493315e-08, + "loss": 0.1232, + "step": 18080 + }, + { + "epoch": 53.67952522255193, + "grad_norm": 0.45333707332611084, + "learning_rate": 4.638930163447251e-08, + "loss": 0.141, + "step": 18090 + }, + { + "epoch": 53.70919881305638, + "grad_norm": 10.719876289367676, + "learning_rate": 4.6359583952451706e-08, + "loss": 0.2275, + "step": 18100 + }, + { + "epoch": 53.73887240356083, + "grad_norm": 2.1315464973449707, + "learning_rate": 4.632986627043091e-08, + "loss": 0.0562, + "step": 18110 + }, + { + "epoch": 53.768545994065285, + "grad_norm": 2.096303701400757, + "learning_rate": 4.63001485884101e-08, + "loss": 0.0947, + "step": 18120 + }, + { + "epoch": 53.798219584569736, + "grad_norm": 12.821309089660645, + "learning_rate": 4.62704309063893e-08, + "loss": 0.0987, + "step": 18130 + }, + { + "epoch": 53.82789317507418, + "grad_norm": 0.026705903932452202, + "learning_rate": 4.62407132243685e-08, + "loss": 0.135, + "step": 18140 + }, + { + "epoch": 53.85756676557863, + "grad_norm": 33.73533248901367, + "learning_rate": 4.6210995542347696e-08, + "loss": 0.2725, + "step": 18150 + }, + { + "epoch": 53.88724035608308, + "grad_norm": 29.84590721130371, + "learning_rate": 4.618127786032689e-08, + "loss": 0.1764, + "step": 18160 + }, + { + "epoch": 53.916913946587535, + "grad_norm": 0.8282679915428162, + "learning_rate": 4.6151560178306094e-08, + "loss": 0.1331, + "step": 18170 + }, + { + "epoch": 53.946587537091986, + "grad_norm": 8.03003215789795, + "learning_rate": 4.612184249628529e-08, + "loss": 0.036, + "step": 18180 + }, + { + "epoch": 53.97626112759644, + "grad_norm": 2.3937315940856934, + "learning_rate": 4.6092124814264485e-08, + "loss": 0.1634, + "step": 18190 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21538078784942627, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.42, + "eval_samples_per_second": 139.875, + "eval_steps_per_second": 17.601, + "step": 18198 + }, + { + "epoch": 54.00593471810089, + "grad_norm": 0.13960596919059753, + "learning_rate": 4.606240713224368e-08, + "loss": 0.171, + "step": 18200 + }, + { + "epoch": 54.03560830860534, + "grad_norm": 22.90239715576172, + "learning_rate": 4.6032689450222876e-08, + "loss": 0.1737, + "step": 18210 + }, + { + "epoch": 54.06528189910979, + "grad_norm": 0.10312114655971527, + "learning_rate": 4.600297176820208e-08, + "loss": 0.1067, + "step": 18220 + }, + { + "epoch": 54.09495548961424, + "grad_norm": 1.474500060081482, + "learning_rate": 4.5973254086181274e-08, + "loss": 0.1308, + "step": 18230 + }, + { + "epoch": 54.124629080118694, + "grad_norm": 0.0846097320318222, + "learning_rate": 4.594353640416047e-08, + "loss": 0.0975, + "step": 18240 + }, + { + "epoch": 54.154302670623146, + "grad_norm": 0.35900041460990906, + "learning_rate": 4.591381872213967e-08, + "loss": 0.0284, + "step": 18250 + }, + { + "epoch": 54.1839762611276, + "grad_norm": 27.608943939208984, + "learning_rate": 4.588410104011887e-08, + "loss": 0.0835, + "step": 18260 + }, + { + "epoch": 54.21364985163205, + "grad_norm": 14.536050796508789, + "learning_rate": 4.585438335809806e-08, + "loss": 0.0584, + "step": 18270 + }, + { + "epoch": 54.2433234421365, + "grad_norm": 0.08200222253799438, + "learning_rate": 4.5824665676077264e-08, + "loss": 0.0253, + "step": 18280 + }, + { + "epoch": 54.27299703264095, + "grad_norm": 13.335010528564453, + "learning_rate": 4.579494799405646e-08, + "loss": 0.1016, + "step": 18290 + }, + { + "epoch": 54.3026706231454, + "grad_norm": 0.6454624533653259, + "learning_rate": 4.5765230312035655e-08, + "loss": 0.0614, + "step": 18300 + }, + { + "epoch": 54.332344213649854, + "grad_norm": 3.429720401763916, + "learning_rate": 4.573551263001486e-08, + "loss": 0.1066, + "step": 18310 + }, + { + "epoch": 54.362017804154306, + "grad_norm": 12.131242752075195, + "learning_rate": 4.570579494799405e-08, + "loss": 0.1803, + "step": 18320 + }, + { + "epoch": 54.39169139465876, + "grad_norm": 0.2834315299987793, + "learning_rate": 4.567607726597325e-08, + "loss": 0.0103, + "step": 18330 + }, + { + "epoch": 54.4213649851632, + "grad_norm": 0.2915608584880829, + "learning_rate": 4.564635958395245e-08, + "loss": 0.1319, + "step": 18340 + }, + { + "epoch": 54.45103857566765, + "grad_norm": 0.11542122811079025, + "learning_rate": 4.5616641901931646e-08, + "loss": 0.0776, + "step": 18350 + }, + { + "epoch": 54.480712166172104, + "grad_norm": 0.07382392138242722, + "learning_rate": 4.558692421991084e-08, + "loss": 0.1229, + "step": 18360 + }, + { + "epoch": 54.510385756676556, + "grad_norm": 0.06255319714546204, + "learning_rate": 4.5557206537890044e-08, + "loss": 0.1037, + "step": 18370 + }, + { + "epoch": 54.54005934718101, + "grad_norm": 13.259079933166504, + "learning_rate": 4.552748885586924e-08, + "loss": 0.0657, + "step": 18380 + }, + { + "epoch": 54.56973293768546, + "grad_norm": 50.039730072021484, + "learning_rate": 4.5497771173848435e-08, + "loss": 0.2461, + "step": 18390 + }, + { + "epoch": 54.59940652818991, + "grad_norm": 13.940142631530762, + "learning_rate": 4.546805349182764e-08, + "loss": 0.1769, + "step": 18400 + }, + { + "epoch": 54.62908011869436, + "grad_norm": 0.6205267310142517, + "learning_rate": 4.543833580980683e-08, + "loss": 0.1782, + "step": 18410 + }, + { + "epoch": 54.65875370919881, + "grad_norm": 23.032922744750977, + "learning_rate": 4.540861812778603e-08, + "loss": 0.1652, + "step": 18420 + }, + { + "epoch": 54.688427299703264, + "grad_norm": 0.25740522146224976, + "learning_rate": 4.537890044576523e-08, + "loss": 0.0434, + "step": 18430 + }, + { + "epoch": 54.718100890207715, + "grad_norm": 0.9557994604110718, + "learning_rate": 4.5349182763744426e-08, + "loss": 0.1289, + "step": 18440 + }, + { + "epoch": 54.74777448071217, + "grad_norm": 7.040013313293457, + "learning_rate": 4.531946508172362e-08, + "loss": 0.1888, + "step": 18450 + }, + { + "epoch": 54.77744807121662, + "grad_norm": 18.52097511291504, + "learning_rate": 4.528974739970282e-08, + "loss": 0.1082, + "step": 18460 + }, + { + "epoch": 54.80712166172107, + "grad_norm": 0.04265541583299637, + "learning_rate": 4.526002971768202e-08, + "loss": 0.1133, + "step": 18470 + }, + { + "epoch": 54.83679525222552, + "grad_norm": 0.43623632192611694, + "learning_rate": 4.5230312035661214e-08, + "loss": 0.0599, + "step": 18480 + }, + { + "epoch": 54.86646884272997, + "grad_norm": 28.51265525817871, + "learning_rate": 4.5200594353640416e-08, + "loss": 0.0992, + "step": 18490 + }, + { + "epoch": 54.896142433234424, + "grad_norm": 17.585397720336914, + "learning_rate": 4.517087667161961e-08, + "loss": 0.2832, + "step": 18500 + }, + { + "epoch": 54.925816023738875, + "grad_norm": 0.4795592725276947, + "learning_rate": 4.514115898959881e-08, + "loss": 0.1319, + "step": 18510 + }, + { + "epoch": 54.95548961424333, + "grad_norm": 0.8867962956428528, + "learning_rate": 4.511144130757801e-08, + "loss": 0.1686, + "step": 18520 + }, + { + "epoch": 54.98516320474778, + "grad_norm": 0.5381495952606201, + "learning_rate": 4.5081723625557205e-08, + "loss": 0.1468, + "step": 18530 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21615757048130035, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4343, + "eval_samples_per_second": 139.564, + "eval_steps_per_second": 17.562, + "step": 18535 + }, + { + "epoch": 55.01483679525222, + "grad_norm": 4.053667068481445, + "learning_rate": 4.50520059435364e-08, + "loss": 0.0584, + "step": 18540 + }, + { + "epoch": 55.04451038575667, + "grad_norm": 27.8851375579834, + "learning_rate": 4.50222882615156e-08, + "loss": 0.2037, + "step": 18550 + }, + { + "epoch": 55.074183976261125, + "grad_norm": 16.587261199951172, + "learning_rate": 4.49925705794948e-08, + "loss": 0.2317, + "step": 18560 + }, + { + "epoch": 55.103857566765576, + "grad_norm": 10.939718246459961, + "learning_rate": 4.4962852897473994e-08, + "loss": 0.0474, + "step": 18570 + }, + { + "epoch": 55.13353115727003, + "grad_norm": 3.5621824264526367, + "learning_rate": 4.4933135215453196e-08, + "loss": 0.1421, + "step": 18580 + }, + { + "epoch": 55.16320474777448, + "grad_norm": 0.2428486943244934, + "learning_rate": 4.490341753343239e-08, + "loss": 0.132, + "step": 18590 + }, + { + "epoch": 55.19287833827893, + "grad_norm": 1.3807475566864014, + "learning_rate": 4.487369985141159e-08, + "loss": 0.2114, + "step": 18600 + }, + { + "epoch": 55.22255192878338, + "grad_norm": 0.2000487595796585, + "learning_rate": 4.484398216939079e-08, + "loss": 0.1099, + "step": 18610 + }, + { + "epoch": 55.25222551928783, + "grad_norm": 0.191057488322258, + "learning_rate": 4.4814264487369984e-08, + "loss": 0.1291, + "step": 18620 + }, + { + "epoch": 55.281899109792285, + "grad_norm": 5.270522117614746, + "learning_rate": 4.478454680534918e-08, + "loss": 0.0919, + "step": 18630 + }, + { + "epoch": 55.311572700296736, + "grad_norm": 16.712888717651367, + "learning_rate": 4.475482912332838e-08, + "loss": 0.1312, + "step": 18640 + }, + { + "epoch": 55.34124629080119, + "grad_norm": 0.7915297746658325, + "learning_rate": 4.472511144130758e-08, + "loss": 0.091, + "step": 18650 + }, + { + "epoch": 55.37091988130564, + "grad_norm": 2.2512435913085938, + "learning_rate": 4.469539375928677e-08, + "loss": 0.1754, + "step": 18660 + }, + { + "epoch": 55.40059347181009, + "grad_norm": 0.17550450563430786, + "learning_rate": 4.4665676077265975e-08, + "loss": 0.1215, + "step": 18670 + }, + { + "epoch": 55.43026706231454, + "grad_norm": 37.675743103027344, + "learning_rate": 4.463595839524517e-08, + "loss": 0.0473, + "step": 18680 + }, + { + "epoch": 55.45994065281899, + "grad_norm": 15.365400314331055, + "learning_rate": 4.4606240713224366e-08, + "loss": 0.1299, + "step": 18690 + }, + { + "epoch": 55.489614243323444, + "grad_norm": 0.2100025713443756, + "learning_rate": 4.457652303120357e-08, + "loss": 0.117, + "step": 18700 + }, + { + "epoch": 55.519287833827896, + "grad_norm": 30.858510971069336, + "learning_rate": 4.4546805349182764e-08, + "loss": 0.1732, + "step": 18710 + }, + { + "epoch": 55.54896142433235, + "grad_norm": 18.448200225830078, + "learning_rate": 4.451708766716196e-08, + "loss": 0.2504, + "step": 18720 + }, + { + "epoch": 55.5786350148368, + "grad_norm": 0.09546982496976852, + "learning_rate": 4.4487369985141155e-08, + "loss": 0.161, + "step": 18730 + }, + { + "epoch": 55.60830860534124, + "grad_norm": 1.728231430053711, + "learning_rate": 4.445765230312035e-08, + "loss": 0.051, + "step": 18740 + }, + { + "epoch": 55.637982195845694, + "grad_norm": 0.07326488196849823, + "learning_rate": 4.442793462109955e-08, + "loss": 0.0408, + "step": 18750 + }, + { + "epoch": 55.667655786350146, + "grad_norm": 0.24781008064746857, + "learning_rate": 4.439821693907875e-08, + "loss": 0.2215, + "step": 18760 + }, + { + "epoch": 55.6973293768546, + "grad_norm": 0.8417479395866394, + "learning_rate": 4.4368499257057944e-08, + "loss": 0.0242, + "step": 18770 + }, + { + "epoch": 55.72700296735905, + "grad_norm": 0.14463327825069427, + "learning_rate": 4.4338781575037146e-08, + "loss": 0.063, + "step": 18780 + }, + { + "epoch": 55.7566765578635, + "grad_norm": 5.815578937530518, + "learning_rate": 4.430906389301634e-08, + "loss": 0.1054, + "step": 18790 + }, + { + "epoch": 55.78635014836795, + "grad_norm": 3.6503617763519287, + "learning_rate": 4.4279346210995537e-08, + "loss": 0.1137, + "step": 18800 + }, + { + "epoch": 55.8160237388724, + "grad_norm": 25.230243682861328, + "learning_rate": 4.424962852897474e-08, + "loss": 0.2474, + "step": 18810 + }, + { + "epoch": 55.845697329376854, + "grad_norm": 0.22377228736877441, + "learning_rate": 4.4219910846953934e-08, + "loss": 0.01, + "step": 18820 + }, + { + "epoch": 55.875370919881306, + "grad_norm": 0.23734180629253387, + "learning_rate": 4.419019316493313e-08, + "loss": 0.0428, + "step": 18830 + }, + { + "epoch": 55.90504451038576, + "grad_norm": 0.017383946105837822, + "learning_rate": 4.416047548291233e-08, + "loss": 0.1577, + "step": 18840 + }, + { + "epoch": 55.93471810089021, + "grad_norm": 0.5170205235481262, + "learning_rate": 4.413075780089153e-08, + "loss": 0.1281, + "step": 18850 + }, + { + "epoch": 55.96439169139466, + "grad_norm": 3.537140130996704, + "learning_rate": 4.410104011887072e-08, + "loss": 0.1219, + "step": 18860 + }, + { + "epoch": 55.99406528189911, + "grad_norm": 0.24714510142803192, + "learning_rate": 4.4071322436849925e-08, + "loss": 0.2018, + "step": 18870 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.21553164720535278, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4202, + "eval_samples_per_second": 139.871, + "eval_steps_per_second": 17.601, + "step": 18872 + }, + { + "epoch": 56.02373887240356, + "grad_norm": 24.41205596923828, + "learning_rate": 4.404160475482912e-08, + "loss": 0.2701, + "step": 18880 + }, + { + "epoch": 56.053412462908014, + "grad_norm": 5.279331684112549, + "learning_rate": 4.4011887072808316e-08, + "loss": 0.0254, + "step": 18890 + }, + { + "epoch": 56.083086053412465, + "grad_norm": 0.030180493369698524, + "learning_rate": 4.398216939078752e-08, + "loss": 0.0939, + "step": 18900 + }, + { + "epoch": 56.11275964391692, + "grad_norm": 1.1968932151794434, + "learning_rate": 4.3952451708766714e-08, + "loss": 0.097, + "step": 18910 + }, + { + "epoch": 56.14243323442137, + "grad_norm": 0.03116113319993019, + "learning_rate": 4.392273402674591e-08, + "loss": 0.0859, + "step": 18920 + }, + { + "epoch": 56.17210682492582, + "grad_norm": 1.0056349039077759, + "learning_rate": 4.389301634472511e-08, + "loss": 0.1741, + "step": 18930 + }, + { + "epoch": 56.201780415430264, + "grad_norm": 0.2703072130680084, + "learning_rate": 4.386329866270431e-08, + "loss": 0.0587, + "step": 18940 + }, + { + "epoch": 56.231454005934715, + "grad_norm": 0.11760912835597992, + "learning_rate": 4.38335809806835e-08, + "loss": 0.1446, + "step": 18950 + }, + { + "epoch": 56.26112759643917, + "grad_norm": 0.058933477848768234, + "learning_rate": 4.3803863298662705e-08, + "loss": 0.0586, + "step": 18960 + }, + { + "epoch": 56.29080118694362, + "grad_norm": 0.1020619198679924, + "learning_rate": 4.37741456166419e-08, + "loss": 0.045, + "step": 18970 + }, + { + "epoch": 56.32047477744807, + "grad_norm": 0.2952682077884674, + "learning_rate": 4.3744427934621096e-08, + "loss": 0.1317, + "step": 18980 + }, + { + "epoch": 56.35014836795252, + "grad_norm": 0.651990532875061, + "learning_rate": 4.37147102526003e-08, + "loss": 0.0722, + "step": 18990 + }, + { + "epoch": 56.37982195845697, + "grad_norm": 12.091405868530273, + "learning_rate": 4.368499257057949e-08, + "loss": 0.2299, + "step": 19000 + }, + { + "epoch": 56.409495548961424, + "grad_norm": 0.23879829049110413, + "learning_rate": 4.365527488855869e-08, + "loss": 0.1369, + "step": 19010 + }, + { + "epoch": 56.439169139465875, + "grad_norm": 33.00304412841797, + "learning_rate": 4.362555720653789e-08, + "loss": 0.1082, + "step": 19020 + }, + { + "epoch": 56.468842729970326, + "grad_norm": 21.59186553955078, + "learning_rate": 4.3595839524517086e-08, + "loss": 0.1509, + "step": 19030 + }, + { + "epoch": 56.49851632047478, + "grad_norm": 0.18806514143943787, + "learning_rate": 4.356612184249628e-08, + "loss": 0.128, + "step": 19040 + }, + { + "epoch": 56.52818991097923, + "grad_norm": 5.368435859680176, + "learning_rate": 4.3536404160475484e-08, + "loss": 0.158, + "step": 19050 + }, + { + "epoch": 56.55786350148368, + "grad_norm": 2.423006534576416, + "learning_rate": 4.350668647845468e-08, + "loss": 0.018, + "step": 19060 + }, + { + "epoch": 56.58753709198813, + "grad_norm": 0.16314983367919922, + "learning_rate": 4.3476968796433875e-08, + "loss": 0.0961, + "step": 19070 + }, + { + "epoch": 56.61721068249258, + "grad_norm": 2.302842855453491, + "learning_rate": 4.344725111441308e-08, + "loss": 0.1792, + "step": 19080 + }, + { + "epoch": 56.646884272997035, + "grad_norm": 2.807516574859619, + "learning_rate": 4.341753343239227e-08, + "loss": 0.1226, + "step": 19090 + }, + { + "epoch": 56.676557863501486, + "grad_norm": 1.3524049520492554, + "learning_rate": 4.338781575037147e-08, + "loss": 0.1865, + "step": 19100 + }, + { + "epoch": 56.70623145400594, + "grad_norm": 0.5241471529006958, + "learning_rate": 4.335809806835067e-08, + "loss": 0.0904, + "step": 19110 + }, + { + "epoch": 56.73590504451039, + "grad_norm": 14.16799259185791, + "learning_rate": 4.3328380386329866e-08, + "loss": 0.0343, + "step": 19120 + }, + { + "epoch": 56.76557863501483, + "grad_norm": 0.2682155966758728, + "learning_rate": 4.329866270430906e-08, + "loss": 0.1868, + "step": 19130 + }, + { + "epoch": 56.795252225519285, + "grad_norm": 6.699462413787842, + "learning_rate": 4.3268945022288263e-08, + "loss": 0.1058, + "step": 19140 + }, + { + "epoch": 56.824925816023736, + "grad_norm": 4.119089126586914, + "learning_rate": 4.323922734026746e-08, + "loss": 0.1334, + "step": 19150 + }, + { + "epoch": 56.85459940652819, + "grad_norm": 0.29346227645874023, + "learning_rate": 4.3209509658246654e-08, + "loss": 0.1291, + "step": 19160 + }, + { + "epoch": 56.88427299703264, + "grad_norm": 0.07669465988874435, + "learning_rate": 4.3179791976225856e-08, + "loss": 0.0238, + "step": 19170 + }, + { + "epoch": 56.91394658753709, + "grad_norm": 19.339195251464844, + "learning_rate": 4.315007429420505e-08, + "loss": 0.139, + "step": 19180 + }, + { + "epoch": 56.94362017804154, + "grad_norm": 0.1426158845424652, + "learning_rate": 4.312035661218425e-08, + "loss": 0.0246, + "step": 19190 + }, + { + "epoch": 56.97329376854599, + "grad_norm": 12.752077102661133, + "learning_rate": 4.309063893016345e-08, + "loss": 0.3176, + "step": 19200 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21504710614681244, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4109, + "eval_samples_per_second": 140.074, + "eval_steps_per_second": 17.626, + "step": 19209 + }, + { + "epoch": 57.002967359050444, + "grad_norm": 0.04527468979358673, + "learning_rate": 4.3060921248142645e-08, + "loss": 0.0639, + "step": 19210 + }, + { + "epoch": 57.032640949554896, + "grad_norm": 12.827802658081055, + "learning_rate": 4.303120356612184e-08, + "loss": 0.1, + "step": 19220 + }, + { + "epoch": 57.06231454005935, + "grad_norm": 4.508238315582275, + "learning_rate": 4.300148588410104e-08, + "loss": 0.2034, + "step": 19230 + }, + { + "epoch": 57.0919881305638, + "grad_norm": 0.08051464706659317, + "learning_rate": 4.297176820208024e-08, + "loss": 0.0893, + "step": 19240 + }, + { + "epoch": 57.12166172106825, + "grad_norm": 0.39993274211883545, + "learning_rate": 4.2942050520059434e-08, + "loss": 0.1393, + "step": 19250 + }, + { + "epoch": 57.1513353115727, + "grad_norm": 12.097809791564941, + "learning_rate": 4.291233283803863e-08, + "loss": 0.2275, + "step": 19260 + }, + { + "epoch": 57.18100890207715, + "grad_norm": 0.10356388986110687, + "learning_rate": 4.2882615156017825e-08, + "loss": 0.101, + "step": 19270 + }, + { + "epoch": 57.210682492581604, + "grad_norm": 0.7746573090553284, + "learning_rate": 4.285289747399703e-08, + "loss": 0.0846, + "step": 19280 + }, + { + "epoch": 57.240356083086056, + "grad_norm": 22.501604080200195, + "learning_rate": 4.282317979197622e-08, + "loss": 0.1893, + "step": 19290 + }, + { + "epoch": 57.27002967359051, + "grad_norm": 0.16493858397006989, + "learning_rate": 4.279346210995542e-08, + "loss": 0.0244, + "step": 19300 + }, + { + "epoch": 57.29970326409496, + "grad_norm": 0.2059623748064041, + "learning_rate": 4.276374442793462e-08, + "loss": 0.1279, + "step": 19310 + }, + { + "epoch": 57.32937685459941, + "grad_norm": 0.025211386382579803, + "learning_rate": 4.2734026745913816e-08, + "loss": 0.0701, + "step": 19320 + }, + { + "epoch": 57.359050445103854, + "grad_norm": 18.30111312866211, + "learning_rate": 4.270430906389301e-08, + "loss": 0.0349, + "step": 19330 + }, + { + "epoch": 57.388724035608305, + "grad_norm": 11.93643856048584, + "learning_rate": 4.267459138187221e-08, + "loss": 0.2896, + "step": 19340 + }, + { + "epoch": 57.41839762611276, + "grad_norm": 2.6392433643341064, + "learning_rate": 4.264487369985141e-08, + "loss": 0.2194, + "step": 19350 + }, + { + "epoch": 57.44807121661721, + "grad_norm": 21.00785255432129, + "learning_rate": 4.2615156017830604e-08, + "loss": 0.2009, + "step": 19360 + }, + { + "epoch": 57.47774480712166, + "grad_norm": 0.048111021518707275, + "learning_rate": 4.2585438335809806e-08, + "loss": 0.0336, + "step": 19370 + }, + { + "epoch": 57.50741839762611, + "grad_norm": 0.059449195861816406, + "learning_rate": 4.2555720653789e-08, + "loss": 0.0308, + "step": 19380 + }, + { + "epoch": 57.53709198813056, + "grad_norm": 0.13079652190208435, + "learning_rate": 4.25260029717682e-08, + "loss": 0.0715, + "step": 19390 + }, + { + "epoch": 57.566765578635014, + "grad_norm": 26.17479705810547, + "learning_rate": 4.24962852897474e-08, + "loss": 0.2065, + "step": 19400 + }, + { + "epoch": 57.596439169139465, + "grad_norm": 5.0062737464904785, + "learning_rate": 4.2466567607726595e-08, + "loss": 0.1282, + "step": 19410 + }, + { + "epoch": 57.62611275964392, + "grad_norm": 10.62472152709961, + "learning_rate": 4.243684992570579e-08, + "loss": 0.1356, + "step": 19420 + }, + { + "epoch": 57.65578635014837, + "grad_norm": 0.1455954611301422, + "learning_rate": 4.240713224368499e-08, + "loss": 0.0314, + "step": 19430 + }, + { + "epoch": 57.68545994065282, + "grad_norm": 0.2594520151615143, + "learning_rate": 4.237741456166419e-08, + "loss": 0.0587, + "step": 19440 + }, + { + "epoch": 57.71513353115727, + "grad_norm": 0.4185321629047394, + "learning_rate": 4.2347696879643384e-08, + "loss": 0.0874, + "step": 19450 + }, + { + "epoch": 57.74480712166172, + "grad_norm": 0.3749360740184784, + "learning_rate": 4.2317979197622586e-08, + "loss": 0.0767, + "step": 19460 + }, + { + "epoch": 57.774480712166174, + "grad_norm": 14.158487319946289, + "learning_rate": 4.228826151560178e-08, + "loss": 0.0679, + "step": 19470 + }, + { + "epoch": 57.804154302670625, + "grad_norm": 0.3825371265411377, + "learning_rate": 4.225854383358098e-08, + "loss": 0.0625, + "step": 19480 + }, + { + "epoch": 57.833827893175076, + "grad_norm": 27.611360549926758, + "learning_rate": 4.222882615156018e-08, + "loss": 0.0315, + "step": 19490 + }, + { + "epoch": 57.86350148367953, + "grad_norm": 20.072166442871094, + "learning_rate": 4.2199108469539374e-08, + "loss": 0.0678, + "step": 19500 + }, + { + "epoch": 57.89317507418398, + "grad_norm": 0.17713208496570587, + "learning_rate": 4.216939078751857e-08, + "loss": 0.1741, + "step": 19510 + }, + { + "epoch": 57.92284866468843, + "grad_norm": 25.875642776489258, + "learning_rate": 4.213967310549777e-08, + "loss": 0.1492, + "step": 19520 + }, + { + "epoch": 57.952522255192875, + "grad_norm": 0.0641832947731018, + "learning_rate": 4.210995542347697e-08, + "loss": 0.0451, + "step": 19530 + }, + { + "epoch": 57.982195845697326, + "grad_norm": 25.249614715576172, + "learning_rate": 4.208023774145616e-08, + "loss": 0.1373, + "step": 19540 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.9398663697104677, + "eval_loss": 0.21301446855068207, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4095, + "eval_samples_per_second": 140.105, + "eval_steps_per_second": 17.63, + "step": 19546 + }, + { + "epoch": 58.01186943620178, + "grad_norm": 0.3826630413532257, + "learning_rate": 4.2050520059435365e-08, + "loss": 0.0899, + "step": 19550 + }, + { + "epoch": 58.04154302670623, + "grad_norm": 1.6142431497573853, + "learning_rate": 4.202080237741456e-08, + "loss": 0.263, + "step": 19560 + }, + { + "epoch": 58.07121661721068, + "grad_norm": 2.775526285171509, + "learning_rate": 4.1991084695393756e-08, + "loss": 0.0608, + "step": 19570 + }, + { + "epoch": 58.10089020771513, + "grad_norm": 1.3405287265777588, + "learning_rate": 4.196136701337296e-08, + "loss": 0.1225, + "step": 19580 + }, + { + "epoch": 58.13056379821958, + "grad_norm": 4.504879474639893, + "learning_rate": 4.1931649331352154e-08, + "loss": 0.0606, + "step": 19590 + }, + { + "epoch": 58.160237388724035, + "grad_norm": 10.010052680969238, + "learning_rate": 4.190193164933135e-08, + "loss": 0.0986, + "step": 19600 + }, + { + "epoch": 58.189910979228486, + "grad_norm": 2.0360288619995117, + "learning_rate": 4.187221396731055e-08, + "loss": 0.075, + "step": 19610 + }, + { + "epoch": 58.21958456973294, + "grad_norm": 0.24835920333862305, + "learning_rate": 4.184249628528975e-08, + "loss": 0.0538, + "step": 19620 + }, + { + "epoch": 58.24925816023739, + "grad_norm": 1.3923277854919434, + "learning_rate": 4.181277860326894e-08, + "loss": 0.0845, + "step": 19630 + }, + { + "epoch": 58.27893175074184, + "grad_norm": 24.412200927734375, + "learning_rate": 4.1783060921248145e-08, + "loss": 0.1511, + "step": 19640 + }, + { + "epoch": 58.30860534124629, + "grad_norm": 0.04272349178791046, + "learning_rate": 4.175334323922734e-08, + "loss": 0.1517, + "step": 19650 + }, + { + "epoch": 58.33827893175074, + "grad_norm": 0.20252041518688202, + "learning_rate": 4.1723625557206536e-08, + "loss": 0.207, + "step": 19660 + }, + { + "epoch": 58.367952522255194, + "grad_norm": 0.6975942850112915, + "learning_rate": 4.169390787518574e-08, + "loss": 0.1652, + "step": 19670 + }, + { + "epoch": 58.397626112759646, + "grad_norm": 10.065960884094238, + "learning_rate": 4.166419019316493e-08, + "loss": 0.1253, + "step": 19680 + }, + { + "epoch": 58.4272997032641, + "grad_norm": 26.717851638793945, + "learning_rate": 4.163447251114413e-08, + "loss": 0.0877, + "step": 19690 + }, + { + "epoch": 58.45697329376855, + "grad_norm": 0.69111168384552, + "learning_rate": 4.160475482912333e-08, + "loss": 0.157, + "step": 19700 + }, + { + "epoch": 58.486646884273, + "grad_norm": 2.4029974937438965, + "learning_rate": 4.1575037147102526e-08, + "loss": 0.0248, + "step": 19710 + }, + { + "epoch": 58.51632047477745, + "grad_norm": 5.435256004333496, + "learning_rate": 4.154531946508172e-08, + "loss": 0.1422, + "step": 19720 + }, + { + "epoch": 58.545994065281896, + "grad_norm": 2.009157657623291, + "learning_rate": 4.1515601783060924e-08, + "loss": 0.057, + "step": 19730 + }, + { + "epoch": 58.57566765578635, + "grad_norm": 1.0079905986785889, + "learning_rate": 4.148588410104012e-08, + "loss": 0.1423, + "step": 19740 + }, + { + "epoch": 58.6053412462908, + "grad_norm": 1.02312433719635, + "learning_rate": 4.1456166419019315e-08, + "loss": 0.04, + "step": 19750 + }, + { + "epoch": 58.63501483679525, + "grad_norm": 0.10257770866155624, + "learning_rate": 4.142644873699852e-08, + "loss": 0.0714, + "step": 19760 + }, + { + "epoch": 58.6646884272997, + "grad_norm": 12.237330436706543, + "learning_rate": 4.1396731054977706e-08, + "loss": 0.0149, + "step": 19770 + }, + { + "epoch": 58.69436201780415, + "grad_norm": 5.213571071624756, + "learning_rate": 4.13670133729569e-08, + "loss": 0.1742, + "step": 19780 + }, + { + "epoch": 58.724035608308604, + "grad_norm": 1.6978023052215576, + "learning_rate": 4.1337295690936104e-08, + "loss": 0.0598, + "step": 19790 + }, + { + "epoch": 58.753709198813056, + "grad_norm": 0.2049713432788849, + "learning_rate": 4.13075780089153e-08, + "loss": 0.2522, + "step": 19800 + }, + { + "epoch": 58.78338278931751, + "grad_norm": 0.7940706610679626, + "learning_rate": 4.1277860326894495e-08, + "loss": 0.2081, + "step": 19810 + }, + { + "epoch": 58.81305637982196, + "grad_norm": 0.10785741358995438, + "learning_rate": 4.12481426448737e-08, + "loss": 0.2196, + "step": 19820 + }, + { + "epoch": 58.84272997032641, + "grad_norm": 0.7244274616241455, + "learning_rate": 4.121842496285289e-08, + "loss": 0.1918, + "step": 19830 + }, + { + "epoch": 58.87240356083086, + "grad_norm": 0.48458001017570496, + "learning_rate": 4.1188707280832094e-08, + "loss": 0.0863, + "step": 19840 + }, + { + "epoch": 58.90207715133531, + "grad_norm": 0.4201776087284088, + "learning_rate": 4.115898959881129e-08, + "loss": 0.1216, + "step": 19850 + }, + { + "epoch": 58.931750741839764, + "grad_norm": 18.671785354614258, + "learning_rate": 4.1129271916790486e-08, + "loss": 0.1874, + "step": 19860 + }, + { + "epoch": 58.961424332344215, + "grad_norm": 1.2378088235855103, + "learning_rate": 4.109955423476969e-08, + "loss": 0.149, + "step": 19870 + }, + { + "epoch": 58.99109792284867, + "grad_norm": 11.572568893432617, + "learning_rate": 4.106983655274888e-08, + "loss": 0.2037, + "step": 19880 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9398663697104677, + "eval_loss": 0.2139567732810974, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3876, + "eval_samples_per_second": 140.584, + "eval_steps_per_second": 17.69, + "step": 19883 + }, + { + "epoch": 59.02077151335312, + "grad_norm": 0.3536854088306427, + "learning_rate": 4.104011887072808e-08, + "loss": 0.0166, + "step": 19890 + }, + { + "epoch": 59.05044510385757, + "grad_norm": 5.377005100250244, + "learning_rate": 4.101040118870728e-08, + "loss": 0.0914, + "step": 19900 + }, + { + "epoch": 59.08011869436202, + "grad_norm": 4.067911624908447, + "learning_rate": 4.0980683506686476e-08, + "loss": 0.0521, + "step": 19910 + }, + { + "epoch": 59.10979228486647, + "grad_norm": 6.344936847686768, + "learning_rate": 4.095096582466567e-08, + "loss": 0.0842, + "step": 19920 + }, + { + "epoch": 59.13946587537092, + "grad_norm": 8.9869966506958, + "learning_rate": 4.0921248142644874e-08, + "loss": 0.134, + "step": 19930 + }, + { + "epoch": 59.16913946587537, + "grad_norm": 0.5296763777732849, + "learning_rate": 4.089153046062407e-08, + "loss": 0.1071, + "step": 19940 + }, + { + "epoch": 59.19881305637982, + "grad_norm": 13.775887489318848, + "learning_rate": 4.0861812778603265e-08, + "loss": 0.1345, + "step": 19950 + }, + { + "epoch": 59.22848664688427, + "grad_norm": 0.07188119739294052, + "learning_rate": 4.083209509658247e-08, + "loss": 0.1513, + "step": 19960 + }, + { + "epoch": 59.25816023738872, + "grad_norm": 21.98750877380371, + "learning_rate": 4.080237741456166e-08, + "loss": 0.0635, + "step": 19970 + }, + { + "epoch": 59.287833827893174, + "grad_norm": 0.1115342304110527, + "learning_rate": 4.077265973254086e-08, + "loss": 0.1365, + "step": 19980 + }, + { + "epoch": 59.317507418397625, + "grad_norm": 0.11834222823381424, + "learning_rate": 4.074294205052006e-08, + "loss": 0.0938, + "step": 19990 + }, + { + "epoch": 59.347181008902076, + "grad_norm": 0.35669243335723877, + "learning_rate": 4.0713224368499256e-08, + "loss": 0.1722, + "step": 20000 + }, + { + "epoch": 59.37685459940653, + "grad_norm": 0.014322211034595966, + "learning_rate": 4.068350668647845e-08, + "loss": 0.2342, + "step": 20010 + }, + { + "epoch": 59.40652818991098, + "grad_norm": 0.29262566566467285, + "learning_rate": 4.0653789004457653e-08, + "loss": 0.1114, + "step": 20020 + }, + { + "epoch": 59.43620178041543, + "grad_norm": 5.646292209625244, + "learning_rate": 4.062407132243685e-08, + "loss": 0.1513, + "step": 20030 + }, + { + "epoch": 59.46587537091988, + "grad_norm": 0.8104656934738159, + "learning_rate": 4.0594353640416044e-08, + "loss": 0.1358, + "step": 20040 + }, + { + "epoch": 59.49554896142433, + "grad_norm": 0.7047475576400757, + "learning_rate": 4.0564635958395246e-08, + "loss": 0.1334, + "step": 20050 + }, + { + "epoch": 59.525222551928785, + "grad_norm": 9.53100299835205, + "learning_rate": 4.053491827637444e-08, + "loss": 0.0355, + "step": 20060 + }, + { + "epoch": 59.554896142433236, + "grad_norm": 1.5414891242980957, + "learning_rate": 4.050520059435364e-08, + "loss": 0.1104, + "step": 20070 + }, + { + "epoch": 59.58456973293769, + "grad_norm": 0.34475046396255493, + "learning_rate": 4.047548291233284e-08, + "loss": 0.0431, + "step": 20080 + }, + { + "epoch": 59.61424332344214, + "grad_norm": 0.05810406059026718, + "learning_rate": 4.0445765230312035e-08, + "loss": 0.1927, + "step": 20090 + }, + { + "epoch": 59.64391691394659, + "grad_norm": 6.237088203430176, + "learning_rate": 4.041604754829123e-08, + "loss": 0.0651, + "step": 20100 + }, + { + "epoch": 59.67359050445104, + "grad_norm": 0.0516524501144886, + "learning_rate": 4.038632986627043e-08, + "loss": 0.2198, + "step": 20110 + }, + { + "epoch": 59.703264094955486, + "grad_norm": 0.3602466285228729, + "learning_rate": 4.035661218424963e-08, + "loss": 0.2398, + "step": 20120 + }, + { + "epoch": 59.73293768545994, + "grad_norm": 4.058080673217773, + "learning_rate": 4.0326894502228824e-08, + "loss": 0.0533, + "step": 20130 + }, + { + "epoch": 59.76261127596439, + "grad_norm": 19.936752319335938, + "learning_rate": 4.0297176820208026e-08, + "loss": 0.1758, + "step": 20140 + }, + { + "epoch": 59.79228486646884, + "grad_norm": 0.059924885630607605, + "learning_rate": 4.026745913818722e-08, + "loss": 0.043, + "step": 20150 + }, + { + "epoch": 59.82195845697329, + "grad_norm": 0.06912856549024582, + "learning_rate": 4.023774145616642e-08, + "loss": 0.1434, + "step": 20160 + }, + { + "epoch": 59.85163204747774, + "grad_norm": 0.18286217749118805, + "learning_rate": 4.020802377414562e-08, + "loss": 0.1113, + "step": 20170 + }, + { + "epoch": 59.881305637982194, + "grad_norm": 1.1850963830947876, + "learning_rate": 4.0178306092124815e-08, + "loss": 0.2239, + "step": 20180 + }, + { + "epoch": 59.910979228486646, + "grad_norm": 0.08885449171066284, + "learning_rate": 4.014858841010401e-08, + "loss": 0.0939, + "step": 20190 + }, + { + "epoch": 59.9406528189911, + "grad_norm": 18.67483901977539, + "learning_rate": 4.011887072808321e-08, + "loss": 0.0823, + "step": 20200 + }, + { + "epoch": 59.97032640949555, + "grad_norm": 0.7483816146850586, + "learning_rate": 4.008915304606241e-08, + "loss": 0.0188, + "step": 20210 + }, + { + "epoch": 60.0, + "grad_norm": 0.039210036396980286, + "learning_rate": 4.00594353640416e-08, + "loss": 0.0822, + "step": 20220 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21453207731246948, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3857, + "eval_samples_per_second": 140.626, + "eval_steps_per_second": 17.696, + "step": 20220 + }, + { + "epoch": 60.02967359050445, + "grad_norm": 0.5022128820419312, + "learning_rate": 4.0029717682020805e-08, + "loss": 0.0299, + "step": 20230 + }, + { + "epoch": 60.0593471810089, + "grad_norm": 16.38877296447754, + "learning_rate": 4e-08, + "loss": 0.1439, + "step": 20240 + }, + { + "epoch": 60.089020771513354, + "grad_norm": 0.0816325768828392, + "learning_rate": 3.9970282317979196e-08, + "loss": 0.1972, + "step": 20250 + }, + { + "epoch": 60.118694362017806, + "grad_norm": 0.6087883710861206, + "learning_rate": 3.99405646359584e-08, + "loss": 0.035, + "step": 20260 + }, + { + "epoch": 60.14836795252226, + "grad_norm": 3.8838796615600586, + "learning_rate": 3.9910846953937594e-08, + "loss": 0.2333, + "step": 20270 + }, + { + "epoch": 60.17804154302671, + "grad_norm": 0.01618698425590992, + "learning_rate": 3.988112927191679e-08, + "loss": 0.1552, + "step": 20280 + }, + { + "epoch": 60.20771513353116, + "grad_norm": 0.1347048133611679, + "learning_rate": 3.985141158989599e-08, + "loss": 0.2897, + "step": 20290 + }, + { + "epoch": 60.23738872403561, + "grad_norm": 0.7319648265838623, + "learning_rate": 3.982169390787518e-08, + "loss": 0.2274, + "step": 20300 + }, + { + "epoch": 60.26706231454006, + "grad_norm": 1.2421296834945679, + "learning_rate": 3.9791976225854376e-08, + "loss": 0.1519, + "step": 20310 + }, + { + "epoch": 60.29673590504451, + "grad_norm": 0.2309073507785797, + "learning_rate": 3.976225854383358e-08, + "loss": 0.0694, + "step": 20320 + }, + { + "epoch": 60.32640949554896, + "grad_norm": 4.153825759887695, + "learning_rate": 3.9732540861812774e-08, + "loss": 0.1835, + "step": 20330 + }, + { + "epoch": 60.35608308605341, + "grad_norm": 0.31095850467681885, + "learning_rate": 3.970282317979197e-08, + "loss": 0.0158, + "step": 20340 + }, + { + "epoch": 60.38575667655786, + "grad_norm": 0.046984415501356125, + "learning_rate": 3.967310549777117e-08, + "loss": 0.1715, + "step": 20350 + }, + { + "epoch": 60.41543026706231, + "grad_norm": 1.3429558277130127, + "learning_rate": 3.964338781575037e-08, + "loss": 0.1558, + "step": 20360 + }, + { + "epoch": 60.445103857566764, + "grad_norm": 13.21709156036377, + "learning_rate": 3.961367013372956e-08, + "loss": 0.0281, + "step": 20370 + }, + { + "epoch": 60.474777448071215, + "grad_norm": 1.4754674434661865, + "learning_rate": 3.9583952451708764e-08, + "loss": 0.0489, + "step": 20380 + }, + { + "epoch": 60.50445103857567, + "grad_norm": 22.908586502075195, + "learning_rate": 3.955423476968796e-08, + "loss": 0.1369, + "step": 20390 + }, + { + "epoch": 60.53412462908012, + "grad_norm": 5.819228649139404, + "learning_rate": 3.9524517087667155e-08, + "loss": 0.0601, + "step": 20400 + }, + { + "epoch": 60.56379821958457, + "grad_norm": 27.98056411743164, + "learning_rate": 3.949479940564636e-08, + "loss": 0.0567, + "step": 20410 + }, + { + "epoch": 60.59347181008902, + "grad_norm": 2.5692005157470703, + "learning_rate": 3.946508172362555e-08, + "loss": 0.1734, + "step": 20420 + }, + { + "epoch": 60.62314540059347, + "grad_norm": 0.13508091866970062, + "learning_rate": 3.943536404160475e-08, + "loss": 0.0684, + "step": 20430 + }, + { + "epoch": 60.652818991097924, + "grad_norm": 0.06365261226892471, + "learning_rate": 3.940564635958395e-08, + "loss": 0.1092, + "step": 20440 + }, + { + "epoch": 60.682492581602375, + "grad_norm": 23.340839385986328, + "learning_rate": 3.9375928677563146e-08, + "loss": 0.177, + "step": 20450 + }, + { + "epoch": 60.712166172106826, + "grad_norm": 2.5604941844940186, + "learning_rate": 3.934621099554234e-08, + "loss": 0.0612, + "step": 20460 + }, + { + "epoch": 60.74183976261128, + "grad_norm": 7.3809814453125, + "learning_rate": 3.9316493313521544e-08, + "loss": 0.116, + "step": 20470 + }, + { + "epoch": 60.77151335311573, + "grad_norm": 33.01871109008789, + "learning_rate": 3.928677563150074e-08, + "loss": 0.1278, + "step": 20480 + }, + { + "epoch": 60.80118694362018, + "grad_norm": 11.429134368896484, + "learning_rate": 3.9257057949479935e-08, + "loss": 0.1549, + "step": 20490 + }, + { + "epoch": 60.83086053412463, + "grad_norm": 11.244832038879395, + "learning_rate": 3.922734026745914e-08, + "loss": 0.1763, + "step": 20500 + }, + { + "epoch": 60.86053412462908, + "grad_norm": 2.079745054244995, + "learning_rate": 3.919762258543833e-08, + "loss": 0.0119, + "step": 20510 + }, + { + "epoch": 60.89020771513353, + "grad_norm": 24.433578491210938, + "learning_rate": 3.916790490341753e-08, + "loss": 0.111, + "step": 20520 + }, + { + "epoch": 60.91988130563798, + "grad_norm": 0.08574667572975159, + "learning_rate": 3.913818722139673e-08, + "loss": 0.0683, + "step": 20530 + }, + { + "epoch": 60.94955489614243, + "grad_norm": 0.04563106968998909, + "learning_rate": 3.9108469539375926e-08, + "loss": 0.0802, + "step": 20540 + }, + { + "epoch": 60.97922848664688, + "grad_norm": 18.50768280029297, + "learning_rate": 3.907875185735513e-08, + "loss": 0.0389, + "step": 20550 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.9387527839643652, + "eval_loss": 0.21462009847164154, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3954, + "eval_samples_per_second": 140.413, + "eval_steps_per_second": 17.669, + "step": 20557 + }, + { + "epoch": 61.00890207715133, + "grad_norm": 0.3333418071269989, + "learning_rate": 3.904903417533432e-08, + "loss": 0.0423, + "step": 20560 + }, + { + "epoch": 61.038575667655785, + "grad_norm": 1.3838636875152588, + "learning_rate": 3.901931649331352e-08, + "loss": 0.1328, + "step": 20570 + }, + { + "epoch": 61.068249258160236, + "grad_norm": 0.43667104840278625, + "learning_rate": 3.898959881129272e-08, + "loss": 0.2557, + "step": 20580 + }, + { + "epoch": 61.09792284866469, + "grad_norm": 24.14552879333496, + "learning_rate": 3.8959881129271916e-08, + "loss": 0.2242, + "step": 20590 + }, + { + "epoch": 61.12759643916914, + "grad_norm": 0.38586464524269104, + "learning_rate": 3.893016344725111e-08, + "loss": 0.0381, + "step": 20600 + }, + { + "epoch": 61.15727002967359, + "grad_norm": 2.3139126300811768, + "learning_rate": 3.8900445765230314e-08, + "loss": 0.2121, + "step": 20610 + }, + { + "epoch": 61.18694362017804, + "grad_norm": 5.977110862731934, + "learning_rate": 3.887072808320951e-08, + "loss": 0.1487, + "step": 20620 + }, + { + "epoch": 61.21661721068249, + "grad_norm": 0.040202606469392776, + "learning_rate": 3.8841010401188705e-08, + "loss": 0.0848, + "step": 20630 + }, + { + "epoch": 61.246290801186944, + "grad_norm": 3.395541191101074, + "learning_rate": 3.881129271916791e-08, + "loss": 0.0429, + "step": 20640 + }, + { + "epoch": 61.275964391691396, + "grad_norm": 0.0597139447927475, + "learning_rate": 3.87815750371471e-08, + "loss": 0.1212, + "step": 20650 + }, + { + "epoch": 61.30563798219585, + "grad_norm": 25.174171447753906, + "learning_rate": 3.87518573551263e-08, + "loss": 0.1302, + "step": 20660 + }, + { + "epoch": 61.3353115727003, + "grad_norm": 0.15728041529655457, + "learning_rate": 3.87221396731055e-08, + "loss": 0.0415, + "step": 20670 + }, + { + "epoch": 61.36498516320475, + "grad_norm": 0.9784430265426636, + "learning_rate": 3.8692421991084696e-08, + "loss": 0.2228, + "step": 20680 + }, + { + "epoch": 61.3946587537092, + "grad_norm": 0.02506338059902191, + "learning_rate": 3.866270430906389e-08, + "loss": 0.107, + "step": 20690 + }, + { + "epoch": 61.42433234421365, + "grad_norm": 0.07302246987819672, + "learning_rate": 3.8632986627043093e-08, + "loss": 0.1249, + "step": 20700 + }, + { + "epoch": 61.454005934718104, + "grad_norm": 0.11709848791360855, + "learning_rate": 3.860326894502229e-08, + "loss": 0.0838, + "step": 20710 + }, + { + "epoch": 61.48367952522255, + "grad_norm": 0.5479147434234619, + "learning_rate": 3.8573551263001484e-08, + "loss": 0.0653, + "step": 20720 + }, + { + "epoch": 61.513353115727, + "grad_norm": 0.5872834920883179, + "learning_rate": 3.8543833580980687e-08, + "loss": 0.0301, + "step": 20730 + }, + { + "epoch": 61.54302670623145, + "grad_norm": 3.723036289215088, + "learning_rate": 3.851411589895988e-08, + "loss": 0.1696, + "step": 20740 + }, + { + "epoch": 61.5727002967359, + "grad_norm": 2.144200563430786, + "learning_rate": 3.848439821693908e-08, + "loss": 0.2464, + "step": 20750 + }, + { + "epoch": 61.602373887240354, + "grad_norm": 2.4528162479400635, + "learning_rate": 3.845468053491828e-08, + "loss": 0.0996, + "step": 20760 + }, + { + "epoch": 61.632047477744806, + "grad_norm": 1.8446149826049805, + "learning_rate": 3.8424962852897475e-08, + "loss": 0.0844, + "step": 20770 + }, + { + "epoch": 61.66172106824926, + "grad_norm": 27.3325138092041, + "learning_rate": 3.839524517087667e-08, + "loss": 0.1631, + "step": 20780 + }, + { + "epoch": 61.69139465875371, + "grad_norm": 0.7625749707221985, + "learning_rate": 3.836552748885587e-08, + "loss": 0.0896, + "step": 20790 + }, + { + "epoch": 61.72106824925816, + "grad_norm": 0.05664860084652901, + "learning_rate": 3.833580980683507e-08, + "loss": 0.0798, + "step": 20800 + }, + { + "epoch": 61.75074183976261, + "grad_norm": 0.7609520554542542, + "learning_rate": 3.8306092124814264e-08, + "loss": 0.0701, + "step": 20810 + }, + { + "epoch": 61.78041543026706, + "grad_norm": 1.9650874137878418, + "learning_rate": 3.827637444279346e-08, + "loss": 0.0281, + "step": 20820 + }, + { + "epoch": 61.810089020771514, + "grad_norm": 0.1229434683918953, + "learning_rate": 3.8246656760772655e-08, + "loss": 0.2993, + "step": 20830 + }, + { + "epoch": 61.839762611275965, + "grad_norm": 4.57073974609375, + "learning_rate": 3.821693907875185e-08, + "loss": 0.0691, + "step": 20840 + }, + { + "epoch": 61.86943620178042, + "grad_norm": 1.6015318632125854, + "learning_rate": 3.818722139673105e-08, + "loss": 0.1052, + "step": 20850 + }, + { + "epoch": 61.89910979228487, + "grad_norm": 0.029950298368930817, + "learning_rate": 3.815750371471025e-08, + "loss": 0.1078, + "step": 20860 + }, + { + "epoch": 61.92878338278932, + "grad_norm": 0.04366104677319527, + "learning_rate": 3.8127786032689444e-08, + "loss": 0.1103, + "step": 20870 + }, + { + "epoch": 61.95845697329377, + "grad_norm": 3.8062872886657715, + "learning_rate": 3.8098068350668646e-08, + "loss": 0.1269, + "step": 20880 + }, + { + "epoch": 61.98813056379822, + "grad_norm": 29.268861770629883, + "learning_rate": 3.806835066864784e-08, + "loss": 0.0284, + "step": 20890 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.9354120267260579, + "eval_loss": 0.21583662927150726, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3809, + "eval_samples_per_second": 140.733, + "eval_steps_per_second": 17.709, + "step": 20894 + }, + { + "epoch": 62.017804154302674, + "grad_norm": 3.6597909927368164, + "learning_rate": 3.803863298662704e-08, + "loss": 0.0432, + "step": 20900 + }, + { + "epoch": 62.047477744807125, + "grad_norm": 0.648855447769165, + "learning_rate": 3.800891530460624e-08, + "loss": 0.3126, + "step": 20910 + }, + { + "epoch": 62.07715133531157, + "grad_norm": 16.089683532714844, + "learning_rate": 3.7979197622585434e-08, + "loss": 0.2077, + "step": 20920 + }, + { + "epoch": 62.10682492581602, + "grad_norm": 0.09567385911941528, + "learning_rate": 3.794947994056463e-08, + "loss": 0.0608, + "step": 20930 + }, + { + "epoch": 62.13649851632047, + "grad_norm": 0.7036837935447693, + "learning_rate": 3.791976225854383e-08, + "loss": 0.0897, + "step": 20940 + }, + { + "epoch": 62.166172106824924, + "grad_norm": 29.827733993530273, + "learning_rate": 3.789004457652303e-08, + "loss": 0.1248, + "step": 20950 + }, + { + "epoch": 62.195845697329375, + "grad_norm": 3.2616679668426514, + "learning_rate": 3.786032689450222e-08, + "loss": 0.0548, + "step": 20960 + }, + { + "epoch": 62.225519287833826, + "grad_norm": 5.594547748565674, + "learning_rate": 3.7830609212481425e-08, + "loss": 0.0812, + "step": 20970 + }, + { + "epoch": 62.25519287833828, + "grad_norm": 0.4343978464603424, + "learning_rate": 3.780089153046062e-08, + "loss": 0.0569, + "step": 20980 + }, + { + "epoch": 62.28486646884273, + "grad_norm": 19.991840362548828, + "learning_rate": 3.7771173848439816e-08, + "loss": 0.2759, + "step": 20990 + }, + { + "epoch": 62.31454005934718, + "grad_norm": 6.663703441619873, + "learning_rate": 3.774145616641902e-08, + "loss": 0.194, + "step": 21000 + }, + { + "epoch": 62.34421364985163, + "grad_norm": 0.9615452885627747, + "learning_rate": 3.7711738484398214e-08, + "loss": 0.1067, + "step": 21010 + }, + { + "epoch": 62.37388724035608, + "grad_norm": 25.811023712158203, + "learning_rate": 3.768202080237741e-08, + "loss": 0.1279, + "step": 21020 + }, + { + "epoch": 62.403560830860535, + "grad_norm": 26.204620361328125, + "learning_rate": 3.765230312035661e-08, + "loss": 0.2478, + "step": 21030 + }, + { + "epoch": 62.433234421364986, + "grad_norm": 0.037931546568870544, + "learning_rate": 3.762258543833581e-08, + "loss": 0.0234, + "step": 21040 + }, + { + "epoch": 62.46290801186944, + "grad_norm": 7.701491355895996, + "learning_rate": 3.7592867756315e-08, + "loss": 0.1223, + "step": 21050 + }, + { + "epoch": 62.49258160237389, + "grad_norm": 0.19550606608390808, + "learning_rate": 3.7563150074294205e-08, + "loss": 0.0234, + "step": 21060 + }, + { + "epoch": 62.52225519287834, + "grad_norm": 0.4739478528499603, + "learning_rate": 3.75334323922734e-08, + "loss": 0.0766, + "step": 21070 + }, + { + "epoch": 62.55192878338279, + "grad_norm": 14.261784553527832, + "learning_rate": 3.7503714710252596e-08, + "loss": 0.0926, + "step": 21080 + }, + { + "epoch": 62.58160237388724, + "grad_norm": 7.327427387237549, + "learning_rate": 3.74739970282318e-08, + "loss": 0.1849, + "step": 21090 + }, + { + "epoch": 62.611275964391695, + "grad_norm": 1.1986632347106934, + "learning_rate": 3.744427934621099e-08, + "loss": 0.0696, + "step": 21100 + }, + { + "epoch": 62.640949554896146, + "grad_norm": 7.291756629943848, + "learning_rate": 3.741456166419019e-08, + "loss": 0.0258, + "step": 21110 + }, + { + "epoch": 62.67062314540059, + "grad_norm": 36.29313659667969, + "learning_rate": 3.738484398216939e-08, + "loss": 0.073, + "step": 21120 + }, + { + "epoch": 62.70029673590504, + "grad_norm": 0.034853823482990265, + "learning_rate": 3.7355126300148586e-08, + "loss": 0.0496, + "step": 21130 + }, + { + "epoch": 62.72997032640949, + "grad_norm": 0.11678212881088257, + "learning_rate": 3.732540861812778e-08, + "loss": 0.0938, + "step": 21140 + }, + { + "epoch": 62.759643916913944, + "grad_norm": 0.08672474324703217, + "learning_rate": 3.7295690936106984e-08, + "loss": 0.0537, + "step": 21150 + }, + { + "epoch": 62.789317507418396, + "grad_norm": 19.631711959838867, + "learning_rate": 3.726597325408618e-08, + "loss": 0.0243, + "step": 21160 + }, + { + "epoch": 62.81899109792285, + "grad_norm": 25.03116226196289, + "learning_rate": 3.7236255572065375e-08, + "loss": 0.0777, + "step": 21170 + }, + { + "epoch": 62.8486646884273, + "grad_norm": 12.389244079589844, + "learning_rate": 3.720653789004458e-08, + "loss": 0.0681, + "step": 21180 + }, + { + "epoch": 62.87833827893175, + "grad_norm": 0.0801253393292427, + "learning_rate": 3.717682020802377e-08, + "loss": 0.2013, + "step": 21190 + }, + { + "epoch": 62.9080118694362, + "grad_norm": 10.151801109313965, + "learning_rate": 3.714710252600297e-08, + "loss": 0.2353, + "step": 21200 + }, + { + "epoch": 62.93768545994065, + "grad_norm": 0.7944397926330566, + "learning_rate": 3.711738484398217e-08, + "loss": 0.1969, + "step": 21210 + }, + { + "epoch": 62.967359050445104, + "grad_norm": 0.21934179961681366, + "learning_rate": 3.7087667161961366e-08, + "loss": 0.0676, + "step": 21220 + }, + { + "epoch": 62.997032640949556, + "grad_norm": 38.06763458251953, + "learning_rate": 3.705794947994056e-08, + "loss": 0.0688, + "step": 21230 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21550700068473816, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.3873, + "eval_samples_per_second": 140.591, + "eval_steps_per_second": 17.691, + "step": 21231 + }, + { + "epoch": 63.02670623145401, + "grad_norm": 0.10738669335842133, + "learning_rate": 3.7028231797919763e-08, + "loss": 0.1111, + "step": 21240 + }, + { + "epoch": 63.05637982195846, + "grad_norm": 0.170561283826828, + "learning_rate": 3.699851411589896e-08, + "loss": 0.0468, + "step": 21250 + }, + { + "epoch": 63.08605341246291, + "grad_norm": 0.12636661529541016, + "learning_rate": 3.696879643387816e-08, + "loss": 0.0326, + "step": 21260 + }, + { + "epoch": 63.11572700296736, + "grad_norm": 9.373662948608398, + "learning_rate": 3.6939078751857357e-08, + "loss": 0.2184, + "step": 21270 + }, + { + "epoch": 63.14540059347181, + "grad_norm": 0.4143591821193695, + "learning_rate": 3.690936106983655e-08, + "loss": 0.1563, + "step": 21280 + }, + { + "epoch": 63.175074183976264, + "grad_norm": 0.09927982091903687, + "learning_rate": 3.6879643387815754e-08, + "loss": 0.1217, + "step": 21290 + }, + { + "epoch": 63.204747774480715, + "grad_norm": 1.0905404090881348, + "learning_rate": 3.684992570579495e-08, + "loss": 0.0824, + "step": 21300 + }, + { + "epoch": 63.23442136498517, + "grad_norm": 0.027889233082532883, + "learning_rate": 3.6820208023774145e-08, + "loss": 0.0829, + "step": 21310 + }, + { + "epoch": 63.26409495548961, + "grad_norm": 0.02461789920926094, + "learning_rate": 3.679049034175335e-08, + "loss": 0.0608, + "step": 21320 + }, + { + "epoch": 63.29376854599406, + "grad_norm": 3.8867502212524414, + "learning_rate": 3.676077265973254e-08, + "loss": 0.1179, + "step": 21330 + }, + { + "epoch": 63.323442136498514, + "grad_norm": 5.955862045288086, + "learning_rate": 3.673105497771174e-08, + "loss": 0.0265, + "step": 21340 + }, + { + "epoch": 63.353115727002965, + "grad_norm": 0.014909915626049042, + "learning_rate": 3.6701337295690934e-08, + "loss": 0.0479, + "step": 21350 + }, + { + "epoch": 63.38278931750742, + "grad_norm": 0.2238546460866928, + "learning_rate": 3.667161961367013e-08, + "loss": 0.1354, + "step": 21360 + }, + { + "epoch": 63.41246290801187, + "grad_norm": 1.0291926860809326, + "learning_rate": 3.6641901931649325e-08, + "loss": 0.0546, + "step": 21370 + }, + { + "epoch": 63.44213649851632, + "grad_norm": 4.551973819732666, + "learning_rate": 3.661218424962853e-08, + "loss": 0.0712, + "step": 21380 + }, + { + "epoch": 63.47181008902077, + "grad_norm": 2.690764904022217, + "learning_rate": 3.658246656760772e-08, + "loss": 0.155, + "step": 21390 + }, + { + "epoch": 63.50148367952522, + "grad_norm": 12.60783863067627, + "learning_rate": 3.655274888558692e-08, + "loss": 0.2186, + "step": 21400 + }, + { + "epoch": 63.531157270029674, + "grad_norm": 0.04005071520805359, + "learning_rate": 3.652303120356612e-08, + "loss": 0.1537, + "step": 21410 + }, + { + "epoch": 63.560830860534125, + "grad_norm": 19.562034606933594, + "learning_rate": 3.6493313521545316e-08, + "loss": 0.2405, + "step": 21420 + }, + { + "epoch": 63.590504451038576, + "grad_norm": 0.03735380247235298, + "learning_rate": 3.646359583952451e-08, + "loss": 0.1549, + "step": 21430 + }, + { + "epoch": 63.62017804154303, + "grad_norm": 3.4894440174102783, + "learning_rate": 3.643387815750371e-08, + "loss": 0.0923, + "step": 21440 + }, + { + "epoch": 63.64985163204748, + "grad_norm": 16.44664764404297, + "learning_rate": 3.640416047548291e-08, + "loss": 0.1899, + "step": 21450 + }, + { + "epoch": 63.67952522255193, + "grad_norm": 0.12129179388284683, + "learning_rate": 3.6374442793462104e-08, + "loss": 0.0111, + "step": 21460 + }, + { + "epoch": 63.70919881305638, + "grad_norm": 0.5461145043373108, + "learning_rate": 3.6344725111441306e-08, + "loss": 0.1387, + "step": 21470 + }, + { + "epoch": 63.73887240356083, + "grad_norm": 0.03296272084116936, + "learning_rate": 3.63150074294205e-08, + "loss": 0.1979, + "step": 21480 + }, + { + "epoch": 63.768545994065285, + "grad_norm": 0.877440869808197, + "learning_rate": 3.62852897473997e-08, + "loss": 0.0446, + "step": 21490 + }, + { + "epoch": 63.798219584569736, + "grad_norm": 7.329700946807861, + "learning_rate": 3.62555720653789e-08, + "loss": 0.235, + "step": 21500 + }, + { + "epoch": 63.82789317507418, + "grad_norm": 9.922820091247559, + "learning_rate": 3.6225854383358095e-08, + "loss": 0.1492, + "step": 21510 + }, + { + "epoch": 63.85756676557863, + "grad_norm": 14.266340255737305, + "learning_rate": 3.619613670133729e-08, + "loss": 0.1766, + "step": 21520 + }, + { + "epoch": 63.88724035608308, + "grad_norm": 24.026348114013672, + "learning_rate": 3.616641901931649e-08, + "loss": 0.2357, + "step": 21530 + }, + { + "epoch": 63.916913946587535, + "grad_norm": 0.0754564106464386, + "learning_rate": 3.613670133729569e-08, + "loss": 0.1632, + "step": 21540 + }, + { + "epoch": 63.946587537091986, + "grad_norm": 0.9624794125556946, + "learning_rate": 3.6106983655274884e-08, + "loss": 0.1156, + "step": 21550 + }, + { + "epoch": 63.97626112759644, + "grad_norm": 17.813106536865234, + "learning_rate": 3.6077265973254086e-08, + "loss": 0.072, + "step": 21560 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.214819073677063, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4499, + "eval_samples_per_second": 139.227, + "eval_steps_per_second": 17.52, + "step": 21568 + }, + { + "epoch": 64.00593471810089, + "grad_norm": 0.0980786606669426, + "learning_rate": 3.604754829123328e-08, + "loss": 0.0863, + "step": 21570 + }, + { + "epoch": 64.03560830860535, + "grad_norm": 0.5276815295219421, + "learning_rate": 3.601783060921248e-08, + "loss": 0.1124, + "step": 21580 + }, + { + "epoch": 64.06528189910979, + "grad_norm": 0.0700572282075882, + "learning_rate": 3.598811292719168e-08, + "loss": 0.0677, + "step": 21590 + }, + { + "epoch": 64.09495548961425, + "grad_norm": 0.08522570878267288, + "learning_rate": 3.5958395245170874e-08, + "loss": 0.0576, + "step": 21600 + }, + { + "epoch": 64.1246290801187, + "grad_norm": 0.7345262169837952, + "learning_rate": 3.592867756315007e-08, + "loss": 0.0186, + "step": 21610 + }, + { + "epoch": 64.15430267062314, + "grad_norm": 1.8311265707015991, + "learning_rate": 3.589895988112927e-08, + "loss": 0.0963, + "step": 21620 + }, + { + "epoch": 64.1839762611276, + "grad_norm": 5.712540149688721, + "learning_rate": 3.586924219910847e-08, + "loss": 0.0075, + "step": 21630 + }, + { + "epoch": 64.21364985163204, + "grad_norm": 20.523571014404297, + "learning_rate": 3.583952451708766e-08, + "loss": 0.1091, + "step": 21640 + }, + { + "epoch": 64.2433234421365, + "grad_norm": 0.22826354205608368, + "learning_rate": 3.5809806835066865e-08, + "loss": 0.0906, + "step": 21650 + }, + { + "epoch": 64.27299703264094, + "grad_norm": 8.211501121520996, + "learning_rate": 3.578008915304606e-08, + "loss": 0.0494, + "step": 21660 + }, + { + "epoch": 64.3026706231454, + "grad_norm": 0.060058269649744034, + "learning_rate": 3.5750371471025256e-08, + "loss": 0.0993, + "step": 21670 + }, + { + "epoch": 64.33234421364985, + "grad_norm": 11.591500282287598, + "learning_rate": 3.572065378900446e-08, + "loss": 0.1176, + "step": 21680 + }, + { + "epoch": 64.3620178041543, + "grad_norm": 0.6577944755554199, + "learning_rate": 3.5690936106983654e-08, + "loss": 0.0156, + "step": 21690 + }, + { + "epoch": 64.39169139465875, + "grad_norm": 5.969498157501221, + "learning_rate": 3.566121842496285e-08, + "loss": 0.0576, + "step": 21700 + }, + { + "epoch": 64.42136498516321, + "grad_norm": 9.492920875549316, + "learning_rate": 3.563150074294205e-08, + "loss": 0.0655, + "step": 21710 + }, + { + "epoch": 64.45103857566765, + "grad_norm": 13.466991424560547, + "learning_rate": 3.560178306092125e-08, + "loss": 0.1636, + "step": 21720 + }, + { + "epoch": 64.48071216617211, + "grad_norm": 39.472042083740234, + "learning_rate": 3.557206537890044e-08, + "loss": 0.1466, + "step": 21730 + }, + { + "epoch": 64.51038575667656, + "grad_norm": 1.329652190208435, + "learning_rate": 3.5542347696879645e-08, + "loss": 0.2495, + "step": 21740 + }, + { + "epoch": 64.54005934718101, + "grad_norm": 3.5545904636383057, + "learning_rate": 3.551263001485884e-08, + "loss": 0.1485, + "step": 21750 + }, + { + "epoch": 64.56973293768546, + "grad_norm": 0.3025396466255188, + "learning_rate": 3.5482912332838036e-08, + "loss": 0.0151, + "step": 21760 + }, + { + "epoch": 64.59940652818992, + "grad_norm": 0.4819646179676056, + "learning_rate": 3.545319465081724e-08, + "loss": 0.0247, + "step": 21770 + }, + { + "epoch": 64.62908011869436, + "grad_norm": 33.32431411743164, + "learning_rate": 3.5423476968796433e-08, + "loss": 0.3625, + "step": 21780 + }, + { + "epoch": 64.65875370919882, + "grad_norm": 9.889187812805176, + "learning_rate": 3.539375928677563e-08, + "loss": 0.1006, + "step": 21790 + }, + { + "epoch": 64.68842729970326, + "grad_norm": 5.109920024871826, + "learning_rate": 3.536404160475483e-08, + "loss": 0.0721, + "step": 21800 + }, + { + "epoch": 64.71810089020771, + "grad_norm": 1.30587899684906, + "learning_rate": 3.5334323922734026e-08, + "loss": 0.0778, + "step": 21810 + }, + { + "epoch": 64.74777448071217, + "grad_norm": 0.07968409359455109, + "learning_rate": 3.530460624071322e-08, + "loss": 0.1947, + "step": 21820 + }, + { + "epoch": 64.77744807121661, + "grad_norm": 2.763672113418579, + "learning_rate": 3.5274888558692424e-08, + "loss": 0.2357, + "step": 21830 + }, + { + "epoch": 64.80712166172107, + "grad_norm": 4.4791436195373535, + "learning_rate": 3.524517087667162e-08, + "loss": 0.041, + "step": 21840 + }, + { + "epoch": 64.83679525222551, + "grad_norm": 24.83987045288086, + "learning_rate": 3.5215453194650815e-08, + "loss": 0.2589, + "step": 21850 + }, + { + "epoch": 64.86646884272997, + "grad_norm": 6.100238800048828, + "learning_rate": 3.518573551263002e-08, + "loss": 0.1472, + "step": 21860 + }, + { + "epoch": 64.89614243323442, + "grad_norm": 1.3396775722503662, + "learning_rate": 3.5156017830609206e-08, + "loss": 0.0446, + "step": 21870 + }, + { + "epoch": 64.92581602373888, + "grad_norm": 0.3272039294242859, + "learning_rate": 3.512630014858841e-08, + "loss": 0.0431, + "step": 21880 + }, + { + "epoch": 64.95548961424332, + "grad_norm": 24.597782135009766, + "learning_rate": 3.5096582466567604e-08, + "loss": 0.0811, + "step": 21890 + }, + { + "epoch": 64.98516320474778, + "grad_norm": 18.216787338256836, + "learning_rate": 3.50668647845468e-08, + "loss": 0.2157, + "step": 21900 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.21524594724178314, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.39, + "eval_samples_per_second": 140.531, + "eval_steps_per_second": 17.684, + "step": 21905 + }, + { + "epoch": 65.01483679525222, + "grad_norm": 0.14716294407844543, + "learning_rate": 3.5037147102526e-08, + "loss": 0.1207, + "step": 21910 + }, + { + "epoch": 65.04451038575668, + "grad_norm": 7.924790382385254, + "learning_rate": 3.50074294205052e-08, + "loss": 0.157, + "step": 21920 + }, + { + "epoch": 65.07418397626112, + "grad_norm": 2.471666097640991, + "learning_rate": 3.497771173848439e-08, + "loss": 0.1531, + "step": 21930 + }, + { + "epoch": 65.10385756676558, + "grad_norm": 39.06802749633789, + "learning_rate": 3.4947994056463595e-08, + "loss": 0.0941, + "step": 21940 + }, + { + "epoch": 65.13353115727003, + "grad_norm": 7.894285678863525, + "learning_rate": 3.491827637444279e-08, + "loss": 0.1395, + "step": 21950 + }, + { + "epoch": 65.16320474777449, + "grad_norm": 17.090858459472656, + "learning_rate": 3.4888558692421986e-08, + "loss": 0.093, + "step": 21960 + }, + { + "epoch": 65.19287833827893, + "grad_norm": 0.16929692029953003, + "learning_rate": 3.485884101040119e-08, + "loss": 0.0111, + "step": 21970 + }, + { + "epoch": 65.22255192878339, + "grad_norm": 34.81718063354492, + "learning_rate": 3.482912332838038e-08, + "loss": 0.0978, + "step": 21980 + }, + { + "epoch": 65.25222551928783, + "grad_norm": 17.899751663208008, + "learning_rate": 3.479940564635958e-08, + "loss": 0.2818, + "step": 21990 + }, + { + "epoch": 65.28189910979229, + "grad_norm": 1.5292011499404907, + "learning_rate": 3.476968796433878e-08, + "loss": 0.0642, + "step": 22000 + }, + { + "epoch": 65.31157270029674, + "grad_norm": 8.607584953308105, + "learning_rate": 3.4739970282317976e-08, + "loss": 0.084, + "step": 22010 + }, + { + "epoch": 65.34124629080118, + "grad_norm": 16.26321792602539, + "learning_rate": 3.471025260029717e-08, + "loss": 0.0978, + "step": 22020 + }, + { + "epoch": 65.37091988130564, + "grad_norm": 0.0707717016339302, + "learning_rate": 3.4680534918276374e-08, + "loss": 0.072, + "step": 22030 + }, + { + "epoch": 65.40059347181008, + "grad_norm": 3.7242720127105713, + "learning_rate": 3.465081723625557e-08, + "loss": 0.0571, + "step": 22040 + }, + { + "epoch": 65.43026706231454, + "grad_norm": 2.655663251876831, + "learning_rate": 3.4621099554234765e-08, + "loss": 0.0127, + "step": 22050 + }, + { + "epoch": 65.45994065281899, + "grad_norm": 0.16307079792022705, + "learning_rate": 3.459138187221397e-08, + "loss": 0.0706, + "step": 22060 + }, + { + "epoch": 65.48961424332344, + "grad_norm": 0.23033441603183746, + "learning_rate": 3.456166419019316e-08, + "loss": 0.0709, + "step": 22070 + }, + { + "epoch": 65.51928783382789, + "grad_norm": 29.767139434814453, + "learning_rate": 3.453194650817236e-08, + "loss": 0.1674, + "step": 22080 + }, + { + "epoch": 65.54896142433235, + "grad_norm": 13.61626148223877, + "learning_rate": 3.450222882615156e-08, + "loss": 0.2168, + "step": 22090 + }, + { + "epoch": 65.57863501483679, + "grad_norm": 8.21802043914795, + "learning_rate": 3.4472511144130756e-08, + "loss": 0.1301, + "step": 22100 + }, + { + "epoch": 65.60830860534125, + "grad_norm": 19.532373428344727, + "learning_rate": 3.444279346210995e-08, + "loss": 0.1094, + "step": 22110 + }, + { + "epoch": 65.6379821958457, + "grad_norm": 11.06965160369873, + "learning_rate": 3.4413075780089153e-08, + "loss": 0.112, + "step": 22120 + }, + { + "epoch": 65.66765578635015, + "grad_norm": 17.097740173339844, + "learning_rate": 3.438335809806835e-08, + "loss": 0.2386, + "step": 22130 + }, + { + "epoch": 65.6973293768546, + "grad_norm": 0.34818902611732483, + "learning_rate": 3.4353640416047544e-08, + "loss": 0.1513, + "step": 22140 + }, + { + "epoch": 65.72700296735906, + "grad_norm": 18.577716827392578, + "learning_rate": 3.4323922734026747e-08, + "loss": 0.1607, + "step": 22150 + }, + { + "epoch": 65.7566765578635, + "grad_norm": 0.45560747385025024, + "learning_rate": 3.429420505200594e-08, + "loss": 0.0486, + "step": 22160 + }, + { + "epoch": 65.78635014836796, + "grad_norm": 7.466179847717285, + "learning_rate": 3.426448736998514e-08, + "loss": 0.087, + "step": 22170 + }, + { + "epoch": 65.8160237388724, + "grad_norm": 21.45915412902832, + "learning_rate": 3.423476968796434e-08, + "loss": 0.0869, + "step": 22180 + }, + { + "epoch": 65.84569732937686, + "grad_norm": 14.706273078918457, + "learning_rate": 3.4205052005943535e-08, + "loss": 0.0914, + "step": 22190 + }, + { + "epoch": 65.8753709198813, + "grad_norm": 0.21090994775295258, + "learning_rate": 3.417533432392273e-08, + "loss": 0.127, + "step": 22200 + }, + { + "epoch": 65.90504451038575, + "grad_norm": 0.18304432928562164, + "learning_rate": 3.414561664190193e-08, + "loss": 0.1129, + "step": 22210 + }, + { + "epoch": 65.93471810089021, + "grad_norm": 8.942584991455078, + "learning_rate": 3.411589895988113e-08, + "loss": 0.1332, + "step": 22220 + }, + { + "epoch": 65.96439169139465, + "grad_norm": 36.557857513427734, + "learning_rate": 3.4086181277860324e-08, + "loss": 0.0701, + "step": 22230 + }, + { + "epoch": 65.99406528189911, + "grad_norm": 0.6432826519012451, + "learning_rate": 3.4056463595839526e-08, + "loss": 0.0504, + "step": 22240 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21466438472270966, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.6612, + "eval_samples_per_second": 134.81, + "eval_steps_per_second": 16.964, + "step": 22242 + }, + { + "epoch": 66.02373887240356, + "grad_norm": 0.11380965262651443, + "learning_rate": 3.402674591381872e-08, + "loss": 0.201, + "step": 22250 + }, + { + "epoch": 66.05341246290801, + "grad_norm": 0.1753075271844864, + "learning_rate": 3.399702823179792e-08, + "loss": 0.1279, + "step": 22260 + }, + { + "epoch": 66.08308605341246, + "grad_norm": 27.999914169311523, + "learning_rate": 3.396731054977712e-08, + "loss": 0.0827, + "step": 22270 + }, + { + "epoch": 66.11275964391692, + "grad_norm": 0.11182748526334763, + "learning_rate": 3.3937592867756315e-08, + "loss": 0.3431, + "step": 22280 + }, + { + "epoch": 66.14243323442136, + "grad_norm": 0.7182711958885193, + "learning_rate": 3.390787518573551e-08, + "loss": 0.1327, + "step": 22290 + }, + { + "epoch": 66.17210682492582, + "grad_norm": 0.3055594563484192, + "learning_rate": 3.387815750371471e-08, + "loss": 0.1747, + "step": 22300 + }, + { + "epoch": 66.20178041543026, + "grad_norm": 0.1695636510848999, + "learning_rate": 3.384843982169391e-08, + "loss": 0.1281, + "step": 22310 + }, + { + "epoch": 66.23145400593472, + "grad_norm": 13.885003089904785, + "learning_rate": 3.38187221396731e-08, + "loss": 0.1112, + "step": 22320 + }, + { + "epoch": 66.26112759643917, + "grad_norm": 3.058169364929199, + "learning_rate": 3.3789004457652305e-08, + "loss": 0.2072, + "step": 22330 + }, + { + "epoch": 66.29080118694363, + "grad_norm": 42.76344680786133, + "learning_rate": 3.37592867756315e-08, + "loss": 0.2601, + "step": 22340 + }, + { + "epoch": 66.32047477744807, + "grad_norm": 0.3705493211746216, + "learning_rate": 3.3729569093610696e-08, + "loss": 0.0847, + "step": 22350 + }, + { + "epoch": 66.35014836795253, + "grad_norm": 8.76292896270752, + "learning_rate": 3.36998514115899e-08, + "loss": 0.0502, + "step": 22360 + }, + { + "epoch": 66.37982195845697, + "grad_norm": 0.20587889850139618, + "learning_rate": 3.3670133729569094e-08, + "loss": 0.0125, + "step": 22370 + }, + { + "epoch": 66.40949554896143, + "grad_norm": 0.6230179667472839, + "learning_rate": 3.364041604754829e-08, + "loss": 0.0608, + "step": 22380 + }, + { + "epoch": 66.43916913946587, + "grad_norm": 0.04393775388598442, + "learning_rate": 3.361069836552749e-08, + "loss": 0.0063, + "step": 22390 + }, + { + "epoch": 66.46884272997033, + "grad_norm": 1.6063216924667358, + "learning_rate": 3.358098068350668e-08, + "loss": 0.0834, + "step": 22400 + }, + { + "epoch": 66.49851632047478, + "grad_norm": 9.917571067810059, + "learning_rate": 3.355126300148588e-08, + "loss": 0.1799, + "step": 22410 + }, + { + "epoch": 66.52818991097922, + "grad_norm": 1.602888584136963, + "learning_rate": 3.352154531946508e-08, + "loss": 0.1153, + "step": 22420 + }, + { + "epoch": 66.55786350148368, + "grad_norm": 0.10029692202806473, + "learning_rate": 3.3491827637444274e-08, + "loss": 0.0283, + "step": 22430 + }, + { + "epoch": 66.58753709198812, + "grad_norm": 0.17795151472091675, + "learning_rate": 3.3462109955423476e-08, + "loss": 0.1532, + "step": 22440 + }, + { + "epoch": 66.61721068249258, + "grad_norm": 0.3700118362903595, + "learning_rate": 3.343239227340267e-08, + "loss": 0.0572, + "step": 22450 + }, + { + "epoch": 66.64688427299703, + "grad_norm": 0.06877090781927109, + "learning_rate": 3.340267459138187e-08, + "loss": 0.1072, + "step": 22460 + }, + { + "epoch": 66.67655786350149, + "grad_norm": 0.1132412701845169, + "learning_rate": 3.337295690936107e-08, + "loss": 0.0577, + "step": 22470 + }, + { + "epoch": 66.70623145400593, + "grad_norm": 5.065250873565674, + "learning_rate": 3.3343239227340264e-08, + "loss": 0.1421, + "step": 22480 + }, + { + "epoch": 66.73590504451039, + "grad_norm": 2.218524217605591, + "learning_rate": 3.331352154531946e-08, + "loss": 0.145, + "step": 22490 + }, + { + "epoch": 66.76557863501483, + "grad_norm": 18.598480224609375, + "learning_rate": 3.328380386329866e-08, + "loss": 0.1486, + "step": 22500 + }, + { + "epoch": 66.79525222551929, + "grad_norm": 0.1693621277809143, + "learning_rate": 3.325408618127786e-08, + "loss": 0.0621, + "step": 22510 + }, + { + "epoch": 66.82492581602374, + "grad_norm": 2.6794567108154297, + "learning_rate": 3.322436849925705e-08, + "loss": 0.0877, + "step": 22520 + }, + { + "epoch": 66.8545994065282, + "grad_norm": 0.05296458303928375, + "learning_rate": 3.3194650817236255e-08, + "loss": 0.2213, + "step": 22530 + }, + { + "epoch": 66.88427299703264, + "grad_norm": 2.0937249660491943, + "learning_rate": 3.316493313521545e-08, + "loss": 0.1261, + "step": 22540 + }, + { + "epoch": 66.9139465875371, + "grad_norm": 19.72899627685547, + "learning_rate": 3.3135215453194646e-08, + "loss": 0.0245, + "step": 22550 + }, + { + "epoch": 66.94362017804154, + "grad_norm": 15.299428939819336, + "learning_rate": 3.310549777117385e-08, + "loss": 0.2362, + "step": 22560 + }, + { + "epoch": 66.973293768546, + "grad_norm": 21.386157989501953, + "learning_rate": 3.3075780089153044e-08, + "loss": 0.0749, + "step": 22570 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21436813473701477, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5578, + "eval_samples_per_second": 136.937, + "eval_steps_per_second": 17.231, + "step": 22579 + }, + { + "epoch": 67.00296735905044, + "grad_norm": 20.991018295288086, + "learning_rate": 3.304606240713224e-08, + "loss": 0.1671, + "step": 22580 + }, + { + "epoch": 67.0326409495549, + "grad_norm": 25.613224029541016, + "learning_rate": 3.301634472511144e-08, + "loss": 0.2264, + "step": 22590 + }, + { + "epoch": 67.06231454005935, + "grad_norm": 0.45081862807273865, + "learning_rate": 3.298662704309064e-08, + "loss": 0.1294, + "step": 22600 + }, + { + "epoch": 67.09198813056379, + "grad_norm": 15.637017250061035, + "learning_rate": 3.295690936106983e-08, + "loss": 0.0996, + "step": 22610 + }, + { + "epoch": 67.12166172106825, + "grad_norm": 0.8885765075683594, + "learning_rate": 3.2927191679049035e-08, + "loss": 0.1693, + "step": 22620 + }, + { + "epoch": 67.1513353115727, + "grad_norm": 8.278532981872559, + "learning_rate": 3.289747399702823e-08, + "loss": 0.011, + "step": 22630 + }, + { + "epoch": 67.18100890207715, + "grad_norm": 0.1448608636856079, + "learning_rate": 3.2867756315007426e-08, + "loss": 0.1297, + "step": 22640 + }, + { + "epoch": 67.2106824925816, + "grad_norm": 27.25446891784668, + "learning_rate": 3.283803863298663e-08, + "loss": 0.169, + "step": 22650 + }, + { + "epoch": 67.24035608308606, + "grad_norm": 0.6312297582626343, + "learning_rate": 3.280832095096582e-08, + "loss": 0.0686, + "step": 22660 + }, + { + "epoch": 67.2700296735905, + "grad_norm": 0.8003244400024414, + "learning_rate": 3.277860326894502e-08, + "loss": 0.1347, + "step": 22670 + }, + { + "epoch": 67.29970326409496, + "grad_norm": 21.152252197265625, + "learning_rate": 3.274888558692422e-08, + "loss": 0.1254, + "step": 22680 + }, + { + "epoch": 67.3293768545994, + "grad_norm": 0.0464036725461483, + "learning_rate": 3.2719167904903416e-08, + "loss": 0.1355, + "step": 22690 + }, + { + "epoch": 67.35905044510386, + "grad_norm": 3.4640443325042725, + "learning_rate": 3.268945022288261e-08, + "loss": 0.0865, + "step": 22700 + }, + { + "epoch": 67.3887240356083, + "grad_norm": 0.09148798137903214, + "learning_rate": 3.2659732540861814e-08, + "loss": 0.115, + "step": 22710 + }, + { + "epoch": 67.41839762611276, + "grad_norm": 11.933761596679688, + "learning_rate": 3.263001485884101e-08, + "loss": 0.183, + "step": 22720 + }, + { + "epoch": 67.44807121661721, + "grad_norm": 0.8215745091438293, + "learning_rate": 3.2600297176820205e-08, + "loss": 0.1887, + "step": 22730 + }, + { + "epoch": 67.47774480712167, + "grad_norm": 1.1779602766036987, + "learning_rate": 3.257057949479941e-08, + "loss": 0.0347, + "step": 22740 + }, + { + "epoch": 67.50741839762611, + "grad_norm": 0.030136406421661377, + "learning_rate": 3.25408618127786e-08, + "loss": 0.1188, + "step": 22750 + }, + { + "epoch": 67.53709198813057, + "grad_norm": 5.192669868469238, + "learning_rate": 3.25111441307578e-08, + "loss": 0.1307, + "step": 22760 + }, + { + "epoch": 67.56676557863501, + "grad_norm": 5.010256290435791, + "learning_rate": 3.2481426448737e-08, + "loss": 0.0959, + "step": 22770 + }, + { + "epoch": 67.59643916913947, + "grad_norm": 0.9659035205841064, + "learning_rate": 3.2451708766716196e-08, + "loss": 0.216, + "step": 22780 + }, + { + "epoch": 67.62611275964392, + "grad_norm": 1.0886143445968628, + "learning_rate": 3.242199108469539e-08, + "loss": 0.1145, + "step": 22790 + }, + { + "epoch": 67.65578635014836, + "grad_norm": 0.31104207038879395, + "learning_rate": 3.2392273402674593e-08, + "loss": 0.2326, + "step": 22800 + }, + { + "epoch": 67.68545994065282, + "grad_norm": 1.2276573181152344, + "learning_rate": 3.236255572065379e-08, + "loss": 0.1053, + "step": 22810 + }, + { + "epoch": 67.71513353115726, + "grad_norm": 5.071595668792725, + "learning_rate": 3.2332838038632985e-08, + "loss": 0.0571, + "step": 22820 + }, + { + "epoch": 67.74480712166172, + "grad_norm": 4.423135757446289, + "learning_rate": 3.2303120356612187e-08, + "loss": 0.1914, + "step": 22830 + }, + { + "epoch": 67.77448071216617, + "grad_norm": 19.529739379882812, + "learning_rate": 3.227340267459138e-08, + "loss": 0.1342, + "step": 22840 + }, + { + "epoch": 67.80415430267063, + "grad_norm": 5.241663932800293, + "learning_rate": 3.224368499257058e-08, + "loss": 0.0632, + "step": 22850 + }, + { + "epoch": 67.83382789317507, + "grad_norm": 0.704096257686615, + "learning_rate": 3.221396731054978e-08, + "loss": 0.0164, + "step": 22860 + }, + { + "epoch": 67.86350148367953, + "grad_norm": 0.23139019310474396, + "learning_rate": 3.2184249628528975e-08, + "loss": 0.1127, + "step": 22870 + }, + { + "epoch": 67.89317507418397, + "grad_norm": 27.71816635131836, + "learning_rate": 3.215453194650817e-08, + "loss": 0.1591, + "step": 22880 + }, + { + "epoch": 67.92284866468843, + "grad_norm": 27.9517765045166, + "learning_rate": 3.212481426448737e-08, + "loss": 0.2201, + "step": 22890 + }, + { + "epoch": 67.95252225519287, + "grad_norm": 15.616602897644043, + "learning_rate": 3.209509658246657e-08, + "loss": 0.0757, + "step": 22900 + }, + { + "epoch": 67.98219584569733, + "grad_norm": 0.8733271360397339, + "learning_rate": 3.2065378900445764e-08, + "loss": 0.2755, + "step": 22910 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21455733478069305, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4251, + "eval_samples_per_second": 139.764, + "eval_steps_per_second": 17.587, + "step": 22916 + }, + { + "epoch": 68.01186943620178, + "grad_norm": 31.030261993408203, + "learning_rate": 3.2035661218424966e-08, + "loss": 0.1611, + "step": 22920 + }, + { + "epoch": 68.04154302670624, + "grad_norm": 1.123826265335083, + "learning_rate": 3.2005943536404155e-08, + "loss": 0.1636, + "step": 22930 + }, + { + "epoch": 68.07121661721068, + "grad_norm": 0.7579827904701233, + "learning_rate": 3.197622585438336e-08, + "loss": 0.2666, + "step": 22940 + }, + { + "epoch": 68.10089020771514, + "grad_norm": 8.212387084960938, + "learning_rate": 3.194650817236255e-08, + "loss": 0.1515, + "step": 22950 + }, + { + "epoch": 68.13056379821958, + "grad_norm": 0.25645363330841064, + "learning_rate": 3.191679049034175e-08, + "loss": 0.0537, + "step": 22960 + }, + { + "epoch": 68.16023738872404, + "grad_norm": 0.26609930396080017, + "learning_rate": 3.188707280832095e-08, + "loss": 0.1107, + "step": 22970 + }, + { + "epoch": 68.18991097922849, + "grad_norm": 0.3834483027458191, + "learning_rate": 3.1857355126300146e-08, + "loss": 0.1035, + "step": 22980 + }, + { + "epoch": 68.21958456973294, + "grad_norm": 25.05132293701172, + "learning_rate": 3.182763744427934e-08, + "loss": 0.2039, + "step": 22990 + }, + { + "epoch": 68.24925816023739, + "grad_norm": 18.19559097290039, + "learning_rate": 3.1797919762258543e-08, + "loss": 0.1988, + "step": 23000 + }, + { + "epoch": 68.27893175074183, + "grad_norm": 0.03914305195212364, + "learning_rate": 3.176820208023774e-08, + "loss": 0.1743, + "step": 23010 + }, + { + "epoch": 68.30860534124629, + "grad_norm": 0.27617037296295166, + "learning_rate": 3.1738484398216934e-08, + "loss": 0.1541, + "step": 23020 + }, + { + "epoch": 68.33827893175074, + "grad_norm": 0.031375136226415634, + "learning_rate": 3.1708766716196136e-08, + "loss": 0.158, + "step": 23030 + }, + { + "epoch": 68.3679525222552, + "grad_norm": 0.5404309630393982, + "learning_rate": 3.167904903417533e-08, + "loss": 0.1677, + "step": 23040 + }, + { + "epoch": 68.39762611275964, + "grad_norm": 0.16142582893371582, + "learning_rate": 3.164933135215453e-08, + "loss": 0.2063, + "step": 23050 + }, + { + "epoch": 68.4272997032641, + "grad_norm": 0.10212104767560959, + "learning_rate": 3.161961367013373e-08, + "loss": 0.0135, + "step": 23060 + }, + { + "epoch": 68.45697329376854, + "grad_norm": 11.531194686889648, + "learning_rate": 3.1589895988112925e-08, + "loss": 0.2499, + "step": 23070 + }, + { + "epoch": 68.486646884273, + "grad_norm": 1.6364941596984863, + "learning_rate": 3.156017830609212e-08, + "loss": 0.0776, + "step": 23080 + }, + { + "epoch": 68.51632047477744, + "grad_norm": 0.23265908658504486, + "learning_rate": 3.153046062407132e-08, + "loss": 0.0258, + "step": 23090 + }, + { + "epoch": 68.5459940652819, + "grad_norm": 1.0026811361312866, + "learning_rate": 3.150074294205052e-08, + "loss": 0.0382, + "step": 23100 + }, + { + "epoch": 68.57566765578635, + "grad_norm": 7.67866849899292, + "learning_rate": 3.1471025260029714e-08, + "loss": 0.1432, + "step": 23110 + }, + { + "epoch": 68.6053412462908, + "grad_norm": 0.06290806084871292, + "learning_rate": 3.1441307578008916e-08, + "loss": 0.1244, + "step": 23120 + }, + { + "epoch": 68.63501483679525, + "grad_norm": 7.192017555236816, + "learning_rate": 3.141158989598811e-08, + "loss": 0.1479, + "step": 23130 + }, + { + "epoch": 68.66468842729971, + "grad_norm": 0.661907434463501, + "learning_rate": 3.138187221396731e-08, + "loss": 0.0301, + "step": 23140 + }, + { + "epoch": 68.69436201780415, + "grad_norm": 11.946431159973145, + "learning_rate": 3.135215453194651e-08, + "loss": 0.023, + "step": 23150 + }, + { + "epoch": 68.72403560830861, + "grad_norm": 0.06608235090970993, + "learning_rate": 3.1322436849925705e-08, + "loss": 0.0382, + "step": 23160 + }, + { + "epoch": 68.75370919881306, + "grad_norm": 0.16639122366905212, + "learning_rate": 3.12927191679049e-08, + "loss": 0.0682, + "step": 23170 + }, + { + "epoch": 68.78338278931751, + "grad_norm": 8.820795059204102, + "learning_rate": 3.12630014858841e-08, + "loss": 0.0916, + "step": 23180 + }, + { + "epoch": 68.81305637982196, + "grad_norm": 0.9654533267021179, + "learning_rate": 3.12332838038633e-08, + "loss": 0.0785, + "step": 23190 + }, + { + "epoch": 68.8427299703264, + "grad_norm": 1.2793062925338745, + "learning_rate": 3.120356612184249e-08, + "loss": 0.1324, + "step": 23200 + }, + { + "epoch": 68.87240356083086, + "grad_norm": 8.007743835449219, + "learning_rate": 3.1173848439821695e-08, + "loss": 0.1033, + "step": 23210 + }, + { + "epoch": 68.9020771513353, + "grad_norm": 4.502223014831543, + "learning_rate": 3.114413075780089e-08, + "loss": 0.1899, + "step": 23220 + }, + { + "epoch": 68.93175074183976, + "grad_norm": 3.1419596672058105, + "learning_rate": 3.1114413075780086e-08, + "loss": 0.0578, + "step": 23230 + }, + { + "epoch": 68.96142433234421, + "grad_norm": 1.0220685005187988, + "learning_rate": 3.108469539375929e-08, + "loss": 0.0697, + "step": 23240 + }, + { + "epoch": 68.99109792284867, + "grad_norm": 0.40401822328567505, + "learning_rate": 3.1054977711738484e-08, + "loss": 0.0668, + "step": 23250 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2146816849708557, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4688, + "eval_samples_per_second": 138.819, + "eval_steps_per_second": 17.468, + "step": 23253 + }, + { + "epoch": 69.02077151335311, + "grad_norm": 0.09608868509531021, + "learning_rate": 3.102526002971768e-08, + "loss": 0.0477, + "step": 23260 + }, + { + "epoch": 69.05044510385757, + "grad_norm": 0.017330914735794067, + "learning_rate": 3.099554234769688e-08, + "loss": 0.0899, + "step": 23270 + }, + { + "epoch": 69.08011869436201, + "grad_norm": 8.931391716003418, + "learning_rate": 3.096582466567608e-08, + "loss": 0.166, + "step": 23280 + }, + { + "epoch": 69.10979228486647, + "grad_norm": 7.54853630065918, + "learning_rate": 3.093610698365527e-08, + "loss": 0.1364, + "step": 23290 + }, + { + "epoch": 69.13946587537092, + "grad_norm": 0.12589536607265472, + "learning_rate": 3.0906389301634475e-08, + "loss": 0.07, + "step": 23300 + }, + { + "epoch": 69.16913946587538, + "grad_norm": 1.496419072151184, + "learning_rate": 3.087667161961367e-08, + "loss": 0.2164, + "step": 23310 + }, + { + "epoch": 69.19881305637982, + "grad_norm": 5.149827003479004, + "learning_rate": 3.0846953937592866e-08, + "loss": 0.0976, + "step": 23320 + }, + { + "epoch": 69.22848664688428, + "grad_norm": 6.946267127990723, + "learning_rate": 3.081723625557207e-08, + "loss": 0.0898, + "step": 23330 + }, + { + "epoch": 69.25816023738872, + "grad_norm": 0.08405820280313492, + "learning_rate": 3.0787518573551263e-08, + "loss": 0.1354, + "step": 23340 + }, + { + "epoch": 69.28783382789318, + "grad_norm": 32.566322326660156, + "learning_rate": 3.075780089153046e-08, + "loss": 0.3072, + "step": 23350 + }, + { + "epoch": 69.31750741839762, + "grad_norm": 0.21209552884101868, + "learning_rate": 3.072808320950966e-08, + "loss": 0.0728, + "step": 23360 + }, + { + "epoch": 69.34718100890208, + "grad_norm": 8.363408088684082, + "learning_rate": 3.0698365527488857e-08, + "loss": 0.1787, + "step": 23370 + }, + { + "epoch": 69.37685459940653, + "grad_norm": 0.5158764123916626, + "learning_rate": 3.066864784546805e-08, + "loss": 0.0478, + "step": 23380 + }, + { + "epoch": 69.40652818991099, + "grad_norm": 0.08160132169723511, + "learning_rate": 3.0638930163447254e-08, + "loss": 0.1771, + "step": 23390 + }, + { + "epoch": 69.43620178041543, + "grad_norm": 6.852834224700928, + "learning_rate": 3.060921248142645e-08, + "loss": 0.1654, + "step": 23400 + }, + { + "epoch": 69.46587537091987, + "grad_norm": 12.549159049987793, + "learning_rate": 3.0579494799405645e-08, + "loss": 0.2855, + "step": 23410 + }, + { + "epoch": 69.49554896142433, + "grad_norm": 1.7986286878585815, + "learning_rate": 3.054977711738485e-08, + "loss": 0.0281, + "step": 23420 + }, + { + "epoch": 69.52522255192878, + "grad_norm": 25.667591094970703, + "learning_rate": 3.052005943536404e-08, + "loss": 0.2737, + "step": 23430 + }, + { + "epoch": 69.55489614243324, + "grad_norm": 3.0696027278900146, + "learning_rate": 3.049034175334324e-08, + "loss": 0.0414, + "step": 23440 + }, + { + "epoch": 69.58456973293768, + "grad_norm": 0.3964274823665619, + "learning_rate": 3.0460624071322434e-08, + "loss": 0.1105, + "step": 23450 + }, + { + "epoch": 69.61424332344214, + "grad_norm": 0.23525798320770264, + "learning_rate": 3.043090638930163e-08, + "loss": 0.1111, + "step": 23460 + }, + { + "epoch": 69.64391691394658, + "grad_norm": 4.316598892211914, + "learning_rate": 3.040118870728083e-08, + "loss": 0.0726, + "step": 23470 + }, + { + "epoch": 69.67359050445104, + "grad_norm": 0.6952884197235107, + "learning_rate": 3.037147102526003e-08, + "loss": 0.0392, + "step": 23480 + }, + { + "epoch": 69.70326409495549, + "grad_norm": 0.25949233770370483, + "learning_rate": 3.034175334323922e-08, + "loss": 0.1203, + "step": 23490 + }, + { + "epoch": 69.73293768545994, + "grad_norm": 0.45124226808547974, + "learning_rate": 3.0312035661218425e-08, + "loss": 0.066, + "step": 23500 + }, + { + "epoch": 69.76261127596439, + "grad_norm": 0.15573439002037048, + "learning_rate": 3.028231797919762e-08, + "loss": 0.1304, + "step": 23510 + }, + { + "epoch": 69.79228486646885, + "grad_norm": 9.533170700073242, + "learning_rate": 3.0252600297176816e-08, + "loss": 0.126, + "step": 23520 + }, + { + "epoch": 69.82195845697329, + "grad_norm": 2.8052823543548584, + "learning_rate": 3.022288261515602e-08, + "loss": 0.1438, + "step": 23530 + }, + { + "epoch": 69.85163204747775, + "grad_norm": 0.31082990765571594, + "learning_rate": 3.019316493313521e-08, + "loss": 0.229, + "step": 23540 + }, + { + "epoch": 69.8813056379822, + "grad_norm": 13.117657661437988, + "learning_rate": 3.016344725111441e-08, + "loss": 0.0962, + "step": 23550 + }, + { + "epoch": 69.91097922848665, + "grad_norm": 8.092022895812988, + "learning_rate": 3.013372956909361e-08, + "loss": 0.1549, + "step": 23560 + }, + { + "epoch": 69.9406528189911, + "grad_norm": 0.32675430178642273, + "learning_rate": 3.0104011887072806e-08, + "loss": 0.1356, + "step": 23570 + }, + { + "epoch": 69.97032640949556, + "grad_norm": 6.792847156524658, + "learning_rate": 3.0074294205052e-08, + "loss": 0.0921, + "step": 23580 + }, + { + "epoch": 70.0, + "grad_norm": 0.0869373306632042, + "learning_rate": 3.0044576523031204e-08, + "loss": 0.1172, + "step": 23590 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21435607969760895, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4465, + "eval_samples_per_second": 139.3, + "eval_steps_per_second": 17.529, + "step": 23590 + }, + { + "epoch": 70.02967359050444, + "grad_norm": 16.061342239379883, + "learning_rate": 3.00148588410104e-08, + "loss": 0.2631, + "step": 23600 + }, + { + "epoch": 70.0593471810089, + "grad_norm": 25.90700912475586, + "learning_rate": 2.9985141158989595e-08, + "loss": 0.1075, + "step": 23610 + }, + { + "epoch": 70.08902077151335, + "grad_norm": 3.8025810718536377, + "learning_rate": 2.99554234769688e-08, + "loss": 0.0102, + "step": 23620 + }, + { + "epoch": 70.1186943620178, + "grad_norm": 12.369791030883789, + "learning_rate": 2.992570579494799e-08, + "loss": 0.1587, + "step": 23630 + }, + { + "epoch": 70.14836795252225, + "grad_norm": 0.04863696172833443, + "learning_rate": 2.989598811292719e-08, + "loss": 0.1052, + "step": 23640 + }, + { + "epoch": 70.17804154302671, + "grad_norm": 0.4129660427570343, + "learning_rate": 2.986627043090639e-08, + "loss": 0.1144, + "step": 23650 + }, + { + "epoch": 70.20771513353115, + "grad_norm": 6.723005294799805, + "learning_rate": 2.9836552748885586e-08, + "loss": 0.011, + "step": 23660 + }, + { + "epoch": 70.23738872403561, + "grad_norm": 14.145872116088867, + "learning_rate": 2.980683506686478e-08, + "loss": 0.3349, + "step": 23670 + }, + { + "epoch": 70.26706231454006, + "grad_norm": 3.8153560161590576, + "learning_rate": 2.977711738484398e-08, + "loss": 0.1202, + "step": 23680 + }, + { + "epoch": 70.29673590504451, + "grad_norm": 19.890621185302734, + "learning_rate": 2.974739970282318e-08, + "loss": 0.3387, + "step": 23690 + }, + { + "epoch": 70.32640949554896, + "grad_norm": 1.4970753192901611, + "learning_rate": 2.9717682020802374e-08, + "loss": 0.2223, + "step": 23700 + }, + { + "epoch": 70.35608308605342, + "grad_norm": 11.730340003967285, + "learning_rate": 2.9687964338781573e-08, + "loss": 0.0878, + "step": 23710 + }, + { + "epoch": 70.38575667655786, + "grad_norm": 2.4869039058685303, + "learning_rate": 2.9658246656760772e-08, + "loss": 0.0274, + "step": 23720 + }, + { + "epoch": 70.41543026706232, + "grad_norm": 0.5863876938819885, + "learning_rate": 2.9628528974739968e-08, + "loss": 0.0903, + "step": 23730 + }, + { + "epoch": 70.44510385756676, + "grad_norm": 18.896570205688477, + "learning_rate": 2.9598811292719166e-08, + "loss": 0.125, + "step": 23740 + }, + { + "epoch": 70.47477744807122, + "grad_norm": 36.0111083984375, + "learning_rate": 2.9569093610698365e-08, + "loss": 0.124, + "step": 23750 + }, + { + "epoch": 70.50445103857567, + "grad_norm": 1.371915340423584, + "learning_rate": 2.953937592867756e-08, + "loss": 0.0574, + "step": 23760 + }, + { + "epoch": 70.53412462908013, + "grad_norm": 13.837684631347656, + "learning_rate": 2.950965824665676e-08, + "loss": 0.0247, + "step": 23770 + }, + { + "epoch": 70.56379821958457, + "grad_norm": 27.787487030029297, + "learning_rate": 2.947994056463596e-08, + "loss": 0.187, + "step": 23780 + }, + { + "epoch": 70.59347181008901, + "grad_norm": 4.242045879364014, + "learning_rate": 2.9450222882615154e-08, + "loss": 0.0425, + "step": 23790 + }, + { + "epoch": 70.62314540059347, + "grad_norm": 4.545398712158203, + "learning_rate": 2.9420505200594353e-08, + "loss": 0.0587, + "step": 23800 + }, + { + "epoch": 70.65281899109792, + "grad_norm": 22.26297950744629, + "learning_rate": 2.939078751857355e-08, + "loss": 0.0983, + "step": 23810 + }, + { + "epoch": 70.68249258160238, + "grad_norm": 7.697746753692627, + "learning_rate": 2.936106983655275e-08, + "loss": 0.0831, + "step": 23820 + }, + { + "epoch": 70.71216617210682, + "grad_norm": 0.22201137244701385, + "learning_rate": 2.9331352154531946e-08, + "loss": 0.0836, + "step": 23830 + }, + { + "epoch": 70.74183976261128, + "grad_norm": 2.1026604175567627, + "learning_rate": 2.9301634472511145e-08, + "loss": 0.0673, + "step": 23840 + }, + { + "epoch": 70.77151335311572, + "grad_norm": 2.128117799758911, + "learning_rate": 2.9271916790490344e-08, + "loss": 0.1252, + "step": 23850 + }, + { + "epoch": 70.80118694362018, + "grad_norm": 0.04580182954668999, + "learning_rate": 2.924219910846954e-08, + "loss": 0.0273, + "step": 23860 + }, + { + "epoch": 70.83086053412462, + "grad_norm": 0.21978555619716644, + "learning_rate": 2.9212481426448738e-08, + "loss": 0.1526, + "step": 23870 + }, + { + "epoch": 70.86053412462908, + "grad_norm": 20.255714416503906, + "learning_rate": 2.9182763744427937e-08, + "loss": 0.1747, + "step": 23880 + }, + { + "epoch": 70.89020771513353, + "grad_norm": 0.18826305866241455, + "learning_rate": 2.9153046062407132e-08, + "loss": 0.0159, + "step": 23890 + }, + { + "epoch": 70.91988130563799, + "grad_norm": 0.5652926564216614, + "learning_rate": 2.912332838038633e-08, + "loss": 0.1622, + "step": 23900 + }, + { + "epoch": 70.94955489614243, + "grad_norm": 30.026426315307617, + "learning_rate": 2.909361069836553e-08, + "loss": 0.0903, + "step": 23910 + }, + { + "epoch": 70.97922848664689, + "grad_norm": 0.5028266310691833, + "learning_rate": 2.9063893016344725e-08, + "loss": 0.0444, + "step": 23920 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2140464037656784, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4399, + "eval_samples_per_second": 139.443, + "eval_steps_per_second": 17.547, + "step": 23927 + }, + { + "epoch": 71.00890207715133, + "grad_norm": 0.1538078635931015, + "learning_rate": 2.9034175334323924e-08, + "loss": 0.0691, + "step": 23930 + }, + { + "epoch": 71.03857566765579, + "grad_norm": 2.673807144165039, + "learning_rate": 2.9004457652303123e-08, + "loss": 0.1867, + "step": 23940 + }, + { + "epoch": 71.06824925816024, + "grad_norm": 31.203920364379883, + "learning_rate": 2.897473997028232e-08, + "loss": 0.2034, + "step": 23950 + }, + { + "epoch": 71.0979228486647, + "grad_norm": 0.24453473091125488, + "learning_rate": 2.8945022288261517e-08, + "loss": 0.1067, + "step": 23960 + }, + { + "epoch": 71.12759643916914, + "grad_norm": 0.5905619263648987, + "learning_rate": 2.8915304606240716e-08, + "loss": 0.1236, + "step": 23970 + }, + { + "epoch": 71.1572700296736, + "grad_norm": 26.954341888427734, + "learning_rate": 2.8885586924219908e-08, + "loss": 0.0953, + "step": 23980 + }, + { + "epoch": 71.18694362017804, + "grad_norm": 0.244420126080513, + "learning_rate": 2.8855869242199104e-08, + "loss": 0.199, + "step": 23990 + }, + { + "epoch": 71.21661721068249, + "grad_norm": 19.181415557861328, + "learning_rate": 2.8826151560178303e-08, + "loss": 0.0597, + "step": 24000 + }, + { + "epoch": 71.24629080118694, + "grad_norm": 0.4857439398765564, + "learning_rate": 2.87964338781575e-08, + "loss": 0.035, + "step": 24010 + }, + { + "epoch": 71.27596439169139, + "grad_norm": 0.22652041912078857, + "learning_rate": 2.8766716196136697e-08, + "loss": 0.1366, + "step": 24020 + }, + { + "epoch": 71.30563798219585, + "grad_norm": 8.519232749938965, + "learning_rate": 2.8736998514115896e-08, + "loss": 0.123, + "step": 24030 + }, + { + "epoch": 71.33531157270029, + "grad_norm": 0.257071316242218, + "learning_rate": 2.8707280832095095e-08, + "loss": 0.0867, + "step": 24040 + }, + { + "epoch": 71.36498516320475, + "grad_norm": 0.02793349325656891, + "learning_rate": 2.867756315007429e-08, + "loss": 0.1134, + "step": 24050 + }, + { + "epoch": 71.3946587537092, + "grad_norm": 41.03744888305664, + "learning_rate": 2.864784546805349e-08, + "loss": 0.2212, + "step": 24060 + }, + { + "epoch": 71.42433234421365, + "grad_norm": 20.574779510498047, + "learning_rate": 2.8618127786032688e-08, + "loss": 0.0227, + "step": 24070 + }, + { + "epoch": 71.4540059347181, + "grad_norm": 3.004847526550293, + "learning_rate": 2.8588410104011883e-08, + "loss": 0.1812, + "step": 24080 + }, + { + "epoch": 71.48367952522256, + "grad_norm": 1.1904126405715942, + "learning_rate": 2.8558692421991082e-08, + "loss": 0.0917, + "step": 24090 + }, + { + "epoch": 71.513353115727, + "grad_norm": 2.096341371536255, + "learning_rate": 2.852897473997028e-08, + "loss": 0.0911, + "step": 24100 + }, + { + "epoch": 71.54302670623146, + "grad_norm": 0.04477624595165253, + "learning_rate": 2.8499257057949476e-08, + "loss": 0.1221, + "step": 24110 + }, + { + "epoch": 71.5727002967359, + "grad_norm": 0.11669624596834183, + "learning_rate": 2.8469539375928675e-08, + "loss": 0.0288, + "step": 24120 + }, + { + "epoch": 71.60237388724036, + "grad_norm": 0.887355625629425, + "learning_rate": 2.8439821693907874e-08, + "loss": 0.0538, + "step": 24130 + }, + { + "epoch": 71.6320474777448, + "grad_norm": 3.2912182807922363, + "learning_rate": 2.841010401188707e-08, + "loss": 0.0192, + "step": 24140 + }, + { + "epoch": 71.66172106824926, + "grad_norm": 2.066939353942871, + "learning_rate": 2.8380386329866268e-08, + "loss": 0.1182, + "step": 24150 + }, + { + "epoch": 71.69139465875371, + "grad_norm": 13.18309211730957, + "learning_rate": 2.8350668647845467e-08, + "loss": 0.2951, + "step": 24160 + }, + { + "epoch": 71.72106824925817, + "grad_norm": 1.017183542251587, + "learning_rate": 2.8320950965824663e-08, + "loss": 0.0866, + "step": 24170 + }, + { + "epoch": 71.75074183976261, + "grad_norm": 3.4243462085723877, + "learning_rate": 2.829123328380386e-08, + "loss": 0.1092, + "step": 24180 + }, + { + "epoch": 71.78041543026706, + "grad_norm": 0.32482969760894775, + "learning_rate": 2.826151560178306e-08, + "loss": 0.0852, + "step": 24190 + }, + { + "epoch": 71.81008902077151, + "grad_norm": 13.063733100891113, + "learning_rate": 2.8231797919762256e-08, + "loss": 0.1278, + "step": 24200 + }, + { + "epoch": 71.83976261127596, + "grad_norm": 7.5946831703186035, + "learning_rate": 2.8202080237741455e-08, + "loss": 0.1978, + "step": 24210 + }, + { + "epoch": 71.86943620178042, + "grad_norm": 0.19417603313922882, + "learning_rate": 2.8172362555720653e-08, + "loss": 0.0255, + "step": 24220 + }, + { + "epoch": 71.89910979228486, + "grad_norm": 0.9870342016220093, + "learning_rate": 2.814264487369985e-08, + "loss": 0.0198, + "step": 24230 + }, + { + "epoch": 71.92878338278932, + "grad_norm": 3.2072653770446777, + "learning_rate": 2.8112927191679048e-08, + "loss": 0.0862, + "step": 24240 + }, + { + "epoch": 71.95845697329376, + "grad_norm": 6.328478813171387, + "learning_rate": 2.8083209509658247e-08, + "loss": 0.044, + "step": 24250 + }, + { + "epoch": 71.98813056379822, + "grad_norm": 0.30914050340652466, + "learning_rate": 2.8053491827637442e-08, + "loss": 0.0151, + "step": 24260 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21391244232654572, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4382, + "eval_samples_per_second": 139.48, + "eval_steps_per_second": 17.552, + "step": 24264 + }, + { + "epoch": 72.01780415430267, + "grad_norm": 19.240577697753906, + "learning_rate": 2.802377414561664e-08, + "loss": 0.0843, + "step": 24270 + }, + { + "epoch": 72.04747774480713, + "grad_norm": 0.5960517525672913, + "learning_rate": 2.799405646359584e-08, + "loss": 0.0302, + "step": 24280 + }, + { + "epoch": 72.07715133531157, + "grad_norm": 0.2563973367214203, + "learning_rate": 2.7964338781575035e-08, + "loss": 0.0542, + "step": 24290 + }, + { + "epoch": 72.10682492581603, + "grad_norm": 0.12349473685026169, + "learning_rate": 2.7934621099554234e-08, + "loss": 0.0795, + "step": 24300 + }, + { + "epoch": 72.13649851632047, + "grad_norm": 18.538719177246094, + "learning_rate": 2.7904903417533433e-08, + "loss": 0.1533, + "step": 24310 + }, + { + "epoch": 72.16617210682493, + "grad_norm": 34.00318908691406, + "learning_rate": 2.7875185735512628e-08, + "loss": 0.1061, + "step": 24320 + }, + { + "epoch": 72.19584569732937, + "grad_norm": 0.4124857187271118, + "learning_rate": 2.7845468053491827e-08, + "loss": 0.1882, + "step": 24330 + }, + { + "epoch": 72.22551928783383, + "grad_norm": 2.345719337463379, + "learning_rate": 2.7815750371471026e-08, + "loss": 0.2126, + "step": 24340 + }, + { + "epoch": 72.25519287833828, + "grad_norm": 0.07407350838184357, + "learning_rate": 2.778603268945022e-08, + "loss": 0.0305, + "step": 24350 + }, + { + "epoch": 72.28486646884274, + "grad_norm": 27.938703536987305, + "learning_rate": 2.775631500742942e-08, + "loss": 0.1778, + "step": 24360 + }, + { + "epoch": 72.31454005934718, + "grad_norm": 22.241634368896484, + "learning_rate": 2.772659732540862e-08, + "loss": 0.1611, + "step": 24370 + }, + { + "epoch": 72.34421364985164, + "grad_norm": 15.07386302947998, + "learning_rate": 2.7696879643387815e-08, + "loss": 0.1489, + "step": 24380 + }, + { + "epoch": 72.37388724035608, + "grad_norm": 23.86855697631836, + "learning_rate": 2.7667161961367013e-08, + "loss": 0.1265, + "step": 24390 + }, + { + "epoch": 72.40356083086053, + "grad_norm": 0.21612170338630676, + "learning_rate": 2.7637444279346212e-08, + "loss": 0.2254, + "step": 24400 + }, + { + "epoch": 72.43323442136499, + "grad_norm": 17.510412216186523, + "learning_rate": 2.7607726597325408e-08, + "loss": 0.1667, + "step": 24410 + }, + { + "epoch": 72.46290801186943, + "grad_norm": 7.31636381149292, + "learning_rate": 2.7578008915304607e-08, + "loss": 0.1058, + "step": 24420 + }, + { + "epoch": 72.49258160237389, + "grad_norm": 30.76015281677246, + "learning_rate": 2.7548291233283805e-08, + "loss": 0.0937, + "step": 24430 + }, + { + "epoch": 72.52225519287833, + "grad_norm": 12.683025360107422, + "learning_rate": 2.7518573551263e-08, + "loss": 0.1562, + "step": 24440 + }, + { + "epoch": 72.55192878338279, + "grad_norm": 3.2623279094696045, + "learning_rate": 2.74888558692422e-08, + "loss": 0.0181, + "step": 24450 + }, + { + "epoch": 72.58160237388724, + "grad_norm": 1.1036429405212402, + "learning_rate": 2.74591381872214e-08, + "loss": 0.1198, + "step": 24460 + }, + { + "epoch": 72.6112759643917, + "grad_norm": 0.22911342978477478, + "learning_rate": 2.7429420505200594e-08, + "loss": 0.0771, + "step": 24470 + }, + { + "epoch": 72.64094955489614, + "grad_norm": 0.24080713093280792, + "learning_rate": 2.7399702823179793e-08, + "loss": 0.1076, + "step": 24480 + }, + { + "epoch": 72.6706231454006, + "grad_norm": 4.642510890960693, + "learning_rate": 2.7369985141158992e-08, + "loss": 0.2823, + "step": 24490 + }, + { + "epoch": 72.70029673590504, + "grad_norm": 51.818092346191406, + "learning_rate": 2.7340267459138184e-08, + "loss": 0.1437, + "step": 24500 + }, + { + "epoch": 72.7299703264095, + "grad_norm": 0.04532275348901749, + "learning_rate": 2.731054977711738e-08, + "loss": 0.0842, + "step": 24510 + }, + { + "epoch": 72.75964391691394, + "grad_norm": 0.9237525463104248, + "learning_rate": 2.7280832095096578e-08, + "loss": 0.124, + "step": 24520 + }, + { + "epoch": 72.7893175074184, + "grad_norm": 0.13483864068984985, + "learning_rate": 2.7251114413075777e-08, + "loss": 0.0772, + "step": 24530 + }, + { + "epoch": 72.81899109792285, + "grad_norm": 0.06471768021583557, + "learning_rate": 2.7221396731054973e-08, + "loss": 0.1214, + "step": 24540 + }, + { + "epoch": 72.8486646884273, + "grad_norm": 16.446203231811523, + "learning_rate": 2.719167904903417e-08, + "loss": 0.1231, + "step": 24550 + }, + { + "epoch": 72.87833827893175, + "grad_norm": 11.568562507629395, + "learning_rate": 2.716196136701337e-08, + "loss": 0.0341, + "step": 24560 + }, + { + "epoch": 72.90801186943621, + "grad_norm": 4.06491756439209, + "learning_rate": 2.713224368499257e-08, + "loss": 0.13, + "step": 24570 + }, + { + "epoch": 72.93768545994065, + "grad_norm": 20.78244400024414, + "learning_rate": 2.7102526002971764e-08, + "loss": 0.1106, + "step": 24580 + }, + { + "epoch": 72.9673590504451, + "grad_norm": 2.478055000305176, + "learning_rate": 2.7072808320950963e-08, + "loss": 0.2383, + "step": 24590 + }, + { + "epoch": 72.99703264094956, + "grad_norm": 0.1363847851753235, + "learning_rate": 2.7043090638930162e-08, + "loss": 0.0589, + "step": 24600 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21280135214328766, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4436, + "eval_samples_per_second": 139.364, + "eval_steps_per_second": 17.537, + "step": 24601 + }, + { + "epoch": 73.026706231454, + "grad_norm": 24.08588409423828, + "learning_rate": 2.7013372956909358e-08, + "loss": 0.1759, + "step": 24610 + }, + { + "epoch": 73.05637982195846, + "grad_norm": 0.11079324781894684, + "learning_rate": 2.6983655274888556e-08, + "loss": 0.1981, + "step": 24620 + }, + { + "epoch": 73.0860534124629, + "grad_norm": 7.702559471130371, + "learning_rate": 2.6953937592867755e-08, + "loss": 0.0623, + "step": 24630 + }, + { + "epoch": 73.11572700296736, + "grad_norm": 15.62405014038086, + "learning_rate": 2.692421991084695e-08, + "loss": 0.0654, + "step": 24640 + }, + { + "epoch": 73.1454005934718, + "grad_norm": 0.11296529322862625, + "learning_rate": 2.689450222882615e-08, + "loss": 0.0488, + "step": 24650 + }, + { + "epoch": 73.17507418397626, + "grad_norm": 0.3461909890174866, + "learning_rate": 2.686478454680535e-08, + "loss": 0.1479, + "step": 24660 + }, + { + "epoch": 73.20474777448071, + "grad_norm": 28.400121688842773, + "learning_rate": 2.6835066864784544e-08, + "loss": 0.1259, + "step": 24670 + }, + { + "epoch": 73.23442136498517, + "grad_norm": 15.51494026184082, + "learning_rate": 2.6805349182763743e-08, + "loss": 0.1908, + "step": 24680 + }, + { + "epoch": 73.26409495548961, + "grad_norm": 0.7268391847610474, + "learning_rate": 2.677563150074294e-08, + "loss": 0.0479, + "step": 24690 + }, + { + "epoch": 73.29376854599407, + "grad_norm": 3.568721055984497, + "learning_rate": 2.6745913818722137e-08, + "loss": 0.0414, + "step": 24700 + }, + { + "epoch": 73.32344213649851, + "grad_norm": 22.746782302856445, + "learning_rate": 2.6716196136701336e-08, + "loss": 0.0284, + "step": 24710 + }, + { + "epoch": 73.35311572700297, + "grad_norm": 13.997262001037598, + "learning_rate": 2.6686478454680535e-08, + "loss": 0.0643, + "step": 24720 + }, + { + "epoch": 73.38278931750742, + "grad_norm": 0.2996833026409149, + "learning_rate": 2.665676077265973e-08, + "loss": 0.0079, + "step": 24730 + }, + { + "epoch": 73.41246290801188, + "grad_norm": 0.3103340268135071, + "learning_rate": 2.662704309063893e-08, + "loss": 0.005, + "step": 24740 + }, + { + "epoch": 73.44213649851632, + "grad_norm": 1.7172578573226929, + "learning_rate": 2.6597325408618128e-08, + "loss": 0.164, + "step": 24750 + }, + { + "epoch": 73.47181008902078, + "grad_norm": 20.99454116821289, + "learning_rate": 2.6567607726597323e-08, + "loss": 0.2169, + "step": 24760 + }, + { + "epoch": 73.50148367952522, + "grad_norm": 0.27654969692230225, + "learning_rate": 2.6537890044576522e-08, + "loss": 0.0917, + "step": 24770 + }, + { + "epoch": 73.53115727002967, + "grad_norm": 0.362141877412796, + "learning_rate": 2.650817236255572e-08, + "loss": 0.2726, + "step": 24780 + }, + { + "epoch": 73.56083086053413, + "grad_norm": 0.11802315711975098, + "learning_rate": 2.6478454680534916e-08, + "loss": 0.0699, + "step": 24790 + }, + { + "epoch": 73.59050445103857, + "grad_norm": 0.19492311775684357, + "learning_rate": 2.6448736998514115e-08, + "loss": 0.0591, + "step": 24800 + }, + { + "epoch": 73.62017804154303, + "grad_norm": 32.038116455078125, + "learning_rate": 2.6419019316493314e-08, + "loss": 0.2595, + "step": 24810 + }, + { + "epoch": 73.64985163204747, + "grad_norm": 1.2785868644714355, + "learning_rate": 2.638930163447251e-08, + "loss": 0.1782, + "step": 24820 + }, + { + "epoch": 73.67952522255193, + "grad_norm": 23.27189064025879, + "learning_rate": 2.635958395245171e-08, + "loss": 0.1321, + "step": 24830 + }, + { + "epoch": 73.70919881305637, + "grad_norm": 7.24150276184082, + "learning_rate": 2.6329866270430907e-08, + "loss": 0.1328, + "step": 24840 + }, + { + "epoch": 73.73887240356083, + "grad_norm": 0.513422966003418, + "learning_rate": 2.6300148588410103e-08, + "loss": 0.0779, + "step": 24850 + }, + { + "epoch": 73.76854599406528, + "grad_norm": 0.2550732493400574, + "learning_rate": 2.62704309063893e-08, + "loss": 0.0993, + "step": 24860 + }, + { + "epoch": 73.79821958456974, + "grad_norm": 14.442099571228027, + "learning_rate": 2.62407132243685e-08, + "loss": 0.3045, + "step": 24870 + }, + { + "epoch": 73.82789317507418, + "grad_norm": 0.22222663462162018, + "learning_rate": 2.6210995542347696e-08, + "loss": 0.0942, + "step": 24880 + }, + { + "epoch": 73.85756676557864, + "grad_norm": 14.221259117126465, + "learning_rate": 2.6181277860326895e-08, + "loss": 0.0986, + "step": 24890 + }, + { + "epoch": 73.88724035608308, + "grad_norm": 1.9469053745269775, + "learning_rate": 2.6151560178306094e-08, + "loss": 0.0784, + "step": 24900 + }, + { + "epoch": 73.91691394658754, + "grad_norm": 1.2633998394012451, + "learning_rate": 2.612184249628529e-08, + "loss": 0.0793, + "step": 24910 + }, + { + "epoch": 73.94658753709199, + "grad_norm": 16.340274810791016, + "learning_rate": 2.6092124814264488e-08, + "loss": 0.0344, + "step": 24920 + }, + { + "epoch": 73.97626112759644, + "grad_norm": 7.996826171875, + "learning_rate": 2.6062407132243687e-08, + "loss": 0.0837, + "step": 24930 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21323858201503754, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4616, + "eval_samples_per_second": 138.974, + "eval_steps_per_second": 17.488, + "step": 24938 + }, + { + "epoch": 74.00593471810089, + "grad_norm": 21.485498428344727, + "learning_rate": 2.6032689450222882e-08, + "loss": 0.0351, + "step": 24940 + }, + { + "epoch": 74.03560830860535, + "grad_norm": 0.7053015828132629, + "learning_rate": 2.600297176820208e-08, + "loss": 0.1028, + "step": 24950 + }, + { + "epoch": 74.06528189910979, + "grad_norm": 0.48565417528152466, + "learning_rate": 2.597325408618128e-08, + "loss": 0.0421, + "step": 24960 + }, + { + "epoch": 74.09495548961425, + "grad_norm": 0.9154394865036011, + "learning_rate": 2.5943536404160475e-08, + "loss": 0.1323, + "step": 24970 + }, + { + "epoch": 74.1246290801187, + "grad_norm": 0.36087489128112793, + "learning_rate": 2.5913818722139674e-08, + "loss": 0.084, + "step": 24980 + }, + { + "epoch": 74.15430267062314, + "grad_norm": 0.044180337339639664, + "learning_rate": 2.5884101040118873e-08, + "loss": 0.0371, + "step": 24990 + }, + { + "epoch": 74.1839762611276, + "grad_norm": 0.9429210424423218, + "learning_rate": 2.585438335809807e-08, + "loss": 0.0542, + "step": 25000 + }, + { + "epoch": 74.21364985163204, + "grad_norm": 8.689069747924805, + "learning_rate": 2.5824665676077267e-08, + "loss": 0.0686, + "step": 25010 + }, + { + "epoch": 74.2433234421365, + "grad_norm": 12.095833778381348, + "learning_rate": 2.5794947994056466e-08, + "loss": 0.1257, + "step": 25020 + }, + { + "epoch": 74.27299703264094, + "grad_norm": 11.239665031433105, + "learning_rate": 2.5765230312035658e-08, + "loss": 0.1982, + "step": 25030 + }, + { + "epoch": 74.3026706231454, + "grad_norm": 0.07872098684310913, + "learning_rate": 2.5735512630014854e-08, + "loss": 0.2934, + "step": 25040 + }, + { + "epoch": 74.33234421364985, + "grad_norm": 24.6083984375, + "learning_rate": 2.5705794947994053e-08, + "loss": 0.1275, + "step": 25050 + }, + { + "epoch": 74.3620178041543, + "grad_norm": 0.9260368943214417, + "learning_rate": 2.567607726597325e-08, + "loss": 0.0202, + "step": 25060 + }, + { + "epoch": 74.39169139465875, + "grad_norm": 0.483385294675827, + "learning_rate": 2.5646359583952447e-08, + "loss": 0.0765, + "step": 25070 + }, + { + "epoch": 74.42136498516321, + "grad_norm": 13.815287590026855, + "learning_rate": 2.5616641901931646e-08, + "loss": 0.1485, + "step": 25080 + }, + { + "epoch": 74.45103857566765, + "grad_norm": 24.407188415527344, + "learning_rate": 2.5586924219910845e-08, + "loss": 0.0949, + "step": 25090 + }, + { + "epoch": 74.48071216617211, + "grad_norm": 19.66283416748047, + "learning_rate": 2.555720653789004e-08, + "loss": 0.075, + "step": 25100 + }, + { + "epoch": 74.51038575667656, + "grad_norm": 0.785720944404602, + "learning_rate": 2.552748885586924e-08, + "loss": 0.0584, + "step": 25110 + }, + { + "epoch": 74.54005934718101, + "grad_norm": 12.038480758666992, + "learning_rate": 2.5497771173848438e-08, + "loss": 0.0146, + "step": 25120 + }, + { + "epoch": 74.56973293768546, + "grad_norm": 0.022142969071865082, + "learning_rate": 2.5468053491827633e-08, + "loss": 0.1039, + "step": 25130 + }, + { + "epoch": 74.59940652818992, + "grad_norm": 0.0834992453455925, + "learning_rate": 2.5438335809806832e-08, + "loss": 0.0844, + "step": 25140 + }, + { + "epoch": 74.62908011869436, + "grad_norm": 0.7954944372177124, + "learning_rate": 2.540861812778603e-08, + "loss": 0.0259, + "step": 25150 + }, + { + "epoch": 74.65875370919882, + "grad_norm": 1.6571975946426392, + "learning_rate": 2.5378900445765226e-08, + "loss": 0.074, + "step": 25160 + }, + { + "epoch": 74.68842729970326, + "grad_norm": 1.0592490434646606, + "learning_rate": 2.5349182763744425e-08, + "loss": 0.216, + "step": 25170 + }, + { + "epoch": 74.71810089020771, + "grad_norm": 5.199361324310303, + "learning_rate": 2.5319465081723624e-08, + "loss": 0.1591, + "step": 25180 + }, + { + "epoch": 74.74777448071217, + "grad_norm": 0.05440591648221016, + "learning_rate": 2.528974739970282e-08, + "loss": 0.2293, + "step": 25190 + }, + { + "epoch": 74.77744807121661, + "grad_norm": 0.06045141816139221, + "learning_rate": 2.5260029717682018e-08, + "loss": 0.0749, + "step": 25200 + }, + { + "epoch": 74.80712166172107, + "grad_norm": 0.035291604697704315, + "learning_rate": 2.5230312035661217e-08, + "loss": 0.0971, + "step": 25210 + }, + { + "epoch": 74.83679525222551, + "grad_norm": 0.553788959980011, + "learning_rate": 2.5200594353640413e-08, + "loss": 0.0871, + "step": 25220 + }, + { + "epoch": 74.86646884272997, + "grad_norm": 0.08223233371973038, + "learning_rate": 2.517087667161961e-08, + "loss": 0.0353, + "step": 25230 + }, + { + "epoch": 74.89614243323442, + "grad_norm": 0.9288651943206787, + "learning_rate": 2.514115898959881e-08, + "loss": 0.1236, + "step": 25240 + }, + { + "epoch": 74.92581602373888, + "grad_norm": 11.224554061889648, + "learning_rate": 2.5111441307578006e-08, + "loss": 0.077, + "step": 25250 + }, + { + "epoch": 74.95548961424332, + "grad_norm": 3.580038070678711, + "learning_rate": 2.5081723625557205e-08, + "loss": 0.166, + "step": 25260 + }, + { + "epoch": 74.98516320474778, + "grad_norm": 0.8281358480453491, + "learning_rate": 2.5052005943536403e-08, + "loss": 0.0628, + "step": 25270 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2134045511484146, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4397, + "eval_samples_per_second": 139.448, + "eval_steps_per_second": 17.547, + "step": 25275 + }, + { + "epoch": 75.01483679525222, + "grad_norm": 5.448498725891113, + "learning_rate": 2.50222882615156e-08, + "loss": 0.2072, + "step": 25280 + }, + { + "epoch": 75.04451038575668, + "grad_norm": 19.68935203552246, + "learning_rate": 2.4992570579494798e-08, + "loss": 0.1974, + "step": 25290 + }, + { + "epoch": 75.07418397626112, + "grad_norm": 0.48606744408607483, + "learning_rate": 2.4962852897473997e-08, + "loss": 0.0098, + "step": 25300 + }, + { + "epoch": 75.10385756676558, + "grad_norm": 2.0884244441986084, + "learning_rate": 2.4933135215453195e-08, + "loss": 0.0751, + "step": 25310 + }, + { + "epoch": 75.13353115727003, + "grad_norm": 5.487764358520508, + "learning_rate": 2.490341753343239e-08, + "loss": 0.1447, + "step": 25320 + }, + { + "epoch": 75.16320474777449, + "grad_norm": 0.41616198420524597, + "learning_rate": 2.487369985141159e-08, + "loss": 0.1476, + "step": 25330 + }, + { + "epoch": 75.19287833827893, + "grad_norm": 1.4823486804962158, + "learning_rate": 2.484398216939079e-08, + "loss": 0.185, + "step": 25340 + }, + { + "epoch": 75.22255192878339, + "grad_norm": 0.3554471433162689, + "learning_rate": 2.4814264487369984e-08, + "loss": 0.2014, + "step": 25350 + }, + { + "epoch": 75.25222551928783, + "grad_norm": 5.722182750701904, + "learning_rate": 2.4784546805349183e-08, + "loss": 0.0986, + "step": 25360 + }, + { + "epoch": 75.28189910979229, + "grad_norm": 7.047977924346924, + "learning_rate": 2.4754829123328382e-08, + "loss": 0.227, + "step": 25370 + }, + { + "epoch": 75.31157270029674, + "grad_norm": 1.9838703870773315, + "learning_rate": 2.4725111441307577e-08, + "loss": 0.1367, + "step": 25380 + }, + { + "epoch": 75.34124629080118, + "grad_norm": 1.1120964288711548, + "learning_rate": 2.4695393759286776e-08, + "loss": 0.041, + "step": 25390 + }, + { + "epoch": 75.37091988130564, + "grad_norm": 0.5453504920005798, + "learning_rate": 2.4665676077265975e-08, + "loss": 0.0865, + "step": 25400 + }, + { + "epoch": 75.40059347181008, + "grad_norm": 0.4922330379486084, + "learning_rate": 2.463595839524517e-08, + "loss": 0.0959, + "step": 25410 + }, + { + "epoch": 75.43026706231454, + "grad_norm": 4.9285359382629395, + "learning_rate": 2.4606240713224366e-08, + "loss": 0.1058, + "step": 25420 + }, + { + "epoch": 75.45994065281899, + "grad_norm": 19.605525970458984, + "learning_rate": 2.4576523031203565e-08, + "loss": 0.1285, + "step": 25430 + }, + { + "epoch": 75.48961424332344, + "grad_norm": 0.11149556189775467, + "learning_rate": 2.454680534918276e-08, + "loss": 0.0988, + "step": 25440 + }, + { + "epoch": 75.51928783382789, + "grad_norm": 1.8115448951721191, + "learning_rate": 2.451708766716196e-08, + "loss": 0.0187, + "step": 25450 + }, + { + "epoch": 75.54896142433235, + "grad_norm": 0.1558704674243927, + "learning_rate": 2.4487369985141158e-08, + "loss": 0.1053, + "step": 25460 + }, + { + "epoch": 75.57863501483679, + "grad_norm": 1.0271644592285156, + "learning_rate": 2.4457652303120353e-08, + "loss": 0.0753, + "step": 25470 + }, + { + "epoch": 75.60830860534125, + "grad_norm": 8.253524780273438, + "learning_rate": 2.4427934621099552e-08, + "loss": 0.0384, + "step": 25480 + }, + { + "epoch": 75.6379821958457, + "grad_norm": 0.8538500070571899, + "learning_rate": 2.439821693907875e-08, + "loss": 0.0257, + "step": 25490 + }, + { + "epoch": 75.66765578635015, + "grad_norm": 13.795876502990723, + "learning_rate": 2.4368499257057946e-08, + "loss": 0.0635, + "step": 25500 + }, + { + "epoch": 75.6973293768546, + "grad_norm": 4.328965663909912, + "learning_rate": 2.4338781575037145e-08, + "loss": 0.0964, + "step": 25510 + }, + { + "epoch": 75.72700296735906, + "grad_norm": 16.013851165771484, + "learning_rate": 2.4309063893016344e-08, + "loss": 0.0429, + "step": 25520 + }, + { + "epoch": 75.7566765578635, + "grad_norm": 1.2509129047393799, + "learning_rate": 2.427934621099554e-08, + "loss": 0.1053, + "step": 25530 + }, + { + "epoch": 75.78635014836796, + "grad_norm": 2.8325133323669434, + "learning_rate": 2.424962852897474e-08, + "loss": 0.0869, + "step": 25540 + }, + { + "epoch": 75.8160237388724, + "grad_norm": 17.708765029907227, + "learning_rate": 2.4219910846953937e-08, + "loss": 0.1593, + "step": 25550 + }, + { + "epoch": 75.84569732937686, + "grad_norm": 24.657751083374023, + "learning_rate": 2.4190193164933133e-08, + "loss": 0.1618, + "step": 25560 + }, + { + "epoch": 75.8753709198813, + "grad_norm": 2.7630531787872314, + "learning_rate": 2.416047548291233e-08, + "loss": 0.026, + "step": 25570 + }, + { + "epoch": 75.90504451038575, + "grad_norm": 0.518692135810852, + "learning_rate": 2.413075780089153e-08, + "loss": 0.0749, + "step": 25580 + }, + { + "epoch": 75.93471810089021, + "grad_norm": 0.06567501276731491, + "learning_rate": 2.4101040118870726e-08, + "loss": 0.0641, + "step": 25590 + }, + { + "epoch": 75.96439169139465, + "grad_norm": 20.12701416015625, + "learning_rate": 2.4071322436849925e-08, + "loss": 0.1104, + "step": 25600 + }, + { + "epoch": 75.99406528189911, + "grad_norm": 13.88818359375, + "learning_rate": 2.4041604754829123e-08, + "loss": 0.1776, + "step": 25610 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21316297352313995, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5231, + "eval_samples_per_second": 137.664, + "eval_steps_per_second": 17.323, + "step": 25612 + }, + { + "epoch": 76.02373887240356, + "grad_norm": 1.1475058794021606, + "learning_rate": 2.401188707280832e-08, + "loss": 0.0723, + "step": 25620 + }, + { + "epoch": 76.05341246290801, + "grad_norm": 0.1814432144165039, + "learning_rate": 2.3982169390787518e-08, + "loss": 0.025, + "step": 25630 + }, + { + "epoch": 76.08308605341246, + "grad_norm": 0.1719997078180313, + "learning_rate": 2.3952451708766717e-08, + "loss": 0.1252, + "step": 25640 + }, + { + "epoch": 76.11275964391692, + "grad_norm": 39.92985534667969, + "learning_rate": 2.3922734026745915e-08, + "loss": 0.0653, + "step": 25650 + }, + { + "epoch": 76.14243323442136, + "grad_norm": 14.16137409210205, + "learning_rate": 2.389301634472511e-08, + "loss": 0.1498, + "step": 25660 + }, + { + "epoch": 76.17210682492582, + "grad_norm": 1.2290475368499756, + "learning_rate": 2.386329866270431e-08, + "loss": 0.0375, + "step": 25670 + }, + { + "epoch": 76.20178041543026, + "grad_norm": 0.6803627014160156, + "learning_rate": 2.383358098068351e-08, + "loss": 0.0221, + "step": 25680 + }, + { + "epoch": 76.23145400593472, + "grad_norm": 23.389780044555664, + "learning_rate": 2.38038632986627e-08, + "loss": 0.0645, + "step": 25690 + }, + { + "epoch": 76.26112759643917, + "grad_norm": 0.2557353973388672, + "learning_rate": 2.37741456166419e-08, + "loss": 0.1225, + "step": 25700 + }, + { + "epoch": 76.29080118694363, + "grad_norm": 0.21505314111709595, + "learning_rate": 2.37444279346211e-08, + "loss": 0.069, + "step": 25710 + }, + { + "epoch": 76.32047477744807, + "grad_norm": 26.22347068786621, + "learning_rate": 2.3714710252600294e-08, + "loss": 0.0635, + "step": 25720 + }, + { + "epoch": 76.35014836795253, + "grad_norm": 18.83198356628418, + "learning_rate": 2.3684992570579493e-08, + "loss": 0.1459, + "step": 25730 + }, + { + "epoch": 76.37982195845697, + "grad_norm": 11.645444869995117, + "learning_rate": 2.365527488855869e-08, + "loss": 0.19, + "step": 25740 + }, + { + "epoch": 76.40949554896143, + "grad_norm": 0.2236717790365219, + "learning_rate": 2.3625557206537887e-08, + "loss": 0.0342, + "step": 25750 + }, + { + "epoch": 76.43916913946587, + "grad_norm": 17.46052360534668, + "learning_rate": 2.3595839524517086e-08, + "loss": 0.1534, + "step": 25760 + }, + { + "epoch": 76.46884272997033, + "grad_norm": 12.907841682434082, + "learning_rate": 2.3566121842496285e-08, + "loss": 0.212, + "step": 25770 + }, + { + "epoch": 76.49851632047478, + "grad_norm": 0.07877858728170395, + "learning_rate": 2.353640416047548e-08, + "loss": 0.154, + "step": 25780 + }, + { + "epoch": 76.52818991097922, + "grad_norm": 0.07683785259723663, + "learning_rate": 2.350668647845468e-08, + "loss": 0.1206, + "step": 25790 + }, + { + "epoch": 76.55786350148368, + "grad_norm": 32.758216857910156, + "learning_rate": 2.3476968796433878e-08, + "loss": 0.1235, + "step": 25800 + }, + { + "epoch": 76.58753709198812, + "grad_norm": 27.953744888305664, + "learning_rate": 2.3447251114413073e-08, + "loss": 0.0823, + "step": 25810 + }, + { + "epoch": 76.61721068249258, + "grad_norm": 1.0265517234802246, + "learning_rate": 2.3417533432392272e-08, + "loss": 0.0437, + "step": 25820 + }, + { + "epoch": 76.64688427299703, + "grad_norm": 11.69684886932373, + "learning_rate": 2.338781575037147e-08, + "loss": 0.0216, + "step": 25830 + }, + { + "epoch": 76.67655786350149, + "grad_norm": 1.1309185028076172, + "learning_rate": 2.3358098068350666e-08, + "loss": 0.0789, + "step": 25840 + }, + { + "epoch": 76.70623145400593, + "grad_norm": 1.636128306388855, + "learning_rate": 2.3328380386329865e-08, + "loss": 0.1393, + "step": 25850 + }, + { + "epoch": 76.73590504451039, + "grad_norm": 10.685659408569336, + "learning_rate": 2.3298662704309064e-08, + "loss": 0.0304, + "step": 25860 + }, + { + "epoch": 76.76557863501483, + "grad_norm": 0.2311255931854248, + "learning_rate": 2.326894502228826e-08, + "loss": 0.2024, + "step": 25870 + }, + { + "epoch": 76.79525222551929, + "grad_norm": 0.061885036528110504, + "learning_rate": 2.323922734026746e-08, + "loss": 0.1066, + "step": 25880 + }, + { + "epoch": 76.82492581602374, + "grad_norm": 0.20818279683589935, + "learning_rate": 2.3209509658246657e-08, + "loss": 0.1274, + "step": 25890 + }, + { + "epoch": 76.8545994065282, + "grad_norm": 0.03082026168704033, + "learning_rate": 2.3179791976225853e-08, + "loss": 0.2069, + "step": 25900 + }, + { + "epoch": 76.88427299703264, + "grad_norm": 0.09540823847055435, + "learning_rate": 2.315007429420505e-08, + "loss": 0.0978, + "step": 25910 + }, + { + "epoch": 76.9139465875371, + "grad_norm": 0.14996585249900818, + "learning_rate": 2.312035661218425e-08, + "loss": 0.0571, + "step": 25920 + }, + { + "epoch": 76.94362017804154, + "grad_norm": 45.92282485961914, + "learning_rate": 2.3090638930163446e-08, + "loss": 0.0454, + "step": 25930 + }, + { + "epoch": 76.973293768546, + "grad_norm": 2.1909797191619873, + "learning_rate": 2.3060921248142645e-08, + "loss": 0.1058, + "step": 25940 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21332357823848724, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5216, + "eval_samples_per_second": 137.696, + "eval_steps_per_second": 17.327, + "step": 25949 + }, + { + "epoch": 77.00296735905044, + "grad_norm": 0.10912071913480759, + "learning_rate": 2.303120356612184e-08, + "loss": 0.0189, + "step": 25950 + }, + { + "epoch": 77.0326409495549, + "grad_norm": 0.05919058620929718, + "learning_rate": 2.300148588410104e-08, + "loss": 0.1476, + "step": 25960 + }, + { + "epoch": 77.06231454005935, + "grad_norm": 0.1942834109067917, + "learning_rate": 2.2971768202080235e-08, + "loss": 0.1097, + "step": 25970 + }, + { + "epoch": 77.09198813056379, + "grad_norm": 0.23527885973453522, + "learning_rate": 2.2942050520059433e-08, + "loss": 0.0742, + "step": 25980 + }, + { + "epoch": 77.12166172106825, + "grad_norm": 40.24421691894531, + "learning_rate": 2.2912332838038632e-08, + "loss": 0.1933, + "step": 25990 + }, + { + "epoch": 77.1513353115727, + "grad_norm": 0.12766513228416443, + "learning_rate": 2.2882615156017828e-08, + "loss": 0.0777, + "step": 26000 + }, + { + "epoch": 77.18100890207715, + "grad_norm": 0.03798043727874756, + "learning_rate": 2.2852897473997027e-08, + "loss": 0.0357, + "step": 26010 + }, + { + "epoch": 77.2106824925816, + "grad_norm": 14.990479469299316, + "learning_rate": 2.2823179791976225e-08, + "loss": 0.0962, + "step": 26020 + }, + { + "epoch": 77.24035608308606, + "grad_norm": 2.8798444271087646, + "learning_rate": 2.279346210995542e-08, + "loss": 0.2104, + "step": 26030 + }, + { + "epoch": 77.2700296735905, + "grad_norm": 0.5706474781036377, + "learning_rate": 2.276374442793462e-08, + "loss": 0.1136, + "step": 26040 + }, + { + "epoch": 77.29970326409496, + "grad_norm": 0.23117579519748688, + "learning_rate": 2.273402674591382e-08, + "loss": 0.1115, + "step": 26050 + }, + { + "epoch": 77.3293768545994, + "grad_norm": 17.225099563598633, + "learning_rate": 2.2704309063893014e-08, + "loss": 0.1037, + "step": 26060 + }, + { + "epoch": 77.35905044510386, + "grad_norm": 16.16424560546875, + "learning_rate": 2.2674591381872213e-08, + "loss": 0.2318, + "step": 26070 + }, + { + "epoch": 77.3887240356083, + "grad_norm": 16.156719207763672, + "learning_rate": 2.264487369985141e-08, + "loss": 0.0151, + "step": 26080 + }, + { + "epoch": 77.41839762611276, + "grad_norm": 0.2856292724609375, + "learning_rate": 2.2615156017830607e-08, + "loss": 0.0129, + "step": 26090 + }, + { + "epoch": 77.44807121661721, + "grad_norm": 0.34148547053337097, + "learning_rate": 2.2585438335809806e-08, + "loss": 0.1025, + "step": 26100 + }, + { + "epoch": 77.47774480712167, + "grad_norm": 11.188664436340332, + "learning_rate": 2.2555720653789005e-08, + "loss": 0.1048, + "step": 26110 + }, + { + "epoch": 77.50741839762611, + "grad_norm": 0.3945986330509186, + "learning_rate": 2.25260029717682e-08, + "loss": 0.0817, + "step": 26120 + }, + { + "epoch": 77.53709198813057, + "grad_norm": 0.14263637363910675, + "learning_rate": 2.24962852897474e-08, + "loss": 0.0816, + "step": 26130 + }, + { + "epoch": 77.56676557863501, + "grad_norm": 0.056312888860702515, + "learning_rate": 2.2466567607726598e-08, + "loss": 0.095, + "step": 26140 + }, + { + "epoch": 77.59643916913947, + "grad_norm": 0.2972889840602875, + "learning_rate": 2.2436849925705793e-08, + "loss": 0.0508, + "step": 26150 + }, + { + "epoch": 77.62611275964392, + "grad_norm": 0.041869718581438065, + "learning_rate": 2.2407132243684992e-08, + "loss": 0.0676, + "step": 26160 + }, + { + "epoch": 77.65578635014836, + "grad_norm": 25.140268325805664, + "learning_rate": 2.237741456166419e-08, + "loss": 0.2033, + "step": 26170 + }, + { + "epoch": 77.68545994065282, + "grad_norm": 23.874086380004883, + "learning_rate": 2.2347696879643387e-08, + "loss": 0.0955, + "step": 26180 + }, + { + "epoch": 77.71513353115726, + "grad_norm": 4.0271477699279785, + "learning_rate": 2.2317979197622585e-08, + "loss": 0.1545, + "step": 26190 + }, + { + "epoch": 77.74480712166172, + "grad_norm": 12.73366928100586, + "learning_rate": 2.2288261515601784e-08, + "loss": 0.0893, + "step": 26200 + }, + { + "epoch": 77.77448071216617, + "grad_norm": 9.032092094421387, + "learning_rate": 2.225854383358098e-08, + "loss": 0.0177, + "step": 26210 + }, + { + "epoch": 77.80415430267063, + "grad_norm": 7.933441638946533, + "learning_rate": 2.2228826151560175e-08, + "loss": 0.1246, + "step": 26220 + }, + { + "epoch": 77.83382789317507, + "grad_norm": 0.13614125549793243, + "learning_rate": 2.2199108469539374e-08, + "loss": 0.1108, + "step": 26230 + }, + { + "epoch": 77.86350148367953, + "grad_norm": 12.124828338623047, + "learning_rate": 2.2169390787518573e-08, + "loss": 0.0528, + "step": 26240 + }, + { + "epoch": 77.89317507418397, + "grad_norm": 11.350160598754883, + "learning_rate": 2.2139673105497768e-08, + "loss": 0.3108, + "step": 26250 + }, + { + "epoch": 77.92284866468843, + "grad_norm": 0.1726427972316742, + "learning_rate": 2.2109955423476967e-08, + "loss": 0.1334, + "step": 26260 + }, + { + "epoch": 77.95252225519287, + "grad_norm": 0.31877174973487854, + "learning_rate": 2.2080237741456166e-08, + "loss": 0.1269, + "step": 26270 + }, + { + "epoch": 77.98219584569733, + "grad_norm": 7.224466323852539, + "learning_rate": 2.205052005943536e-08, + "loss": 0.1285, + "step": 26280 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.213401660323143, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.4618, + "eval_samples_per_second": 138.971, + "eval_steps_per_second": 17.487, + "step": 26286 + }, + { + "epoch": 78.01186943620178, + "grad_norm": 0.04212509095668793, + "learning_rate": 2.202080237741456e-08, + "loss": 0.0855, + "step": 26290 + }, + { + "epoch": 78.04154302670624, + "grad_norm": 10.362092971801758, + "learning_rate": 2.199108469539376e-08, + "loss": 0.0372, + "step": 26300 + }, + { + "epoch": 78.07121661721068, + "grad_norm": 13.871867179870605, + "learning_rate": 2.1961367013372955e-08, + "loss": 0.1931, + "step": 26310 + }, + { + "epoch": 78.10089020771514, + "grad_norm": 20.267847061157227, + "learning_rate": 2.1931649331352153e-08, + "loss": 0.0891, + "step": 26320 + }, + { + "epoch": 78.13056379821958, + "grad_norm": 0.7396399974822998, + "learning_rate": 2.1901931649331352e-08, + "loss": 0.1666, + "step": 26330 + }, + { + "epoch": 78.16023738872404, + "grad_norm": 1.1470648050308228, + "learning_rate": 2.1872213967310548e-08, + "loss": 0.1266, + "step": 26340 + }, + { + "epoch": 78.18991097922849, + "grad_norm": 5.975472450256348, + "learning_rate": 2.1842496285289747e-08, + "loss": 0.1085, + "step": 26350 + }, + { + "epoch": 78.21958456973294, + "grad_norm": 18.084653854370117, + "learning_rate": 2.1812778603268945e-08, + "loss": 0.096, + "step": 26360 + }, + { + "epoch": 78.24925816023739, + "grad_norm": 0.020633451640605927, + "learning_rate": 2.178306092124814e-08, + "loss": 0.0398, + "step": 26370 + }, + { + "epoch": 78.27893175074183, + "grad_norm": 7.701976776123047, + "learning_rate": 2.175334323922734e-08, + "loss": 0.1074, + "step": 26380 + }, + { + "epoch": 78.30860534124629, + "grad_norm": 0.6263688802719116, + "learning_rate": 2.172362555720654e-08, + "loss": 0.0932, + "step": 26390 + }, + { + "epoch": 78.33827893175074, + "grad_norm": 8.493802070617676, + "learning_rate": 2.1693907875185734e-08, + "loss": 0.2231, + "step": 26400 + }, + { + "epoch": 78.3679525222552, + "grad_norm": 0.01818101294338703, + "learning_rate": 2.1664190193164933e-08, + "loss": 0.0605, + "step": 26410 + }, + { + "epoch": 78.39762611275964, + "grad_norm": 12.521854400634766, + "learning_rate": 2.1634472511144132e-08, + "loss": 0.1077, + "step": 26420 + }, + { + "epoch": 78.4272997032641, + "grad_norm": 0.11882145702838898, + "learning_rate": 2.1604754829123327e-08, + "loss": 0.1338, + "step": 26430 + }, + { + "epoch": 78.45697329376854, + "grad_norm": 31.491666793823242, + "learning_rate": 2.1575037147102526e-08, + "loss": 0.2009, + "step": 26440 + }, + { + "epoch": 78.486646884273, + "grad_norm": 5.230401039123535, + "learning_rate": 2.1545319465081725e-08, + "loss": 0.0749, + "step": 26450 + }, + { + "epoch": 78.51632047477744, + "grad_norm": 23.666162490844727, + "learning_rate": 2.151560178306092e-08, + "loss": 0.2507, + "step": 26460 + }, + { + "epoch": 78.5459940652819, + "grad_norm": 2.4085304737091064, + "learning_rate": 2.148588410104012e-08, + "loss": 0.0585, + "step": 26470 + }, + { + "epoch": 78.57566765578635, + "grad_norm": 0.09617044776678085, + "learning_rate": 2.1456166419019315e-08, + "loss": 0.0766, + "step": 26480 + }, + { + "epoch": 78.6053412462908, + "grad_norm": 0.04346499592065811, + "learning_rate": 2.1426448736998513e-08, + "loss": 0.0113, + "step": 26490 + }, + { + "epoch": 78.63501483679525, + "grad_norm": 0.0854620411992073, + "learning_rate": 2.139673105497771e-08, + "loss": 0.1765, + "step": 26500 + }, + { + "epoch": 78.66468842729971, + "grad_norm": 18.375276565551758, + "learning_rate": 2.1367013372956908e-08, + "loss": 0.2732, + "step": 26510 + }, + { + "epoch": 78.69436201780415, + "grad_norm": 3.672478199005127, + "learning_rate": 2.1337295690936107e-08, + "loss": 0.0844, + "step": 26520 + }, + { + "epoch": 78.72403560830861, + "grad_norm": 7.994017124176025, + "learning_rate": 2.1307578008915302e-08, + "loss": 0.065, + "step": 26530 + }, + { + "epoch": 78.75370919881306, + "grad_norm": 1.933627963066101, + "learning_rate": 2.12778603268945e-08, + "loss": 0.1141, + "step": 26540 + }, + { + "epoch": 78.78338278931751, + "grad_norm": 0.17147549986839294, + "learning_rate": 2.12481426448737e-08, + "loss": 0.0704, + "step": 26550 + }, + { + "epoch": 78.81305637982196, + "grad_norm": 17.248750686645508, + "learning_rate": 2.1218424962852895e-08, + "loss": 0.0783, + "step": 26560 + }, + { + "epoch": 78.8427299703264, + "grad_norm": 34.62567138671875, + "learning_rate": 2.1188707280832094e-08, + "loss": 0.0878, + "step": 26570 + }, + { + "epoch": 78.87240356083086, + "grad_norm": 3.336759567260742, + "learning_rate": 2.1158989598811293e-08, + "loss": 0.1439, + "step": 26580 + }, + { + "epoch": 78.9020771513353, + "grad_norm": 0.39773112535476685, + "learning_rate": 2.112927191679049e-08, + "loss": 0.1191, + "step": 26590 + }, + { + "epoch": 78.93175074183976, + "grad_norm": 0.31942427158355713, + "learning_rate": 2.1099554234769687e-08, + "loss": 0.009, + "step": 26600 + }, + { + "epoch": 78.96142433234421, + "grad_norm": 16.504981994628906, + "learning_rate": 2.1069836552748886e-08, + "loss": 0.116, + "step": 26610 + }, + { + "epoch": 78.99109792284867, + "grad_norm": 0.2202264964580536, + "learning_rate": 2.104011887072808e-08, + "loss": 0.1778, + "step": 26620 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21352869272232056, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 6.5447, + "eval_samples_per_second": 137.21, + "eval_steps_per_second": 17.266, + "step": 26623 + }, + { + "epoch": 79.02077151335311, + "grad_norm": 2.8517987728118896, + "learning_rate": 2.101040118870728e-08, + "loss": 0.1093, + "step": 26630 + }, + { + "epoch": 79.05044510385757, + "grad_norm": 4.262632846832275, + "learning_rate": 2.098068350668648e-08, + "loss": 0.0121, + "step": 26640 + }, + { + "epoch": 79.08011869436201, + "grad_norm": 1.357006311416626, + "learning_rate": 2.0950965824665675e-08, + "loss": 0.186, + "step": 26650 + }, + { + "epoch": 79.10979228486647, + "grad_norm": 0.022943919524550438, + "learning_rate": 2.0921248142644873e-08, + "loss": 0.1096, + "step": 26660 + }, + { + "epoch": 79.13946587537092, + "grad_norm": 0.2606510519981384, + "learning_rate": 2.0891530460624072e-08, + "loss": 0.0753, + "step": 26670 + }, + { + "epoch": 79.16913946587538, + "grad_norm": 0.25011762976646423, + "learning_rate": 2.0861812778603268e-08, + "loss": 0.2275, + "step": 26680 + }, + { + "epoch": 79.19881305637982, + "grad_norm": 0.036467213183641434, + "learning_rate": 2.0832095096582467e-08, + "loss": 0.1989, + "step": 26690 + }, + { + "epoch": 79.22848664688428, + "grad_norm": 0.7142317891120911, + "learning_rate": 2.0802377414561665e-08, + "loss": 0.0532, + "step": 26700 + }, + { + "epoch": 79.25816023738872, + "grad_norm": 0.9788093566894531, + "learning_rate": 2.077265973254086e-08, + "loss": 0.0876, + "step": 26710 + }, + { + "epoch": 79.28783382789318, + "grad_norm": 1.9511228799819946, + "learning_rate": 2.074294205052006e-08, + "loss": 0.0355, + "step": 26720 + }, + { + "epoch": 79.31750741839762, + "grad_norm": 0.11181000620126724, + "learning_rate": 2.071322436849926e-08, + "loss": 0.1182, + "step": 26730 + }, + { + "epoch": 79.34718100890208, + "grad_norm": 13.84111499786377, + "learning_rate": 2.068350668647845e-08, + "loss": 0.2738, + "step": 26740 + }, + { + "epoch": 79.37685459940653, + "grad_norm": 31.973907470703125, + "learning_rate": 2.065378900445765e-08, + "loss": 0.2642, + "step": 26750 + }, + { + "epoch": 79.40652818991099, + "grad_norm": 23.0803165435791, + "learning_rate": 2.062407132243685e-08, + "loss": 0.1224, + "step": 26760 + }, + { + "epoch": 79.43620178041543, + "grad_norm": 0.7115737795829773, + "learning_rate": 2.0594353640416047e-08, + "loss": 0.0738, + "step": 26770 + }, + { + "epoch": 79.46587537091987, + "grad_norm": 0.4020480811595917, + "learning_rate": 2.0564635958395243e-08, + "loss": 0.0351, + "step": 26780 + }, + { + "epoch": 79.49554896142433, + "grad_norm": 0.0648675337433815, + "learning_rate": 2.053491827637444e-08, + "loss": 0.017, + "step": 26790 + }, + { + "epoch": 79.52522255192878, + "grad_norm": 5.650177001953125, + "learning_rate": 2.050520059435364e-08, + "loss": 0.1657, + "step": 26800 + }, + { + "epoch": 79.55489614243324, + "grad_norm": 0.0836738795042038, + "learning_rate": 2.0475482912332836e-08, + "loss": 0.0341, + "step": 26810 + }, + { + "epoch": 79.58456973293768, + "grad_norm": 0.9935275316238403, + "learning_rate": 2.0445765230312035e-08, + "loss": 0.0799, + "step": 26820 + }, + { + "epoch": 79.61424332344214, + "grad_norm": 0.5052889585494995, + "learning_rate": 2.0416047548291234e-08, + "loss": 0.0339, + "step": 26830 + }, + { + "epoch": 79.64391691394658, + "grad_norm": 0.13222220540046692, + "learning_rate": 2.038632986627043e-08, + "loss": 0.0429, + "step": 26840 + }, + { + "epoch": 79.67359050445104, + "grad_norm": 0.15523843467235565, + "learning_rate": 2.0356612184249628e-08, + "loss": 0.1804, + "step": 26850 + }, + { + "epoch": 79.70326409495549, + "grad_norm": 1.961499810218811, + "learning_rate": 2.0326894502228827e-08, + "loss": 0.104, + "step": 26860 + }, + { + "epoch": 79.73293768545994, + "grad_norm": 0.04563461244106293, + "learning_rate": 2.0297176820208022e-08, + "loss": 0.0556, + "step": 26870 + }, + { + "epoch": 79.76261127596439, + "grad_norm": 17.272512435913086, + "learning_rate": 2.026745913818722e-08, + "loss": 0.1188, + "step": 26880 + }, + { + "epoch": 79.79228486646885, + "grad_norm": 0.02312118001282215, + "learning_rate": 2.023774145616642e-08, + "loss": 0.0697, + "step": 26890 + }, + { + "epoch": 79.82195845697329, + "grad_norm": 3.2783055305480957, + "learning_rate": 2.0208023774145615e-08, + "loss": 0.0544, + "step": 26900 + }, + { + "epoch": 79.85163204747775, + "grad_norm": 1.9862334728240967, + "learning_rate": 2.0178306092124814e-08, + "loss": 0.0702, + "step": 26910 + }, + { + "epoch": 79.8813056379822, + "grad_norm": 0.9559428095817566, + "learning_rate": 2.0148588410104013e-08, + "loss": 0.2199, + "step": 26920 + }, + { + "epoch": 79.91097922848665, + "grad_norm": 0.5850972533226013, + "learning_rate": 2.011887072808321e-08, + "loss": 0.0864, + "step": 26930 + }, + { + "epoch": 79.9406528189911, + "grad_norm": 0.09542850404977798, + "learning_rate": 2.0089153046062407e-08, + "loss": 0.099, + "step": 26940 + }, + { + "epoch": 79.97032640949556, + "grad_norm": 2.2142436504364014, + "learning_rate": 2.0059435364041606e-08, + "loss": 0.0117, + "step": 26950 + }, + { + "epoch": 80.0, + "grad_norm": 0.020497934892773628, + "learning_rate": 2.00297176820208e-08, + "loss": 0.1533, + "step": 26960 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21397604048252106, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.4154, + "eval_samples_per_second": 86.218, + "eval_steps_per_second": 10.849, + "step": 26960 + }, + { + "epoch": 80.02967359050444, + "grad_norm": 0.16922056674957275, + "learning_rate": 2e-08, + "loss": 0.1101, + "step": 26970 + }, + { + "epoch": 80.0593471810089, + "grad_norm": 2.4993057250976562, + "learning_rate": 1.99702823179792e-08, + "loss": 0.1789, + "step": 26980 + }, + { + "epoch": 80.08902077151335, + "grad_norm": 5.443178653717041, + "learning_rate": 1.9940564635958395e-08, + "loss": 0.2045, + "step": 26990 + }, + { + "epoch": 80.1186943620178, + "grad_norm": 17.931726455688477, + "learning_rate": 1.991084695393759e-08, + "loss": 0.0827, + "step": 27000 + }, + { + "epoch": 80.14836795252225, + "grad_norm": 1.3117926120758057, + "learning_rate": 1.988112927191679e-08, + "loss": 0.0623, + "step": 27010 + }, + { + "epoch": 80.17804154302671, + "grad_norm": 0.04212156683206558, + "learning_rate": 1.9851411589895985e-08, + "loss": 0.0337, + "step": 27020 + }, + { + "epoch": 80.20771513353115, + "grad_norm": 30.411760330200195, + "learning_rate": 1.9821693907875183e-08, + "loss": 0.1699, + "step": 27030 + }, + { + "epoch": 80.23738872403561, + "grad_norm": 17.083648681640625, + "learning_rate": 1.9791976225854382e-08, + "loss": 0.1834, + "step": 27040 + }, + { + "epoch": 80.26706231454006, + "grad_norm": 29.79239845275879, + "learning_rate": 1.9762258543833578e-08, + "loss": 0.1508, + "step": 27050 + }, + { + "epoch": 80.29673590504451, + "grad_norm": 0.09154559671878815, + "learning_rate": 1.9732540861812777e-08, + "loss": 0.1478, + "step": 27060 + }, + { + "epoch": 80.32640949554896, + "grad_norm": 0.22691963613033295, + "learning_rate": 1.9702823179791975e-08, + "loss": 0.0192, + "step": 27070 + }, + { + "epoch": 80.35608308605342, + "grad_norm": 6.658444881439209, + "learning_rate": 1.967310549777117e-08, + "loss": 0.1224, + "step": 27080 + }, + { + "epoch": 80.38575667655786, + "grad_norm": 1.941637396812439, + "learning_rate": 1.964338781575037e-08, + "loss": 0.0408, + "step": 27090 + }, + { + "epoch": 80.41543026706232, + "grad_norm": 0.3739467263221741, + "learning_rate": 1.961367013372957e-08, + "loss": 0.1456, + "step": 27100 + }, + { + "epoch": 80.44510385756676, + "grad_norm": 18.225467681884766, + "learning_rate": 1.9583952451708764e-08, + "loss": 0.1051, + "step": 27110 + }, + { + "epoch": 80.47477744807122, + "grad_norm": 0.07907848805189133, + "learning_rate": 1.9554234769687963e-08, + "loss": 0.0106, + "step": 27120 + }, + { + "epoch": 80.50445103857567, + "grad_norm": 0.3770519196987152, + "learning_rate": 1.952451708766716e-08, + "loss": 0.1863, + "step": 27130 + }, + { + "epoch": 80.53412462908013, + "grad_norm": 17.150890350341797, + "learning_rate": 1.949479940564636e-08, + "loss": 0.1757, + "step": 27140 + }, + { + "epoch": 80.56379821958457, + "grad_norm": 16.281253814697266, + "learning_rate": 1.9465081723625556e-08, + "loss": 0.1089, + "step": 27150 + }, + { + "epoch": 80.59347181008901, + "grad_norm": 0.32494571805000305, + "learning_rate": 1.9435364041604755e-08, + "loss": 0.115, + "step": 27160 + }, + { + "epoch": 80.62314540059347, + "grad_norm": 0.696367084980011, + "learning_rate": 1.9405646359583954e-08, + "loss": 0.0975, + "step": 27170 + }, + { + "epoch": 80.65281899109792, + "grad_norm": 0.31002581119537354, + "learning_rate": 1.937592867756315e-08, + "loss": 0.0784, + "step": 27180 + }, + { + "epoch": 80.68249258160238, + "grad_norm": 15.45845890045166, + "learning_rate": 1.9346210995542348e-08, + "loss": 0.202, + "step": 27190 + }, + { + "epoch": 80.71216617210682, + "grad_norm": 0.06308470666408539, + "learning_rate": 1.9316493313521547e-08, + "loss": 0.0784, + "step": 27200 + }, + { + "epoch": 80.74183976261128, + "grad_norm": 0.2601000666618347, + "learning_rate": 1.9286775631500742e-08, + "loss": 0.0618, + "step": 27210 + }, + { + "epoch": 80.77151335311572, + "grad_norm": 0.09470527619123459, + "learning_rate": 1.925705794947994e-08, + "loss": 0.1194, + "step": 27220 + }, + { + "epoch": 80.80118694362018, + "grad_norm": 0.0804314985871315, + "learning_rate": 1.922734026745914e-08, + "loss": 0.2132, + "step": 27230 + }, + { + "epoch": 80.83086053412462, + "grad_norm": 26.108715057373047, + "learning_rate": 1.9197622585438335e-08, + "loss": 0.0685, + "step": 27240 + }, + { + "epoch": 80.86053412462908, + "grad_norm": 21.27816390991211, + "learning_rate": 1.9167904903417534e-08, + "loss": 0.0983, + "step": 27250 + }, + { + "epoch": 80.89020771513353, + "grad_norm": 0.19262142479419708, + "learning_rate": 1.913818722139673e-08, + "loss": 0.1868, + "step": 27260 + }, + { + "epoch": 80.91988130563799, + "grad_norm": 0.8089978694915771, + "learning_rate": 1.9108469539375925e-08, + "loss": 0.1119, + "step": 27270 + }, + { + "epoch": 80.94955489614243, + "grad_norm": 0.10320446640253067, + "learning_rate": 1.9078751857355124e-08, + "loss": 0.0426, + "step": 27280 + }, + { + "epoch": 80.97922848664689, + "grad_norm": 19.24039649963379, + "learning_rate": 1.9049034175334323e-08, + "loss": 0.0597, + "step": 27290 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2139374315738678, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.88, + "eval_samples_per_second": 82.537, + "eval_steps_per_second": 10.386, + "step": 27297 + }, + { + "epoch": 81.00890207715133, + "grad_norm": 7.294345855712891, + "learning_rate": 1.901931649331352e-08, + "loss": 0.09, + "step": 27300 + }, + { + "epoch": 81.03857566765579, + "grad_norm": 0.7730750441551208, + "learning_rate": 1.8989598811292717e-08, + "loss": 0.1148, + "step": 27310 + }, + { + "epoch": 81.06824925816024, + "grad_norm": 6.2822585105896, + "learning_rate": 1.8959881129271916e-08, + "loss": 0.086, + "step": 27320 + }, + { + "epoch": 81.0979228486647, + "grad_norm": 0.738120973110199, + "learning_rate": 1.893016344725111e-08, + "loss": 0.244, + "step": 27330 + }, + { + "epoch": 81.12759643916914, + "grad_norm": 13.684704780578613, + "learning_rate": 1.890044576523031e-08, + "loss": 0.0438, + "step": 27340 + }, + { + "epoch": 81.1572700296736, + "grad_norm": 0.22157022356987, + "learning_rate": 1.887072808320951e-08, + "loss": 0.1703, + "step": 27350 + }, + { + "epoch": 81.18694362017804, + "grad_norm": 2.259430170059204, + "learning_rate": 1.8841010401188705e-08, + "loss": 0.0112, + "step": 27360 + }, + { + "epoch": 81.21661721068249, + "grad_norm": 5.085522651672363, + "learning_rate": 1.8811292719167903e-08, + "loss": 0.0538, + "step": 27370 + }, + { + "epoch": 81.24629080118694, + "grad_norm": 0.4749823808670044, + "learning_rate": 1.8781575037147102e-08, + "loss": 0.1626, + "step": 27380 + }, + { + "epoch": 81.27596439169139, + "grad_norm": 0.8185030817985535, + "learning_rate": 1.8751857355126298e-08, + "loss": 0.1443, + "step": 27390 + }, + { + "epoch": 81.30563798219585, + "grad_norm": 0.25875940918922424, + "learning_rate": 1.8722139673105497e-08, + "loss": 0.178, + "step": 27400 + }, + { + "epoch": 81.33531157270029, + "grad_norm": 23.436737060546875, + "learning_rate": 1.8692421991084695e-08, + "loss": 0.1784, + "step": 27410 + }, + { + "epoch": 81.36498516320475, + "grad_norm": 4.311031818389893, + "learning_rate": 1.866270430906389e-08, + "loss": 0.1384, + "step": 27420 + }, + { + "epoch": 81.3946587537092, + "grad_norm": 8.629722595214844, + "learning_rate": 1.863298662704309e-08, + "loss": 0.2011, + "step": 27430 + }, + { + "epoch": 81.42433234421365, + "grad_norm": 0.3798997104167938, + "learning_rate": 1.860326894502229e-08, + "loss": 0.1211, + "step": 27440 + }, + { + "epoch": 81.4540059347181, + "grad_norm": 0.19693385064601898, + "learning_rate": 1.8573551263001484e-08, + "loss": 0.0672, + "step": 27450 + }, + { + "epoch": 81.48367952522256, + "grad_norm": 0.06431056559085846, + "learning_rate": 1.8543833580980683e-08, + "loss": 0.0976, + "step": 27460 + }, + { + "epoch": 81.513353115727, + "grad_norm": 0.8909355401992798, + "learning_rate": 1.8514115898959882e-08, + "loss": 0.0603, + "step": 27470 + }, + { + "epoch": 81.54302670623146, + "grad_norm": 0.030679961666464806, + "learning_rate": 1.848439821693908e-08, + "loss": 0.0799, + "step": 27480 + }, + { + "epoch": 81.5727002967359, + "grad_norm": 0.12838003039360046, + "learning_rate": 1.8454680534918276e-08, + "loss": 0.0562, + "step": 27490 + }, + { + "epoch": 81.60237388724036, + "grad_norm": 0.3604932725429535, + "learning_rate": 1.8424962852897475e-08, + "loss": 0.0537, + "step": 27500 + }, + { + "epoch": 81.6320474777448, + "grad_norm": 10.764547348022461, + "learning_rate": 1.8395245170876674e-08, + "loss": 0.1023, + "step": 27510 + }, + { + "epoch": 81.66172106824926, + "grad_norm": 3.8812577724456787, + "learning_rate": 1.836552748885587e-08, + "loss": 0.0307, + "step": 27520 + }, + { + "epoch": 81.69139465875371, + "grad_norm": 0.28767287731170654, + "learning_rate": 1.8335809806835065e-08, + "loss": 0.069, + "step": 27530 + }, + { + "epoch": 81.72106824925817, + "grad_norm": 0.048010919243097305, + "learning_rate": 1.8306092124814263e-08, + "loss": 0.1224, + "step": 27540 + }, + { + "epoch": 81.75074183976261, + "grad_norm": 0.11187992990016937, + "learning_rate": 1.827637444279346e-08, + "loss": 0.1215, + "step": 27550 + }, + { + "epoch": 81.78041543026706, + "grad_norm": 1.023590326309204, + "learning_rate": 1.8246656760772658e-08, + "loss": 0.1653, + "step": 27560 + }, + { + "epoch": 81.81008902077151, + "grad_norm": 11.38036823272705, + "learning_rate": 1.8216939078751857e-08, + "loss": 0.1283, + "step": 27570 + }, + { + "epoch": 81.83976261127596, + "grad_norm": 11.852709770202637, + "learning_rate": 1.8187221396731052e-08, + "loss": 0.214, + "step": 27580 + }, + { + "epoch": 81.86943620178042, + "grad_norm": 0.4469172954559326, + "learning_rate": 1.815750371471025e-08, + "loss": 0.1442, + "step": 27590 + }, + { + "epoch": 81.89910979228486, + "grad_norm": 4.350419998168945, + "learning_rate": 1.812778603268945e-08, + "loss": 0.1164, + "step": 27600 + }, + { + "epoch": 81.92878338278932, + "grad_norm": 2.1953392028808594, + "learning_rate": 1.8098068350668645e-08, + "loss": 0.0996, + "step": 27610 + }, + { + "epoch": 81.95845697329376, + "grad_norm": 0.15866458415985107, + "learning_rate": 1.8068350668647844e-08, + "loss": 0.091, + "step": 27620 + }, + { + "epoch": 81.98813056379822, + "grad_norm": 0.7957971096038818, + "learning_rate": 1.8038632986627043e-08, + "loss": 0.1726, + "step": 27630 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21378262341022491, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.3266, + "eval_samples_per_second": 79.282, + "eval_steps_per_second": 9.977, + "step": 27634 + }, + { + "epoch": 82.01780415430267, + "grad_norm": 0.8919824957847595, + "learning_rate": 1.800891530460624e-08, + "loss": 0.0154, + "step": 27640 + }, + { + "epoch": 82.04747774480713, + "grad_norm": 3.4661967754364014, + "learning_rate": 1.7979197622585437e-08, + "loss": 0.0948, + "step": 27650 + }, + { + "epoch": 82.07715133531157, + "grad_norm": 19.356382369995117, + "learning_rate": 1.7949479940564636e-08, + "loss": 0.1952, + "step": 27660 + }, + { + "epoch": 82.10682492581603, + "grad_norm": 0.6438494920730591, + "learning_rate": 1.791976225854383e-08, + "loss": 0.1784, + "step": 27670 + }, + { + "epoch": 82.13649851632047, + "grad_norm": 3.2123820781707764, + "learning_rate": 1.789004457652303e-08, + "loss": 0.0775, + "step": 27680 + }, + { + "epoch": 82.16617210682493, + "grad_norm": 40.37452697753906, + "learning_rate": 1.786032689450223e-08, + "loss": 0.1818, + "step": 27690 + }, + { + "epoch": 82.19584569732937, + "grad_norm": 0.26189881563186646, + "learning_rate": 1.7830609212481425e-08, + "loss": 0.0498, + "step": 27700 + }, + { + "epoch": 82.22551928783383, + "grad_norm": 0.06754841655492783, + "learning_rate": 1.7800891530460624e-08, + "loss": 0.0329, + "step": 27710 + }, + { + "epoch": 82.25519287833828, + "grad_norm": 0.6810848116874695, + "learning_rate": 1.7771173848439822e-08, + "loss": 0.1376, + "step": 27720 + }, + { + "epoch": 82.28486646884274, + "grad_norm": 24.079143524169922, + "learning_rate": 1.7741456166419018e-08, + "loss": 0.0649, + "step": 27730 + }, + { + "epoch": 82.31454005934718, + "grad_norm": 16.909772872924805, + "learning_rate": 1.7711738484398217e-08, + "loss": 0.1311, + "step": 27740 + }, + { + "epoch": 82.34421364985164, + "grad_norm": 0.2131173461675644, + "learning_rate": 1.7682020802377415e-08, + "loss": 0.1042, + "step": 27750 + }, + { + "epoch": 82.37388724035608, + "grad_norm": 33.43235778808594, + "learning_rate": 1.765230312035661e-08, + "loss": 0.2601, + "step": 27760 + }, + { + "epoch": 82.40356083086053, + "grad_norm": 17.585128784179688, + "learning_rate": 1.762258543833581e-08, + "loss": 0.0986, + "step": 27770 + }, + { + "epoch": 82.43323442136499, + "grad_norm": 13.549650192260742, + "learning_rate": 1.759286775631501e-08, + "loss": 0.2763, + "step": 27780 + }, + { + "epoch": 82.46290801186943, + "grad_norm": 0.13387444615364075, + "learning_rate": 1.7563150074294204e-08, + "loss": 0.0536, + "step": 27790 + }, + { + "epoch": 82.49258160237389, + "grad_norm": 24.840984344482422, + "learning_rate": 1.75334323922734e-08, + "loss": 0.2326, + "step": 27800 + }, + { + "epoch": 82.52225519287833, + "grad_norm": 0.18283355236053467, + "learning_rate": 1.75037147102526e-08, + "loss": 0.117, + "step": 27810 + }, + { + "epoch": 82.55192878338279, + "grad_norm": 0.07074689120054245, + "learning_rate": 1.7473997028231797e-08, + "loss": 0.0909, + "step": 27820 + }, + { + "epoch": 82.58160237388724, + "grad_norm": 0.33735862374305725, + "learning_rate": 1.7444279346210993e-08, + "loss": 0.1256, + "step": 27830 + }, + { + "epoch": 82.6112759643917, + "grad_norm": 0.34633269906044006, + "learning_rate": 1.741456166419019e-08, + "loss": 0.0762, + "step": 27840 + }, + { + "epoch": 82.64094955489614, + "grad_norm": 21.956138610839844, + "learning_rate": 1.738484398216939e-08, + "loss": 0.1453, + "step": 27850 + }, + { + "epoch": 82.6706231454006, + "grad_norm": 1.8026587963104248, + "learning_rate": 1.7355126300148586e-08, + "loss": 0.1533, + "step": 27860 + }, + { + "epoch": 82.70029673590504, + "grad_norm": 0.19867539405822754, + "learning_rate": 1.7325408618127785e-08, + "loss": 0.0826, + "step": 27870 + }, + { + "epoch": 82.7299703264095, + "grad_norm": 17.784053802490234, + "learning_rate": 1.7295690936106984e-08, + "loss": 0.0842, + "step": 27880 + }, + { + "epoch": 82.75964391691394, + "grad_norm": 9.351311683654785, + "learning_rate": 1.726597325408618e-08, + "loss": 0.0734, + "step": 27890 + }, + { + "epoch": 82.7893175074184, + "grad_norm": 1.1343674659729004, + "learning_rate": 1.7236255572065378e-08, + "loss": 0.0453, + "step": 27900 + }, + { + "epoch": 82.81899109792285, + "grad_norm": 15.587787628173828, + "learning_rate": 1.7206537890044577e-08, + "loss": 0.0771, + "step": 27910 + }, + { + "epoch": 82.8486646884273, + "grad_norm": 32.75673294067383, + "learning_rate": 1.7176820208023772e-08, + "loss": 0.0645, + "step": 27920 + }, + { + "epoch": 82.87833827893175, + "grad_norm": 0.7969672083854675, + "learning_rate": 1.714710252600297e-08, + "loss": 0.2024, + "step": 27930 + }, + { + "epoch": 82.90801186943621, + "grad_norm": 0.3952895998954773, + "learning_rate": 1.711738484398217e-08, + "loss": 0.1041, + "step": 27940 + }, + { + "epoch": 82.93768545994065, + "grad_norm": 2.693592071533203, + "learning_rate": 1.7087667161961365e-08, + "loss": 0.1618, + "step": 27950 + }, + { + "epoch": 82.9673590504451, + "grad_norm": 0.026422550901770592, + "learning_rate": 1.7057949479940564e-08, + "loss": 0.0365, + "step": 27960 + }, + { + "epoch": 82.99703264094956, + "grad_norm": 0.07192317396402359, + "learning_rate": 1.7028231797919763e-08, + "loss": 0.0296, + "step": 27970 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21352386474609375, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.9333, + "eval_samples_per_second": 82.134, + "eval_steps_per_second": 10.335, + "step": 27971 + }, + { + "epoch": 83.026706231454, + "grad_norm": 11.18120002746582, + "learning_rate": 1.699851411589896e-08, + "loss": 0.1126, + "step": 27980 + }, + { + "epoch": 83.05637982195846, + "grad_norm": 32.3745002746582, + "learning_rate": 1.6968796433878157e-08, + "loss": 0.1224, + "step": 27990 + }, + { + "epoch": 83.0860534124629, + "grad_norm": 4.778066158294678, + "learning_rate": 1.6939078751857356e-08, + "loss": 0.019, + "step": 28000 + }, + { + "epoch": 83.11572700296736, + "grad_norm": 20.727210998535156, + "learning_rate": 1.690936106983655e-08, + "loss": 0.1617, + "step": 28010 + }, + { + "epoch": 83.1454005934718, + "grad_norm": 0.0697142481803894, + "learning_rate": 1.687964338781575e-08, + "loss": 0.1139, + "step": 28020 + }, + { + "epoch": 83.17507418397626, + "grad_norm": 0.036965034902095795, + "learning_rate": 1.684992570579495e-08, + "loss": 0.2253, + "step": 28030 + }, + { + "epoch": 83.20474777448071, + "grad_norm": 0.09314669668674469, + "learning_rate": 1.6820208023774145e-08, + "loss": 0.052, + "step": 28040 + }, + { + "epoch": 83.23442136498517, + "grad_norm": 3.744783878326416, + "learning_rate": 1.679049034175334e-08, + "loss": 0.0532, + "step": 28050 + }, + { + "epoch": 83.26409495548961, + "grad_norm": 44.12833786010742, + "learning_rate": 1.676077265973254e-08, + "loss": 0.1454, + "step": 28060 + }, + { + "epoch": 83.29376854599407, + "grad_norm": 0.07293867319822311, + "learning_rate": 1.6731054977711738e-08, + "loss": 0.3565, + "step": 28070 + }, + { + "epoch": 83.32344213649851, + "grad_norm": 2.117267370223999, + "learning_rate": 1.6701337295690933e-08, + "loss": 0.1309, + "step": 28080 + }, + { + "epoch": 83.35311572700297, + "grad_norm": 0.5280168652534485, + "learning_rate": 1.6671619613670132e-08, + "loss": 0.0278, + "step": 28090 + }, + { + "epoch": 83.38278931750742, + "grad_norm": 0.6754080057144165, + "learning_rate": 1.664190193164933e-08, + "loss": 0.0617, + "step": 28100 + }, + { + "epoch": 83.41246290801188, + "grad_norm": 5.945810794830322, + "learning_rate": 1.6612184249628527e-08, + "loss": 0.0493, + "step": 28110 + }, + { + "epoch": 83.44213649851632, + "grad_norm": 0.579098105430603, + "learning_rate": 1.6582466567607725e-08, + "loss": 0.034, + "step": 28120 + }, + { + "epoch": 83.47181008902078, + "grad_norm": 3.6690032482147217, + "learning_rate": 1.6552748885586924e-08, + "loss": 0.0901, + "step": 28130 + }, + { + "epoch": 83.50148367952522, + "grad_norm": 17.384838104248047, + "learning_rate": 1.652303120356612e-08, + "loss": 0.0866, + "step": 28140 + }, + { + "epoch": 83.53115727002967, + "grad_norm": 7.047588348388672, + "learning_rate": 1.649331352154532e-08, + "loss": 0.1634, + "step": 28150 + }, + { + "epoch": 83.56083086053413, + "grad_norm": 3.7520458698272705, + "learning_rate": 1.6463595839524517e-08, + "loss": 0.026, + "step": 28160 + }, + { + "epoch": 83.59050445103857, + "grad_norm": 1.1462608575820923, + "learning_rate": 1.6433878157503713e-08, + "loss": 0.0917, + "step": 28170 + }, + { + "epoch": 83.62017804154303, + "grad_norm": 34.25027084350586, + "learning_rate": 1.640416047548291e-08, + "loss": 0.0418, + "step": 28180 + }, + { + "epoch": 83.64985163204747, + "grad_norm": 0.09861575067043304, + "learning_rate": 1.637444279346211e-08, + "loss": 0.0474, + "step": 28190 + }, + { + "epoch": 83.67952522255193, + "grad_norm": 1.683854103088379, + "learning_rate": 1.6344725111441306e-08, + "loss": 0.1061, + "step": 28200 + }, + { + "epoch": 83.70919881305637, + "grad_norm": 3.0663466453552246, + "learning_rate": 1.6315007429420505e-08, + "loss": 0.0448, + "step": 28210 + }, + { + "epoch": 83.73887240356083, + "grad_norm": 0.11587049812078476, + "learning_rate": 1.6285289747399704e-08, + "loss": 0.062, + "step": 28220 + }, + { + "epoch": 83.76854599406528, + "grad_norm": 0.05032948777079582, + "learning_rate": 1.62555720653789e-08, + "loss": 0.0335, + "step": 28230 + }, + { + "epoch": 83.79821958456974, + "grad_norm": 0.1722813993692398, + "learning_rate": 1.6225854383358098e-08, + "loss": 0.145, + "step": 28240 + }, + { + "epoch": 83.82789317507418, + "grad_norm": 11.544601440429688, + "learning_rate": 1.6196136701337297e-08, + "loss": 0.1112, + "step": 28250 + }, + { + "epoch": 83.85756676557864, + "grad_norm": 22.941621780395508, + "learning_rate": 1.6166419019316492e-08, + "loss": 0.1553, + "step": 28260 + }, + { + "epoch": 83.88724035608308, + "grad_norm": 0.03257613629102707, + "learning_rate": 1.613670133729569e-08, + "loss": 0.1243, + "step": 28270 + }, + { + "epoch": 83.91691394658754, + "grad_norm": 24.448070526123047, + "learning_rate": 1.610698365527489e-08, + "loss": 0.096, + "step": 28280 + }, + { + "epoch": 83.94658753709199, + "grad_norm": 1.1939202547073364, + "learning_rate": 1.6077265973254085e-08, + "loss": 0.0656, + "step": 28290 + }, + { + "epoch": 83.97626112759644, + "grad_norm": 0.08685681968927383, + "learning_rate": 1.6047548291233284e-08, + "loss": 0.1076, + "step": 28300 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2130586802959442, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.5465, + "eval_samples_per_second": 85.147, + "eval_steps_per_second": 10.714, + "step": 28308 + }, + { + "epoch": 84.00593471810089, + "grad_norm": 22.85161018371582, + "learning_rate": 1.6017830609212483e-08, + "loss": 0.0524, + "step": 28310 + }, + { + "epoch": 84.03560830860535, + "grad_norm": 8.842196464538574, + "learning_rate": 1.598811292719168e-08, + "loss": 0.1773, + "step": 28320 + }, + { + "epoch": 84.06528189910979, + "grad_norm": 3.384690284729004, + "learning_rate": 1.5958395245170874e-08, + "loss": 0.0689, + "step": 28330 + }, + { + "epoch": 84.09495548961425, + "grad_norm": 3.686152696609497, + "learning_rate": 1.5928677563150073e-08, + "loss": 0.0496, + "step": 28340 + }, + { + "epoch": 84.1246290801187, + "grad_norm": 0.40232014656066895, + "learning_rate": 1.5898959881129272e-08, + "loss": 0.0706, + "step": 28350 + }, + { + "epoch": 84.15430267062314, + "grad_norm": 31.858022689819336, + "learning_rate": 1.5869242199108467e-08, + "loss": 0.1613, + "step": 28360 + }, + { + "epoch": 84.1839762611276, + "grad_norm": 2.2435860633850098, + "learning_rate": 1.5839524517087666e-08, + "loss": 0.1563, + "step": 28370 + }, + { + "epoch": 84.21364985163204, + "grad_norm": 0.24213573336601257, + "learning_rate": 1.5809806835066865e-08, + "loss": 0.1785, + "step": 28380 + }, + { + "epoch": 84.2433234421365, + "grad_norm": 0.3544084131717682, + "learning_rate": 1.578008915304606e-08, + "loss": 0.0309, + "step": 28390 + }, + { + "epoch": 84.27299703264094, + "grad_norm": 7.069129943847656, + "learning_rate": 1.575037147102526e-08, + "loss": 0.0268, + "step": 28400 + }, + { + "epoch": 84.3026706231454, + "grad_norm": 1.1746786832809448, + "learning_rate": 1.5720653789004458e-08, + "loss": 0.1747, + "step": 28410 + }, + { + "epoch": 84.33234421364985, + "grad_norm": 0.32468223571777344, + "learning_rate": 1.5690936106983653e-08, + "loss": 0.1919, + "step": 28420 + }, + { + "epoch": 84.3620178041543, + "grad_norm": 2.767399311065674, + "learning_rate": 1.5661218424962852e-08, + "loss": 0.2795, + "step": 28430 + }, + { + "epoch": 84.39169139465875, + "grad_norm": 0.0653780922293663, + "learning_rate": 1.563150074294205e-08, + "loss": 0.0904, + "step": 28440 + }, + { + "epoch": 84.42136498516321, + "grad_norm": 0.0898735374212265, + "learning_rate": 1.5601783060921247e-08, + "loss": 0.1263, + "step": 28450 + }, + { + "epoch": 84.45103857566765, + "grad_norm": 11.350285530090332, + "learning_rate": 1.5572065378900445e-08, + "loss": 0.1282, + "step": 28460 + }, + { + "epoch": 84.48071216617211, + "grad_norm": 0.17841343581676483, + "learning_rate": 1.5542347696879644e-08, + "loss": 0.0434, + "step": 28470 + }, + { + "epoch": 84.51038575667656, + "grad_norm": 22.01127052307129, + "learning_rate": 1.551263001485884e-08, + "loss": 0.0737, + "step": 28480 + }, + { + "epoch": 84.54005934718101, + "grad_norm": 0.3674179017543793, + "learning_rate": 1.548291233283804e-08, + "loss": 0.0341, + "step": 28490 + }, + { + "epoch": 84.56973293768546, + "grad_norm": 2.568371057510376, + "learning_rate": 1.5453194650817237e-08, + "loss": 0.0541, + "step": 28500 + }, + { + "epoch": 84.59940652818992, + "grad_norm": 2.9547579288482666, + "learning_rate": 1.5423476968796433e-08, + "loss": 0.1305, + "step": 28510 + }, + { + "epoch": 84.62908011869436, + "grad_norm": 2.6825551986694336, + "learning_rate": 1.5393759286775632e-08, + "loss": 0.1587, + "step": 28520 + }, + { + "epoch": 84.65875370919882, + "grad_norm": 1.0777575969696045, + "learning_rate": 1.536404160475483e-08, + "loss": 0.1774, + "step": 28530 + }, + { + "epoch": 84.68842729970326, + "grad_norm": 0.7366266846656799, + "learning_rate": 1.5334323922734026e-08, + "loss": 0.088, + "step": 28540 + }, + { + "epoch": 84.71810089020771, + "grad_norm": 17.379196166992188, + "learning_rate": 1.5304606240713225e-08, + "loss": 0.2023, + "step": 28550 + }, + { + "epoch": 84.74777448071217, + "grad_norm": 0.7174727320671082, + "learning_rate": 1.5274888558692424e-08, + "loss": 0.0103, + "step": 28560 + }, + { + "epoch": 84.77744807121661, + "grad_norm": 13.0138521194458, + "learning_rate": 1.524517087667162e-08, + "loss": 0.2912, + "step": 28570 + }, + { + "epoch": 84.80712166172107, + "grad_norm": 0.3802364468574524, + "learning_rate": 1.5215453194650815e-08, + "loss": 0.1125, + "step": 28580 + }, + { + "epoch": 84.83679525222551, + "grad_norm": 0.06389407813549042, + "learning_rate": 1.5185735512630014e-08, + "loss": 0.0592, + "step": 28590 + }, + { + "epoch": 84.86646884272997, + "grad_norm": 0.29492753744125366, + "learning_rate": 1.5156017830609212e-08, + "loss": 0.0917, + "step": 28600 + }, + { + "epoch": 84.89614243323442, + "grad_norm": 0.5522872805595398, + "learning_rate": 1.5126300148588408e-08, + "loss": 0.1434, + "step": 28610 + }, + { + "epoch": 84.92581602373888, + "grad_norm": 19.511940002441406, + "learning_rate": 1.5096582466567607e-08, + "loss": 0.183, + "step": 28620 + }, + { + "epoch": 84.95548961424332, + "grad_norm": 10.489224433898926, + "learning_rate": 1.5066864784546805e-08, + "loss": 0.1276, + "step": 28630 + }, + { + "epoch": 84.98516320474778, + "grad_norm": 27.6662654876709, + "learning_rate": 1.5037147102526e-08, + "loss": 0.0687, + "step": 28640 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.2132398784160614, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.3139, + "eval_samples_per_second": 79.371, + "eval_steps_per_second": 9.988, + "step": 28645 + }, + { + "epoch": 85.01483679525222, + "grad_norm": 23.089805603027344, + "learning_rate": 1.50074294205052e-08, + "loss": 0.1166, + "step": 28650 + }, + { + "epoch": 85.04451038575668, + "grad_norm": 0.1670299619436264, + "learning_rate": 1.49777117384844e-08, + "loss": 0.008, + "step": 28660 + }, + { + "epoch": 85.07418397626112, + "grad_norm": 8.6953125, + "learning_rate": 1.4947994056463594e-08, + "loss": 0.0539, + "step": 28670 + }, + { + "epoch": 85.10385756676558, + "grad_norm": 1.966202735900879, + "learning_rate": 1.4918276374442793e-08, + "loss": 0.2707, + "step": 28680 + }, + { + "epoch": 85.13353115727003, + "grad_norm": 4.401321887969971, + "learning_rate": 1.488855869242199e-08, + "loss": 0.1385, + "step": 28690 + }, + { + "epoch": 85.16320474777449, + "grad_norm": 0.09851190447807312, + "learning_rate": 1.4858841010401187e-08, + "loss": 0.0986, + "step": 28700 + }, + { + "epoch": 85.19287833827893, + "grad_norm": 0.12188839912414551, + "learning_rate": 1.4829123328380386e-08, + "loss": 0.2748, + "step": 28710 + }, + { + "epoch": 85.22255192878339, + "grad_norm": 1.2028504610061646, + "learning_rate": 1.4799405646359583e-08, + "loss": 0.1946, + "step": 28720 + }, + { + "epoch": 85.25222551928783, + "grad_norm": 10.913734436035156, + "learning_rate": 1.476968796433878e-08, + "loss": 0.1035, + "step": 28730 + }, + { + "epoch": 85.28189910979229, + "grad_norm": 16.003097534179688, + "learning_rate": 1.473997028231798e-08, + "loss": 0.0892, + "step": 28740 + }, + { + "epoch": 85.31157270029674, + "grad_norm": 0.13552014529705048, + "learning_rate": 1.4710252600297176e-08, + "loss": 0.1155, + "step": 28750 + }, + { + "epoch": 85.34124629080118, + "grad_norm": 0.11473199725151062, + "learning_rate": 1.4680534918276375e-08, + "loss": 0.0664, + "step": 28760 + }, + { + "epoch": 85.37091988130564, + "grad_norm": 4.907312393188477, + "learning_rate": 1.4650817236255572e-08, + "loss": 0.1, + "step": 28770 + }, + { + "epoch": 85.40059347181008, + "grad_norm": 2.1858439445495605, + "learning_rate": 1.462109955423477e-08, + "loss": 0.1846, + "step": 28780 + }, + { + "epoch": 85.43026706231454, + "grad_norm": 3.654100179672241, + "learning_rate": 1.4591381872213968e-08, + "loss": 0.045, + "step": 28790 + }, + { + "epoch": 85.45994065281899, + "grad_norm": 0.015290791168808937, + "learning_rate": 1.4561664190193165e-08, + "loss": 0.0674, + "step": 28800 + }, + { + "epoch": 85.48961424332344, + "grad_norm": 28.390392303466797, + "learning_rate": 1.4531946508172363e-08, + "loss": 0.2568, + "step": 28810 + }, + { + "epoch": 85.51928783382789, + "grad_norm": 1.0182374715805054, + "learning_rate": 1.4502228826151561e-08, + "loss": 0.1356, + "step": 28820 + }, + { + "epoch": 85.54896142433235, + "grad_norm": 0.08205217868089676, + "learning_rate": 1.4472511144130759e-08, + "loss": 0.1351, + "step": 28830 + }, + { + "epoch": 85.57863501483679, + "grad_norm": 0.5871217846870422, + "learning_rate": 1.4442793462109954e-08, + "loss": 0.1939, + "step": 28840 + }, + { + "epoch": 85.60830860534125, + "grad_norm": 0.4097461700439453, + "learning_rate": 1.4413075780089151e-08, + "loss": 0.0883, + "step": 28850 + }, + { + "epoch": 85.6379821958457, + "grad_norm": 9.804461479187012, + "learning_rate": 1.4383358098068348e-08, + "loss": 0.1192, + "step": 28860 + }, + { + "epoch": 85.66765578635015, + "grad_norm": 0.0401284284889698, + "learning_rate": 1.4353640416047547e-08, + "loss": 0.0916, + "step": 28870 + }, + { + "epoch": 85.6973293768546, + "grad_norm": 0.12768255174160004, + "learning_rate": 1.4323922734026744e-08, + "loss": 0.0455, + "step": 28880 + }, + { + "epoch": 85.72700296735906, + "grad_norm": 0.26914459466934204, + "learning_rate": 1.4294205052005942e-08, + "loss": 0.1442, + "step": 28890 + }, + { + "epoch": 85.7566765578635, + "grad_norm": 0.34466543793678284, + "learning_rate": 1.426448736998514e-08, + "loss": 0.1564, + "step": 28900 + }, + { + "epoch": 85.78635014836796, + "grad_norm": 0.8766911625862122, + "learning_rate": 1.4234769687964338e-08, + "loss": 0.0751, + "step": 28910 + }, + { + "epoch": 85.8160237388724, + "grad_norm": 19.319326400756836, + "learning_rate": 1.4205052005943535e-08, + "loss": 0.161, + "step": 28920 + }, + { + "epoch": 85.84569732937686, + "grad_norm": 4.767066478729248, + "learning_rate": 1.4175334323922734e-08, + "loss": 0.1011, + "step": 28930 + }, + { + "epoch": 85.8753709198813, + "grad_norm": 0.2467062771320343, + "learning_rate": 1.414561664190193e-08, + "loss": 0.0778, + "step": 28940 + }, + { + "epoch": 85.90504451038575, + "grad_norm": 2.0842819213867188, + "learning_rate": 1.4115898959881128e-08, + "loss": 0.0256, + "step": 28950 + }, + { + "epoch": 85.93471810089021, + "grad_norm": 1.1600841283798218, + "learning_rate": 1.4086181277860327e-08, + "loss": 0.2101, + "step": 28960 + }, + { + "epoch": 85.96439169139465, + "grad_norm": 7.500657081604004, + "learning_rate": 1.4056463595839524e-08, + "loss": 0.1384, + "step": 28970 + }, + { + "epoch": 85.99406528189911, + "grad_norm": 2.217595100402832, + "learning_rate": 1.4026745913818721e-08, + "loss": 0.0172, + "step": 28980 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21298570930957794, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.4677, + "eval_samples_per_second": 85.788, + "eval_steps_per_second": 10.795, + "step": 28982 + }, + { + "epoch": 86.02373887240356, + "grad_norm": 0.7808936834335327, + "learning_rate": 1.399702823179792e-08, + "loss": 0.1496, + "step": 28990 + }, + { + "epoch": 86.05341246290801, + "grad_norm": 33.77977752685547, + "learning_rate": 1.3967310549777117e-08, + "loss": 0.1482, + "step": 29000 + }, + { + "epoch": 86.08308605341246, + "grad_norm": 24.7419376373291, + "learning_rate": 1.3937592867756314e-08, + "loss": 0.0898, + "step": 29010 + }, + { + "epoch": 86.11275964391692, + "grad_norm": 0.04733268916606903, + "learning_rate": 1.3907875185735513e-08, + "loss": 0.0449, + "step": 29020 + }, + { + "epoch": 86.14243323442136, + "grad_norm": 0.07367201894521713, + "learning_rate": 1.387815750371471e-08, + "loss": 0.1016, + "step": 29030 + }, + { + "epoch": 86.17210682492582, + "grad_norm": 0.014619083143770695, + "learning_rate": 1.3848439821693907e-08, + "loss": 0.1714, + "step": 29040 + }, + { + "epoch": 86.20178041543026, + "grad_norm": 9.828795433044434, + "learning_rate": 1.3818722139673106e-08, + "loss": 0.1277, + "step": 29050 + }, + { + "epoch": 86.23145400593472, + "grad_norm": 0.10482776165008545, + "learning_rate": 1.3789004457652303e-08, + "loss": 0.1385, + "step": 29060 + }, + { + "epoch": 86.26112759643917, + "grad_norm": 7.518313884735107, + "learning_rate": 1.37592867756315e-08, + "loss": 0.0724, + "step": 29070 + }, + { + "epoch": 86.29080118694363, + "grad_norm": 28.430492401123047, + "learning_rate": 1.37295690936107e-08, + "loss": 0.029, + "step": 29080 + }, + { + "epoch": 86.32047477744807, + "grad_norm": 1.7795649766921997, + "learning_rate": 1.3699851411589896e-08, + "loss": 0.1278, + "step": 29090 + }, + { + "epoch": 86.35014836795253, + "grad_norm": 17.413206100463867, + "learning_rate": 1.3670133729569092e-08, + "loss": 0.2039, + "step": 29100 + }, + { + "epoch": 86.37982195845697, + "grad_norm": 0.4141519069671631, + "learning_rate": 1.3640416047548289e-08, + "loss": 0.0666, + "step": 29110 + }, + { + "epoch": 86.40949554896143, + "grad_norm": 2.242079019546509, + "learning_rate": 1.3610698365527486e-08, + "loss": 0.1099, + "step": 29120 + }, + { + "epoch": 86.43916913946587, + "grad_norm": 0.7124994993209839, + "learning_rate": 1.3580980683506685e-08, + "loss": 0.2028, + "step": 29130 + }, + { + "epoch": 86.46884272997033, + "grad_norm": 2.0175364017486572, + "learning_rate": 1.3551263001485882e-08, + "loss": 0.055, + "step": 29140 + }, + { + "epoch": 86.49851632047478, + "grad_norm": 16.202972412109375, + "learning_rate": 1.3521545319465081e-08, + "loss": 0.088, + "step": 29150 + }, + { + "epoch": 86.52818991097922, + "grad_norm": 5.537349700927734, + "learning_rate": 1.3491827637444278e-08, + "loss": 0.0641, + "step": 29160 + }, + { + "epoch": 86.55786350148368, + "grad_norm": 6.025691032409668, + "learning_rate": 1.3462109955423475e-08, + "loss": 0.0675, + "step": 29170 + }, + { + "epoch": 86.58753709198812, + "grad_norm": 6.648719310760498, + "learning_rate": 1.3432392273402674e-08, + "loss": 0.1959, + "step": 29180 + }, + { + "epoch": 86.61721068249258, + "grad_norm": 10.574997901916504, + "learning_rate": 1.3402674591381871e-08, + "loss": 0.0799, + "step": 29190 + }, + { + "epoch": 86.64688427299703, + "grad_norm": 0.9952903985977173, + "learning_rate": 1.3372956909361069e-08, + "loss": 0.029, + "step": 29200 + }, + { + "epoch": 86.67655786350149, + "grad_norm": 0.12743324041366577, + "learning_rate": 1.3343239227340267e-08, + "loss": 0.0328, + "step": 29210 + }, + { + "epoch": 86.70623145400593, + "grad_norm": 0.2018243819475174, + "learning_rate": 1.3313521545319465e-08, + "loss": 0.1381, + "step": 29220 + }, + { + "epoch": 86.73590504451039, + "grad_norm": 0.046130578964948654, + "learning_rate": 1.3283803863298662e-08, + "loss": 0.0774, + "step": 29230 + }, + { + "epoch": 86.76557863501483, + "grad_norm": 0.6335132122039795, + "learning_rate": 1.325408618127786e-08, + "loss": 0.1147, + "step": 29240 + }, + { + "epoch": 86.79525222551929, + "grad_norm": 6.955982685089111, + "learning_rate": 1.3224368499257058e-08, + "loss": 0.0717, + "step": 29250 + }, + { + "epoch": 86.82492581602374, + "grad_norm": 0.31542277336120605, + "learning_rate": 1.3194650817236255e-08, + "loss": 0.0789, + "step": 29260 + }, + { + "epoch": 86.8545994065282, + "grad_norm": 2.1231513023376465, + "learning_rate": 1.3164933135215454e-08, + "loss": 0.1398, + "step": 29270 + }, + { + "epoch": 86.88427299703264, + "grad_norm": 13.030653953552246, + "learning_rate": 1.313521545319465e-08, + "loss": 0.0487, + "step": 29280 + }, + { + "epoch": 86.9139465875371, + "grad_norm": 16.07188606262207, + "learning_rate": 1.3105497771173848e-08, + "loss": 0.2191, + "step": 29290 + }, + { + "epoch": 86.94362017804154, + "grad_norm": 0.3996366560459137, + "learning_rate": 1.3075780089153047e-08, + "loss": 0.1055, + "step": 29300 + }, + { + "epoch": 86.973293768546, + "grad_norm": 28.550018310546875, + "learning_rate": 1.3046062407132244e-08, + "loss": 0.2303, + "step": 29310 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21310210227966309, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 10.8502, + "eval_samples_per_second": 82.763, + "eval_steps_per_second": 10.415, + "step": 29319 + }, + { + "epoch": 87.00296735905044, + "grad_norm": 3.760838508605957, + "learning_rate": 1.3016344725111441e-08, + "loss": 0.1672, + "step": 29320 + }, + { + "epoch": 87.0326409495549, + "grad_norm": 14.336623191833496, + "learning_rate": 1.298662704309064e-08, + "loss": 0.1411, + "step": 29330 + }, + { + "epoch": 87.06231454005935, + "grad_norm": 14.28589916229248, + "learning_rate": 1.2956909361069837e-08, + "loss": 0.1386, + "step": 29340 + }, + { + "epoch": 87.09198813056379, + "grad_norm": 3.1250691413879395, + "learning_rate": 1.2927191679049034e-08, + "loss": 0.1051, + "step": 29350 + }, + { + "epoch": 87.12166172106825, + "grad_norm": 0.21983224153518677, + "learning_rate": 1.2897473997028233e-08, + "loss": 0.0317, + "step": 29360 + }, + { + "epoch": 87.1513353115727, + "grad_norm": 27.801578521728516, + "learning_rate": 1.2867756315007427e-08, + "loss": 0.06, + "step": 29370 + }, + { + "epoch": 87.18100890207715, + "grad_norm": 26.937803268432617, + "learning_rate": 1.2838038632986626e-08, + "loss": 0.1268, + "step": 29380 + }, + { + "epoch": 87.2106824925816, + "grad_norm": 0.3838219940662384, + "learning_rate": 1.2808320950965823e-08, + "loss": 0.03, + "step": 29390 + }, + { + "epoch": 87.24035608308606, + "grad_norm": 41.79075622558594, + "learning_rate": 1.277860326894502e-08, + "loss": 0.135, + "step": 29400 + }, + { + "epoch": 87.2700296735905, + "grad_norm": 4.700495719909668, + "learning_rate": 1.2748885586924219e-08, + "loss": 0.2317, + "step": 29410 + }, + { + "epoch": 87.29970326409496, + "grad_norm": 14.798096656799316, + "learning_rate": 1.2719167904903416e-08, + "loss": 0.1454, + "step": 29420 + }, + { + "epoch": 87.3293768545994, + "grad_norm": 0.4070928692817688, + "learning_rate": 1.2689450222882613e-08, + "loss": 0.1289, + "step": 29430 + }, + { + "epoch": 87.35905044510386, + "grad_norm": 0.05226895958185196, + "learning_rate": 1.2659732540861812e-08, + "loss": 0.1033, + "step": 29440 + }, + { + "epoch": 87.3887240356083, + "grad_norm": 11.051858901977539, + "learning_rate": 1.2630014858841009e-08, + "loss": 0.0809, + "step": 29450 + }, + { + "epoch": 87.41839762611276, + "grad_norm": 0.439992755651474, + "learning_rate": 1.2600297176820206e-08, + "loss": 0.1003, + "step": 29460 + }, + { + "epoch": 87.44807121661721, + "grad_norm": 19.918548583984375, + "learning_rate": 1.2570579494799405e-08, + "loss": 0.185, + "step": 29470 + }, + { + "epoch": 87.47774480712167, + "grad_norm": 3.4985172748565674, + "learning_rate": 1.2540861812778602e-08, + "loss": 0.1189, + "step": 29480 + }, + { + "epoch": 87.50741839762611, + "grad_norm": 0.9021595120429993, + "learning_rate": 1.25111441307578e-08, + "loss": 0.0729, + "step": 29490 + }, + { + "epoch": 87.53709198813057, + "grad_norm": 0.4054378271102905, + "learning_rate": 1.2481426448736998e-08, + "loss": 0.0734, + "step": 29500 + }, + { + "epoch": 87.56676557863501, + "grad_norm": 2.1523725986480713, + "learning_rate": 1.2451708766716195e-08, + "loss": 0.0206, + "step": 29510 + }, + { + "epoch": 87.59643916913947, + "grad_norm": 0.8323065042495728, + "learning_rate": 1.2421991084695394e-08, + "loss": 0.1118, + "step": 29520 + }, + { + "epoch": 87.62611275964392, + "grad_norm": 8.514741897583008, + "learning_rate": 1.2392273402674591e-08, + "loss": 0.224, + "step": 29530 + }, + { + "epoch": 87.65578635014836, + "grad_norm": 0.02887376770377159, + "learning_rate": 1.2362555720653789e-08, + "loss": 0.0755, + "step": 29540 + }, + { + "epoch": 87.68545994065282, + "grad_norm": 1.9842942953109741, + "learning_rate": 1.2332838038632987e-08, + "loss": 0.0094, + "step": 29550 + }, + { + "epoch": 87.71513353115726, + "grad_norm": 3.0420639514923096, + "learning_rate": 1.2303120356612183e-08, + "loss": 0.0366, + "step": 29560 + }, + { + "epoch": 87.74480712166172, + "grad_norm": 0.6588202118873596, + "learning_rate": 1.227340267459138e-08, + "loss": 0.1198, + "step": 29570 + }, + { + "epoch": 87.77448071216617, + "grad_norm": 0.267260879278183, + "learning_rate": 1.2243684992570579e-08, + "loss": 0.0905, + "step": 29580 + }, + { + "epoch": 87.80415430267063, + "grad_norm": 0.28564050793647766, + "learning_rate": 1.2213967310549776e-08, + "loss": 0.0919, + "step": 29590 + }, + { + "epoch": 87.83382789317507, + "grad_norm": 7.72484016418457, + "learning_rate": 1.2184249628528973e-08, + "loss": 0.2361, + "step": 29600 + }, + { + "epoch": 87.86350148367953, + "grad_norm": 0.017937902361154556, + "learning_rate": 1.2154531946508172e-08, + "loss": 0.0475, + "step": 29610 + }, + { + "epoch": 87.89317507418397, + "grad_norm": 0.039205048233270645, + "learning_rate": 1.212481426448737e-08, + "loss": 0.1759, + "step": 29620 + }, + { + "epoch": 87.92284866468843, + "grad_norm": 12.069364547729492, + "learning_rate": 1.2095096582466566e-08, + "loss": 0.0809, + "step": 29630 + }, + { + "epoch": 87.95252225519287, + "grad_norm": 4.965465068817139, + "learning_rate": 1.2065378900445765e-08, + "loss": 0.0442, + "step": 29640 + }, + { + "epoch": 87.98219584569733, + "grad_norm": 2.05121111869812, + "learning_rate": 1.2035661218424962e-08, + "loss": 0.0952, + "step": 29650 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9365256124721604, + "eval_loss": 0.21283374726772308, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.187, + "eval_samples_per_second": 80.272, + "eval_steps_per_second": 10.101, + "step": 29656 + }, + { + "epoch": 88.01186943620178, + "grad_norm": 21.134258270263672, + "learning_rate": 1.200594353640416e-08, + "loss": 0.2052, + "step": 29660 + }, + { + "epoch": 88.04154302670624, + "grad_norm": 0.6930063962936401, + "learning_rate": 1.1976225854383358e-08, + "loss": 0.0961, + "step": 29670 + }, + { + "epoch": 88.07121661721068, + "grad_norm": 28.321861267089844, + "learning_rate": 1.1946508172362555e-08, + "loss": 0.1832, + "step": 29680 + }, + { + "epoch": 88.10089020771514, + "grad_norm": 0.27329081296920776, + "learning_rate": 1.1916790490341754e-08, + "loss": 0.0102, + "step": 29690 + }, + { + "epoch": 88.13056379821958, + "grad_norm": 20.957929611206055, + "learning_rate": 1.188707280832095e-08, + "loss": 0.1245, + "step": 29700 + }, + { + "epoch": 88.16023738872404, + "grad_norm": 30.407072067260742, + "learning_rate": 1.1857355126300147e-08, + "loss": 0.1158, + "step": 29710 + }, + { + "epoch": 88.18991097922849, + "grad_norm": 20.763898849487305, + "learning_rate": 1.1827637444279346e-08, + "loss": 0.1586, + "step": 29720 + }, + { + "epoch": 88.21958456973294, + "grad_norm": 0.9294232726097107, + "learning_rate": 1.1797919762258543e-08, + "loss": 0.1209, + "step": 29730 + }, + { + "epoch": 88.24925816023739, + "grad_norm": 3.7630012035369873, + "learning_rate": 1.176820208023774e-08, + "loss": 0.2644, + "step": 29740 + }, + { + "epoch": 88.27893175074183, + "grad_norm": 0.03508307784795761, + "learning_rate": 1.1738484398216939e-08, + "loss": 0.0465, + "step": 29750 + }, + { + "epoch": 88.30860534124629, + "grad_norm": 0.1786014884710312, + "learning_rate": 1.1708766716196136e-08, + "loss": 0.0373, + "step": 29760 + }, + { + "epoch": 88.33827893175074, + "grad_norm": 0.05669807270169258, + "learning_rate": 1.1679049034175333e-08, + "loss": 0.107, + "step": 29770 + }, + { + "epoch": 88.3679525222552, + "grad_norm": 0.05636304244399071, + "learning_rate": 1.1649331352154532e-08, + "loss": 0.0152, + "step": 29780 + }, + { + "epoch": 88.39762611275964, + "grad_norm": 3.7968850135803223, + "learning_rate": 1.161961367013373e-08, + "loss": 0.1255, + "step": 29790 + }, + { + "epoch": 88.4272997032641, + "grad_norm": 22.258941650390625, + "learning_rate": 1.1589895988112926e-08, + "loss": 0.0837, + "step": 29800 + }, + { + "epoch": 88.45697329376854, + "grad_norm": 29.58533477783203, + "learning_rate": 1.1560178306092125e-08, + "loss": 0.103, + "step": 29810 + }, + { + "epoch": 88.486646884273, + "grad_norm": 0.02337433397769928, + "learning_rate": 1.1530460624071322e-08, + "loss": 0.2417, + "step": 29820 + }, + { + "epoch": 88.51632047477744, + "grad_norm": 11.630218505859375, + "learning_rate": 1.150074294205052e-08, + "loss": 0.1233, + "step": 29830 + }, + { + "epoch": 88.5459940652819, + "grad_norm": 2.452780246734619, + "learning_rate": 1.1471025260029717e-08, + "loss": 0.0685, + "step": 29840 + }, + { + "epoch": 88.57566765578635, + "grad_norm": 0.43817776441574097, + "learning_rate": 1.1441307578008914e-08, + "loss": 0.0244, + "step": 29850 + }, + { + "epoch": 88.6053412462908, + "grad_norm": 16.086467742919922, + "learning_rate": 1.1411589895988113e-08, + "loss": 0.1021, + "step": 29860 + }, + { + "epoch": 88.63501483679525, + "grad_norm": 0.3081299364566803, + "learning_rate": 1.138187221396731e-08, + "loss": 0.0424, + "step": 29870 + }, + { + "epoch": 88.66468842729971, + "grad_norm": 0.7208992838859558, + "learning_rate": 1.1352154531946507e-08, + "loss": 0.1214, + "step": 29880 + }, + { + "epoch": 88.69436201780415, + "grad_norm": 0.11997605860233307, + "learning_rate": 1.1322436849925706e-08, + "loss": 0.146, + "step": 29890 + }, + { + "epoch": 88.72403560830861, + "grad_norm": 0.5597664713859558, + "learning_rate": 1.1292719167904903e-08, + "loss": 0.154, + "step": 29900 + }, + { + "epoch": 88.75370919881306, + "grad_norm": 0.04396020621061325, + "learning_rate": 1.12630014858841e-08, + "loss": 0.1257, + "step": 29910 + }, + { + "epoch": 88.78338278931751, + "grad_norm": 0.08378578722476959, + "learning_rate": 1.1233283803863299e-08, + "loss": 0.1244, + "step": 29920 + }, + { + "epoch": 88.81305637982196, + "grad_norm": 0.063012033700943, + "learning_rate": 1.1203566121842496e-08, + "loss": 0.1176, + "step": 29930 + }, + { + "epoch": 88.8427299703264, + "grad_norm": 1.3153387308120728, + "learning_rate": 1.1173848439821693e-08, + "loss": 0.302, + "step": 29940 + }, + { + "epoch": 88.87240356083086, + "grad_norm": 0.5059306621551514, + "learning_rate": 1.1144130757800892e-08, + "loss": 0.0492, + "step": 29950 + }, + { + "epoch": 88.9020771513353, + "grad_norm": 23.17951202392578, + "learning_rate": 1.1114413075780088e-08, + "loss": 0.3289, + "step": 29960 + }, + { + "epoch": 88.93175074183976, + "grad_norm": 0.03366761654615402, + "learning_rate": 1.1084695393759286e-08, + "loss": 0.0294, + "step": 29970 + }, + { + "epoch": 88.96142433234421, + "grad_norm": 1.7499821186065674, + "learning_rate": 1.1054977711738484e-08, + "loss": 0.0441, + "step": 29980 + }, + { + "epoch": 88.99109792284867, + "grad_norm": 1.3240936994552612, + "learning_rate": 1.102526002971768e-08, + "loss": 0.1733, + "step": 29990 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.9376391982182628, + "eval_loss": 0.2127562165260315, + "eval_model_preparation_time": 0.0035, + "eval_runtime": 11.6032, + "eval_samples_per_second": 77.393, + "eval_steps_per_second": 9.739, + "step": 29993 + } + ], + "logging_steps": 10, + "max_steps": 33700, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8566650015392375e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}