{ "best_metric": null, "best_model_checkpoint": null, "epoch": 500.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": NaN, "learning_rate": 0.0, "loss": 5.453, "step": 1 }, { "epoch": 2.0, "grad_norm": 2.349853515625, "learning_rate": 0.0001, "loss": 5.453, "step": 2 }, { "epoch": 3.0, "grad_norm": 2.349853515625, "learning_rate": 0.0002, "loss": 5.453, "step": 3 }, { "epoch": 4.0, "grad_norm": 2.8772072792053223, "learning_rate": 0.0001995983935742972, "loss": 5.1576, "step": 4 }, { "epoch": 5.0, "grad_norm": 5.726413726806641, "learning_rate": 0.0001991967871485944, "loss": 4.6773, "step": 5 }, { "epoch": 6.0, "grad_norm": 8.641162872314453, "learning_rate": 0.00019879518072289158, "loss": 4.2517, "step": 6 }, { "epoch": 7.0, "grad_norm": 11.281049728393555, "learning_rate": 0.00019839357429718877, "loss": 3.8401, "step": 7 }, { "epoch": 8.0, "grad_norm": 13.561233520507812, "learning_rate": 0.00019799196787148596, "loss": 3.428, "step": 8 }, { "epoch": 9.0, "grad_norm": Infinity, "learning_rate": 0.00019799196787148596, "loss": 2.9945, "step": 9 }, { "epoch": 10.0, "grad_norm": 15.408284187316895, "learning_rate": 0.00019759036144578314, "loss": 2.9945, "step": 10 }, { "epoch": 11.0, "grad_norm": 16.737504959106445, "learning_rate": 0.00019718875502008033, "loss": 2.5293, "step": 11 }, { "epoch": 12.0, "grad_norm": 17.475238800048828, "learning_rate": 0.00019678714859437752, "loss": 2.0314, "step": 12 }, { "epoch": 13.0, "grad_norm": 17.607587814331055, "learning_rate": 0.0001963855421686747, "loss": 1.5013, "step": 13 }, { "epoch": 14.0, "grad_norm": 17.160503387451172, "learning_rate": 0.0001959839357429719, "loss": 0.9549, "step": 14 }, { "epoch": 15.0, "grad_norm": 16.20315933227539, "learning_rate": 0.00019558232931726906, "loss": 0.4056, "step": 15 }, { "epoch": 16.0, "grad_norm": 2.948519229888916, "learning_rate": 0.00019518072289156628, "loss": 0.1432, "step": 16 }, { "epoch": 17.0, "grad_norm": 3.228358507156372, "learning_rate": 0.00019477911646586347, "loss": 0.1246, "step": 17 }, { "epoch": 18.0, "grad_norm": 3.2282843589782715, "learning_rate": 0.00019437751004016066, "loss": 0.1034, "step": 18 }, { "epoch": 19.0, "grad_norm": 1.0867902040481567, "learning_rate": 0.00019397590361445782, "loss": 0.0786, "step": 19 }, { "epoch": 20.0, "grad_norm": 0.5086488723754883, "learning_rate": 0.00019357429718875504, "loss": 0.0706, "step": 20 }, { "epoch": 21.0, "grad_norm": 0.4829910397529602, "learning_rate": 0.00019317269076305223, "loss": 0.063, "step": 21 }, { "epoch": 22.0, "grad_norm": 0.5215936899185181, "learning_rate": 0.00019277108433734942, "loss": 0.0565, "step": 22 }, { "epoch": 23.0, "grad_norm": 0.5226811766624451, "learning_rate": 0.00019236947791164658, "loss": 0.0445, "step": 23 }, { "epoch": 24.0, "grad_norm": 0.5145213603973389, "learning_rate": 0.00019196787148594377, "loss": 0.0316, "step": 24 }, { "epoch": 25.0, "grad_norm": 0.50876384973526, "learning_rate": 0.00019156626506024098, "loss": 0.0197, "step": 25 }, { "epoch": 26.0, "grad_norm": 0.23416705429553986, "learning_rate": 0.00019116465863453817, "loss": 0.0081, "step": 26 }, { "epoch": 27.0, "grad_norm": 0.07123460620641708, "learning_rate": 0.00019076305220883533, "loss": 0.0055, "step": 27 }, { "epoch": 28.0, "grad_norm": 0.17463913559913635, "learning_rate": 0.00019036144578313252, "loss": 0.0056, "step": 28 }, { "epoch": 29.0, "grad_norm": 0.08099503815174103, "learning_rate": 0.00018995983935742974, "loss": 0.005, "step": 29 }, { "epoch": 30.0, "grad_norm": 0.10401125252246857, "learning_rate": 0.00018955823293172693, "loss": 0.005, "step": 30 }, { "epoch": 31.0, "grad_norm": 0.18095582723617554, "learning_rate": 0.0001891566265060241, "loss": 0.0057, "step": 31 }, { "epoch": 32.0, "grad_norm": 0.056214649230241776, "learning_rate": 0.00018875502008032128, "loss": 0.0048, "step": 32 }, { "epoch": 33.0, "grad_norm": 0.12311957776546478, "learning_rate": 0.0001883534136546185, "loss": 0.0051, "step": 33 }, { "epoch": 34.0, "grad_norm": 0.1186085045337677, "learning_rate": 0.00018795180722891569, "loss": 0.0052, "step": 34 }, { "epoch": 35.0, "grad_norm": 0.05622096359729767, "learning_rate": 0.00018755020080321285, "loss": 0.0048, "step": 35 }, { "epoch": 36.0, "grad_norm": 0.1179257333278656, "learning_rate": 0.00018714859437751004, "loss": 0.0051, "step": 36 }, { "epoch": 37.0, "grad_norm": 0.02104870229959488, "learning_rate": 0.00018674698795180723, "loss": 0.005, "step": 37 }, { "epoch": 38.0, "grad_norm": 0.09374430030584335, "learning_rate": 0.00018634538152610444, "loss": 0.005, "step": 38 }, { "epoch": 39.0, "grad_norm": 0.03337598219513893, "learning_rate": 0.0001859437751004016, "loss": 0.0048, "step": 39 }, { "epoch": 40.0, "grad_norm": 0.09240953624248505, "learning_rate": 0.0001855421686746988, "loss": 0.005, "step": 40 }, { "epoch": 41.0, "grad_norm": 0.02956731617450714, "learning_rate": 0.00018514056224899598, "loss": 0.0048, "step": 41 }, { "epoch": 42.0, "grad_norm": 0.08981137722730637, "learning_rate": 0.0001847389558232932, "loss": 0.005, "step": 42 }, { "epoch": 43.0, "grad_norm": 0.0320754237473011, "learning_rate": 0.00018433734939759036, "loss": 0.0048, "step": 43 }, { "epoch": 44.0, "grad_norm": 0.088747039437294, "learning_rate": 0.00018393574297188755, "loss": 0.005, "step": 44 }, { "epoch": 45.0, "grad_norm": 0.04862065240740776, "learning_rate": 0.00018353413654618474, "loss": 0.0048, "step": 45 }, { "epoch": 46.0, "grad_norm": 0.08655441552400589, "learning_rate": 0.00018313253012048193, "loss": 0.005, "step": 46 }, { "epoch": 47.0, "grad_norm": 0.06792537122964859, "learning_rate": 0.00018273092369477912, "loss": 0.0049, "step": 47 }, { "epoch": 48.0, "grad_norm": 0.08522295206785202, "learning_rate": 0.0001823293172690763, "loss": 0.005, "step": 48 }, { "epoch": 49.0, "grad_norm": 0.04663475975394249, "learning_rate": 0.0001819277108433735, "loss": 0.0048, "step": 49 }, { "epoch": 50.0, "grad_norm": 0.029744861647486687, "learning_rate": 0.0001815261044176707, "loss": 0.0048, "step": 50 }, { "epoch": 51.0, "grad_norm": 0.06534643471240997, "learning_rate": 0.0001811244979919679, "loss": 0.0049, "step": 51 }, { "epoch": 52.0, "grad_norm": 0.007808285299688578, "learning_rate": 0.00018072289156626507, "loss": 0.0047, "step": 52 }, { "epoch": 53.0, "grad_norm": 0.0546131432056427, "learning_rate": 0.00018032128514056225, "loss": 0.0051, "step": 53 }, { "epoch": 54.0, "grad_norm": 0.016065070405602455, "learning_rate": 0.00017991967871485944, "loss": 0.0045, "step": 54 }, { "epoch": 55.0, "grad_norm": 0.02857162430882454, "learning_rate": 0.00017951807228915663, "loss": 0.0048, "step": 55 }, { "epoch": 56.0, "grad_norm": 0.03628357872366905, "learning_rate": 0.00017911646586345382, "loss": 0.005, "step": 56 }, { "epoch": 57.0, "grad_norm": 0.0075448257848620415, "learning_rate": 0.000178714859437751, "loss": 0.0047, "step": 57 }, { "epoch": 58.0, "grad_norm": 0.04328390583395958, "learning_rate": 0.0001783132530120482, "loss": 0.0048, "step": 58 }, { "epoch": 59.0, "grad_norm": 0.0074650561437010765, "learning_rate": 0.0001779116465863454, "loss": 0.0047, "step": 59 }, { "epoch": 60.0, "grad_norm": 0.04476163163781166, "learning_rate": 0.00017751004016064258, "loss": 0.0048, "step": 60 }, { "epoch": 61.0, "grad_norm": 0.010204787366092205, "learning_rate": 0.00017710843373493977, "loss": 0.0047, "step": 61 }, { "epoch": 62.0, "grad_norm": 0.007382046431303024, "learning_rate": 0.00017670682730923696, "loss": 0.0047, "step": 62 }, { "epoch": 63.0, "grad_norm": 0.042141955345869064, "learning_rate": 0.00017630522088353415, "loss": 0.0048, "step": 63 }, { "epoch": 64.0, "grad_norm": 0.007281558588147163, "learning_rate": 0.00017590361445783134, "loss": 0.0047, "step": 64 }, { "epoch": 65.0, "grad_norm": 0.04382139816880226, "learning_rate": 0.00017550200803212853, "loss": 0.0048, "step": 65 }, { "epoch": 66.0, "grad_norm": 0.027046991512179375, "learning_rate": 0.00017510040160642571, "loss": 0.0048, "step": 66 }, { "epoch": 67.0, "grad_norm": 0.041668713092803955, "learning_rate": 0.0001746987951807229, "loss": 0.0048, "step": 67 }, { "epoch": 68.0, "grad_norm": 0.024397345259785652, "learning_rate": 0.0001742971887550201, "loss": 0.0048, "step": 68 }, { "epoch": 69.0, "grad_norm": 0.010038570500910282, "learning_rate": 0.00017389558232931728, "loss": 0.0047, "step": 69 }, { "epoch": 70.0, "grad_norm": 0.04328250139951706, "learning_rate": 0.00017349397590361447, "loss": 0.0048, "step": 70 }, { "epoch": 71.0, "grad_norm": 0.007169268559664488, "learning_rate": 0.00017309236947791166, "loss": 0.0047, "step": 71 }, { "epoch": 72.0, "grad_norm": 0.024196363985538483, "learning_rate": 0.00017269076305220885, "loss": 0.0048, "step": 72 }, { "epoch": 73.0, "grad_norm": 0.016377076506614685, "learning_rate": 0.00017228915662650604, "loss": 0.005, "step": 73 }, { "epoch": 74.0, "grad_norm": 0.009935774840414524, "learning_rate": 0.00017188755020080323, "loss": 0.0047, "step": 74 }, { "epoch": 75.0, "grad_norm": 0.026620058342814445, "learning_rate": 0.00017148594377510042, "loss": 0.0048, "step": 75 }, { "epoch": 76.0, "grad_norm": 0.00991370715200901, "learning_rate": 0.0001710843373493976, "loss": 0.0047, "step": 76 }, { "epoch": 77.0, "grad_norm": 0.041056908667087555, "learning_rate": 0.00017068273092369477, "loss": 0.0048, "step": 77 }, { "epoch": 78.0, "grad_norm": 0.0022562043741345406, "learning_rate": 0.00017028112449799199, "loss": 0.0045, "step": 78 }, { "epoch": 79.0, "grad_norm": 0.009892994537949562, "learning_rate": 0.00016987951807228917, "loss": 0.0047, "step": 79 }, { "epoch": 80.0, "grad_norm": 0.026598593220114708, "learning_rate": 0.00016947791164658636, "loss": 0.0048, "step": 80 }, { "epoch": 81.0, "grad_norm": 0.007094119675457478, "learning_rate": 0.00016907630522088353, "loss": 0.0047, "step": 81 }, { "epoch": 82.0, "grad_norm": 0.024074360728263855, "learning_rate": 0.00016867469879518074, "loss": 0.0048, "step": 82 }, { "epoch": 83.0, "grad_norm": 0.007143808528780937, "learning_rate": 0.00016827309236947793, "loss": 0.0047, "step": 83 }, { "epoch": 84.0, "grad_norm": 0.00991752091795206, "learning_rate": 0.00016787148594377512, "loss": 0.0047, "step": 84 }, { "epoch": 85.0, "grad_norm": 0.026643570512533188, "learning_rate": 0.00016746987951807228, "loss": 0.0048, "step": 85 }, { "epoch": 86.0, "grad_norm": 0.0071501159109175205, "learning_rate": 0.00016706827309236947, "loss": 0.0047, "step": 86 }, { "epoch": 87.0, "grad_norm": 0.041228219866752625, "learning_rate": 0.0001666666666666667, "loss": 0.0048, "step": 87 }, { "epoch": 88.0, "grad_norm": 0.026733651757240295, "learning_rate": 0.00016626506024096388, "loss": 0.0048, "step": 88 }, { "epoch": 89.0, "grad_norm": 0.01906830444931984, "learning_rate": 0.00016586345381526104, "loss": 0.0045, "step": 89 }, { "epoch": 90.0, "grad_norm": 0.016484271734952927, "learning_rate": 0.00016546184738955823, "loss": 0.005, "step": 90 }, { "epoch": 91.0, "grad_norm": 0.007174884434789419, "learning_rate": 0.00016506024096385545, "loss": 0.0047, "step": 91 }, { "epoch": 92.0, "grad_norm": 0.007238594815135002, "learning_rate": 0.00016465863453815263, "loss": 0.0047, "step": 92 }, { "epoch": 93.0, "grad_norm": 0.010034309700131416, "learning_rate": 0.0001642570281124498, "loss": 0.0047, "step": 93 }, { "epoch": 94.0, "grad_norm": 0.02700984664261341, "learning_rate": 0.00016385542168674699, "loss": 0.0048, "step": 94 }, { "epoch": 95.0, "grad_norm": 0.015189659781754017, "learning_rate": 0.00016345381526104417, "loss": 0.0045, "step": 95 }, { "epoch": 96.0, "grad_norm": 0.024589456617832184, "learning_rate": 0.0001630522088353414, "loss": 0.0048, "step": 96 }, { "epoch": 97.0, "grad_norm": 0.001375726773403585, "learning_rate": 0.00016265060240963855, "loss": 0.0049, "step": 97 }, { "epoch": 98.0, "grad_norm": 0.04419811815023422, "learning_rate": 0.00016224899598393574, "loss": 0.0048, "step": 98 }, { "epoch": 99.0, "grad_norm": 0.007295660208910704, "learning_rate": 0.00016184738955823293, "loss": 0.0047, "step": 99 }, { "epoch": 100.0, "grad_norm": 0.0422837920486927, "learning_rate": 0.00016144578313253015, "loss": 0.0048, "step": 100 }, { "epoch": 101.0, "grad_norm": 0.007468740921467543, "learning_rate": 0.0001610441767068273, "loss": 0.0047, "step": 101 }, { "epoch": 102.0, "grad_norm": 0.052220698446035385, "learning_rate": 0.0001606425702811245, "loss": 0.0051, "step": 102 }, { "epoch": 103.0, "grad_norm": 0.010211293585598469, "learning_rate": 0.0001602409638554217, "loss": 0.0047, "step": 103 }, { "epoch": 104.0, "grad_norm": 0.0423286072909832, "learning_rate": 0.00015983935742971888, "loss": 0.0048, "step": 104 }, { "epoch": 105.0, "grad_norm": 0.02506270445883274, "learning_rate": 0.00015943775100401607, "loss": 0.0048, "step": 105 }, { "epoch": 106.0, "grad_norm": 0.04458456113934517, "learning_rate": 0.00015903614457831326, "loss": 0.0048, "step": 106 }, { "epoch": 107.0, "grad_norm": 0.02753458172082901, "learning_rate": 0.00015863453815261045, "loss": 0.0048, "step": 107 }, { "epoch": 108.0, "grad_norm": 0.04234497249126434, "learning_rate": 0.00015823293172690763, "loss": 0.0048, "step": 108 }, { "epoch": 109.0, "grad_norm": 0.03287286311388016, "learning_rate": 0.00015783132530120482, "loss": 0.0046, "step": 109 }, { "epoch": 110.0, "grad_norm": 0.03671009838581085, "learning_rate": 0.000157429718875502, "loss": 0.0046, "step": 110 }, { "epoch": 111.0, "grad_norm": 0.044372282922267914, "learning_rate": 0.0001570281124497992, "loss": 0.0048, "step": 111 }, { "epoch": 112.0, "grad_norm": 0.03433435037732124, "learning_rate": 0.0001566265060240964, "loss": 0.005, "step": 112 }, { "epoch": 113.0, "grad_norm": 0.04201051965355873, "learning_rate": 0.00015622489959839358, "loss": 0.0048, "step": 113 }, { "epoch": 114.0, "grad_norm": 0.027228495106101036, "learning_rate": 0.00015582329317269077, "loss": 0.0048, "step": 114 }, { "epoch": 115.0, "grad_norm": 0.04395684599876404, "learning_rate": 0.00015542168674698796, "loss": 0.0048, "step": 115 }, { "epoch": 116.0, "grad_norm": 0.0418144129216671, "learning_rate": 0.00015502008032128515, "loss": 0.0048, "step": 116 }, { "epoch": 117.0, "grad_norm": 0.015051459893584251, "learning_rate": 0.00015461847389558234, "loss": 0.0045, "step": 117 }, { "epoch": 118.0, "grad_norm": 0.010102898813784122, "learning_rate": 0.00015421686746987953, "loss": 0.0047, "step": 118 }, { "epoch": 119.0, "grad_norm": 0.026845330372452736, "learning_rate": 0.00015381526104417672, "loss": 0.0048, "step": 119 }, { "epoch": 120.0, "grad_norm": 0.010000316426157951, "learning_rate": 0.0001534136546184739, "loss": 0.0047, "step": 120 }, { "epoch": 121.0, "grad_norm": 0.041023530066013336, "learning_rate": 0.0001530120481927711, "loss": 0.0048, "step": 121 }, { "epoch": 122.0, "grad_norm": 0.0072840056382119656, "learning_rate": 0.00015261044176706828, "loss": 0.0047, "step": 122 }, { "epoch": 123.0, "grad_norm": 0.02653086557984352, "learning_rate": 0.00015220883534136547, "loss": 0.0048, "step": 123 }, { "epoch": 124.0, "grad_norm": 0.01739220879971981, "learning_rate": 0.00015180722891566266, "loss": 0.005, "step": 124 }, { "epoch": 125.0, "grad_norm": 0.007263750769197941, "learning_rate": 0.00015140562248995985, "loss": 0.0047, "step": 125 }, { "epoch": 126.0, "grad_norm": 0.04079929739236832, "learning_rate": 0.00015100401606425701, "loss": 0.0048, "step": 126 }, { "epoch": 127.0, "grad_norm": 0.009831869974732399, "learning_rate": 0.00015060240963855423, "loss": 0.0047, "step": 127 }, { "epoch": 128.0, "grad_norm": 0.026599382981657982, "learning_rate": 0.00015020080321285142, "loss": 0.0048, "step": 128 }, { "epoch": 129.0, "grad_norm": 0.018224092200398445, "learning_rate": 0.0001497991967871486, "loss": 0.0047, "step": 129 }, { "epoch": 130.0, "grad_norm": 0.02792244777083397, "learning_rate": 0.00014939759036144577, "loss": 0.0047, "step": 130 }, { "epoch": 131.0, "grad_norm": 0.024047773331403732, "learning_rate": 0.000148995983935743, "loss": 0.0048, "step": 131 }, { "epoch": 132.0, "grad_norm": 0.018245236948132515, "learning_rate": 0.00014859437751004018, "loss": 0.0047, "step": 132 }, { "epoch": 133.0, "grad_norm": 0.026581475511193275, "learning_rate": 0.00014819277108433737, "loss": 0.0048, "step": 133 }, { "epoch": 134.0, "grad_norm": 0.007153503131121397, "learning_rate": 0.00014779116465863453, "loss": 0.0047, "step": 134 }, { "epoch": 135.0, "grad_norm": 0.0323675237596035, "learning_rate": 0.00014738955823293172, "loss": 0.0048, "step": 135 }, { "epoch": 136.0, "grad_norm": 0.005924216937273741, "learning_rate": 0.00014698795180722893, "loss": 0.0046, "step": 136 }, { "epoch": 137.0, "grad_norm": 0.026371264830231667, "learning_rate": 0.00014658634538152612, "loss": 0.0048, "step": 137 }, { "epoch": 138.0, "grad_norm": 0.002094594296067953, "learning_rate": 0.00014618473895582328, "loss": 0.0047, "step": 138 }, { "epoch": 139.0, "grad_norm": 0.023835647851228714, "learning_rate": 0.00014578313253012047, "loss": 0.0048, "step": 139 }, { "epoch": 140.0, "grad_norm": 0.007189361844211817, "learning_rate": 0.0001453815261044177, "loss": 0.0047, "step": 140 }, { "epoch": 141.0, "grad_norm": 0.021903127431869507, "learning_rate": 0.00014497991967871488, "loss": 0.0049, "step": 141 }, { "epoch": 142.0, "grad_norm": 0.009862157516181469, "learning_rate": 0.00014457831325301204, "loss": 0.0047, "step": 142 }, { "epoch": 143.0, "grad_norm": 0.01557657215744257, "learning_rate": 0.00014417670682730923, "loss": 0.0047, "step": 143 }, { "epoch": 144.0, "grad_norm": 0.01178675051778555, "learning_rate": 0.00014377510040160642, "loss": 0.0048, "step": 144 }, { "epoch": 145.0, "grad_norm": 0.009945346042513847, "learning_rate": 0.00014337349397590364, "loss": 0.0047, "step": 145 }, { "epoch": 146.0, "grad_norm": 0.013681800104677677, "learning_rate": 0.0001429718875502008, "loss": 0.0048, "step": 146 }, { "epoch": 147.0, "grad_norm": 0.0072199697606265545, "learning_rate": 0.000142570281124498, "loss": 0.0047, "step": 147 }, { "epoch": 148.0, "grad_norm": 0.015607825480401516, "learning_rate": 0.00014216867469879518, "loss": 0.0047, "step": 148 }, { "epoch": 149.0, "grad_norm": 0.00990898534655571, "learning_rate": 0.0001417670682730924, "loss": 0.0047, "step": 149 }, { "epoch": 150.0, "grad_norm": 0.009972168132662773, "learning_rate": 0.00014136546184738956, "loss": 0.0047, "step": 150 }, { "epoch": 151.0, "grad_norm": 0.007324350066483021, "learning_rate": 0.00014096385542168674, "loss": 0.0047, "step": 151 }, { "epoch": 152.0, "grad_norm": 0.007397031411528587, "learning_rate": 0.00014056224899598393, "loss": 0.0047, "step": 152 }, { "epoch": 153.0, "grad_norm": 0.00632756482809782, "learning_rate": 0.00014016064257028115, "loss": 0.0046, "step": 153 }, { "epoch": 154.0, "grad_norm": 0.0021140226162970066, "learning_rate": 0.00013975903614457834, "loss": 0.0047, "step": 154 }, { "epoch": 155.0, "grad_norm": 0.0021610369440168142, "learning_rate": 0.0001393574297188755, "loss": 0.0047, "step": 155 }, { "epoch": 156.0, "grad_norm": 0.004180264193564653, "learning_rate": 0.0001389558232931727, "loss": 0.0048, "step": 156 }, { "epoch": 157.0, "grad_norm": 0.002081150421872735, "learning_rate": 0.00013855421686746988, "loss": 0.0047, "step": 157 }, { "epoch": 158.0, "grad_norm": 0.004214874934405088, "learning_rate": 0.0001381526104417671, "loss": 0.0048, "step": 158 }, { "epoch": 159.0, "grad_norm": 0.002199581591412425, "learning_rate": 0.00013775100401606426, "loss": 0.0047, "step": 159 }, { "epoch": 160.0, "grad_norm": 0.010448895394802094, "learning_rate": 0.00013734939759036145, "loss": 0.0047, "step": 160 }, { "epoch": 161.0, "grad_norm": 0.007638930808752775, "learning_rate": 0.00013694779116465864, "loss": 0.0047, "step": 161 }, { "epoch": 162.0, "grad_norm": 0.007661975454539061, "learning_rate": 0.00013654618473895585, "loss": 0.0047, "step": 162 }, { "epoch": 163.0, "grad_norm": 0.0014610282378271222, "learning_rate": 0.00013614457831325302, "loss": 0.0047, "step": 163 }, { "epoch": 164.0, "grad_norm": 0.01049406360834837, "learning_rate": 0.0001357429718875502, "loss": 0.0047, "step": 164 }, { "epoch": 165.0, "grad_norm": 0.0021653317380696535, "learning_rate": 0.0001353413654618474, "loss": 0.0047, "step": 165 }, { "epoch": 166.0, "grad_norm": 0.016612282022833824, "learning_rate": 0.00013493975903614458, "loss": 0.0047, "step": 166 }, { "epoch": 167.0, "grad_norm": 0.005656089633703232, "learning_rate": 0.00013453815261044177, "loss": 0.0048, "step": 167 }, { "epoch": 168.0, "grad_norm": 0.015557671897113323, "learning_rate": 0.00013413654618473896, "loss": 0.0046, "step": 168 }, { "epoch": 169.0, "grad_norm": 0.00793896708637476, "learning_rate": 0.00013373493975903615, "loss": 0.0047, "step": 169 }, { "epoch": 170.0, "grad_norm": 0.016965791583061218, "learning_rate": 0.00013333333333333334, "loss": 0.0047, "step": 170 }, { "epoch": 171.0, "grad_norm": 0.010896142572164536, "learning_rate": 0.00013293172690763053, "loss": 0.0047, "step": 171 }, { "epoch": 172.0, "grad_norm": 0.014977889135479927, "learning_rate": 0.00013253012048192772, "loss": 0.0048, "step": 172 }, { "epoch": 173.0, "grad_norm": 0.007990415208041668, "learning_rate": 0.0001321285140562249, "loss": 0.0047, "step": 173 }, { "epoch": 174.0, "grad_norm": 0.01746082492172718, "learning_rate": 0.0001317269076305221, "loss": 0.0047, "step": 174 }, { "epoch": 175.0, "grad_norm": 0.011155808344483376, "learning_rate": 0.00013132530120481929, "loss": 0.0047, "step": 175 }, { "epoch": 176.0, "grad_norm": 0.020374253392219543, "learning_rate": 0.00013092369477911648, "loss": 0.0047, "step": 176 }, { "epoch": 177.0, "grad_norm": 0.027189314365386963, "learning_rate": 0.00013052208835341366, "loss": 0.0048, "step": 177 }, { "epoch": 178.0, "grad_norm": 0.0035853274166584015, "learning_rate": 0.00013012048192771085, "loss": 0.0046, "step": 178 }, { "epoch": 179.0, "grad_norm": 0.02057839184999466, "learning_rate": 0.00012971887550200804, "loss": 0.0047, "step": 179 }, { "epoch": 180.0, "grad_norm": 0.004144900944083929, "learning_rate": 0.00012931726907630523, "loss": 0.0047, "step": 180 }, { "epoch": 181.0, "grad_norm": 0.027396870777010918, "learning_rate": 0.00012891566265060242, "loss": 0.0048, "step": 181 }, { "epoch": 182.0, "grad_norm": 0.004257265478372574, "learning_rate": 0.0001285140562248996, "loss": 0.0048, "step": 182 }, { "epoch": 183.0, "grad_norm": 0.029999705031514168, "learning_rate": 0.0001281124497991968, "loss": 0.0048, "step": 183 }, { "epoch": 184.0, "grad_norm": 0.012998619116842747, "learning_rate": 0.00012771084337349396, "loss": 0.0047, "step": 184 }, { "epoch": 185.0, "grad_norm": 0.027277518063783646, "learning_rate": 0.00012730923694779118, "loss": 0.0048, "step": 185 }, { "epoch": 186.0, "grad_norm": 0.020436229184269905, "learning_rate": 0.00012690763052208837, "loss": 0.0047, "step": 186 }, { "epoch": 187.0, "grad_norm": 0.017912449315190315, "learning_rate": 0.00012650602409638556, "loss": 0.0048, "step": 187 }, { "epoch": 188.0, "grad_norm": 0.022624023258686066, "learning_rate": 0.00012610441767068272, "loss": 0.0047, "step": 188 }, { "epoch": 189.0, "grad_norm": 0.00817954819649458, "learning_rate": 0.00012570281124497994, "loss": 0.0047, "step": 189 }, { "epoch": 190.0, "grad_norm": 0.01837238110601902, "learning_rate": 0.00012530120481927712, "loss": 0.0047, "step": 190 }, { "epoch": 191.0, "grad_norm": 0.006303516216576099, "learning_rate": 0.0001248995983935743, "loss": 0.0047, "step": 191 }, { "epoch": 192.0, "grad_norm": 0.02228759415447712, "learning_rate": 0.00012449799196787148, "loss": 0.0047, "step": 192 }, { "epoch": 193.0, "grad_norm": 0.004886090289801359, "learning_rate": 0.0001240963855421687, "loss": 0.0047, "step": 193 }, { "epoch": 194.0, "grad_norm": 0.017957722768187523, "learning_rate": 0.00012369477911646588, "loss": 0.0048, "step": 194 }, { "epoch": 195.0, "grad_norm": 0.008203946985304356, "learning_rate": 0.00012329317269076307, "loss": 0.0047, "step": 195 }, { "epoch": 196.0, "grad_norm": 0.013346477411687374, "learning_rate": 0.00012289156626506023, "loss": 0.0047, "step": 196 }, { "epoch": 197.0, "grad_norm": 0.008646669797599316, "learning_rate": 0.00012248995983935742, "loss": 0.0048, "step": 197 }, { "epoch": 198.0, "grad_norm": 0.011337646283209324, "learning_rate": 0.00012208835341365464, "loss": 0.0047, "step": 198 }, { "epoch": 199.0, "grad_norm": 0.01342825498431921, "learning_rate": 0.00012168674698795181, "loss": 0.0047, "step": 199 }, { "epoch": 200.0, "grad_norm": 0.006050780415534973, "learning_rate": 0.000121285140562249, "loss": 0.0048, "step": 200 }, { "epoch": 201.0, "grad_norm": 0.01836731843650341, "learning_rate": 0.00012088353413654618, "loss": 0.0048, "step": 201 }, { "epoch": 202.0, "grad_norm": 0.008505699224770069, "learning_rate": 0.0001204819277108434, "loss": 0.0047, "step": 202 }, { "epoch": 203.0, "grad_norm": 0.008753190748393536, "learning_rate": 0.00012008032128514057, "loss": 0.0047, "step": 203 }, { "epoch": 204.0, "grad_norm": 0.009133870713412762, "learning_rate": 0.00011967871485943776, "loss": 0.0048, "step": 204 }, { "epoch": 205.0, "grad_norm": 0.0029124633874744177, "learning_rate": 0.00011927710843373494, "loss": 0.0048, "step": 205 }, { "epoch": 206.0, "grad_norm": 0.00847614649683237, "learning_rate": 0.00011887550200803212, "loss": 0.0047, "step": 206 }, { "epoch": 207.0, "grad_norm": 0.0028652322944253683, "learning_rate": 0.00011847389558232933, "loss": 0.0047, "step": 207 }, { "epoch": 208.0, "grad_norm": 0.009550940245389938, "learning_rate": 0.00011807228915662652, "loss": 0.0047, "step": 208 }, { "epoch": 209.0, "grad_norm": 0.008687314577400684, "learning_rate": 0.00011767068273092369, "loss": 0.0047, "step": 209 }, { "epoch": 210.0, "grad_norm": 0.004487687721848488, "learning_rate": 0.00011726907630522088, "loss": 0.0047, "step": 210 }, { "epoch": 211.0, "grad_norm": 0.009615003131330013, "learning_rate": 0.00011686746987951808, "loss": 0.0047, "step": 211 }, { "epoch": 212.0, "grad_norm": 0.001644686795771122, "learning_rate": 0.00011646586345381527, "loss": 0.0047, "step": 212 }, { "epoch": 213.0, "grad_norm": 0.012544874101877213, "learning_rate": 0.00011606425702811245, "loss": 0.0047, "step": 213 }, { "epoch": 214.0, "grad_norm": 0.009784480556845665, "learning_rate": 0.00011566265060240964, "loss": 0.0047, "step": 214 }, { "epoch": 215.0, "grad_norm": 0.008885402232408524, "learning_rate": 0.00011526104417670683, "loss": 0.0047, "step": 215 }, { "epoch": 216.0, "grad_norm": 0.015116319991648197, "learning_rate": 0.00011485943775100403, "loss": 0.0047, "step": 216 }, { "epoch": 217.0, "grad_norm": 0.0030038722325116396, "learning_rate": 0.0001144578313253012, "loss": 0.0047, "step": 217 }, { "epoch": 218.0, "grad_norm": 0.014432215131819248, "learning_rate": 0.0001140562248995984, "loss": 0.0047, "step": 218 }, { "epoch": 219.0, "grad_norm": 0.012555493041872978, "learning_rate": 0.00011365461847389558, "loss": 0.0047, "step": 219 }, { "epoch": 220.0, "grad_norm": 0.004350865725427866, "learning_rate": 0.00011325301204819279, "loss": 0.0047, "step": 220 }, { "epoch": 221.0, "grad_norm": 0.011415142565965652, "learning_rate": 0.00011285140562248996, "loss": 0.0047, "step": 221 }, { "epoch": 222.0, "grad_norm": 0.002292638411745429, "learning_rate": 0.00011244979919678715, "loss": 0.0047, "step": 222 }, { "epoch": 223.0, "grad_norm": 0.011944664642214775, "learning_rate": 0.00011204819277108434, "loss": 0.0047, "step": 223 }, { "epoch": 224.0, "grad_norm": 0.009497747756540775, "learning_rate": 0.00011164658634538152, "loss": 0.0047, "step": 224 }, { "epoch": 225.0, "grad_norm": 0.004834707360714674, "learning_rate": 0.00011124497991967872, "loss": 0.0047, "step": 225 }, { "epoch": 226.0, "grad_norm": 0.012152622453868389, "learning_rate": 0.00011084337349397591, "loss": 0.0048, "step": 226 }, { "epoch": 227.0, "grad_norm": 0.005508288741111755, "learning_rate": 0.0001104417670682731, "loss": 0.0047, "step": 227 }, { "epoch": 228.0, "grad_norm": 0.004662630148231983, "learning_rate": 0.00011004016064257027, "loss": 0.0047, "step": 228 }, { "epoch": 229.0, "grad_norm": 0.0046459161676466465, "learning_rate": 0.00010963855421686749, "loss": 0.0047, "step": 229 }, { "epoch": 230.0, "grad_norm": 0.003354718443006277, "learning_rate": 0.00010923694779116467, "loss": 0.0047, "step": 230 }, { "epoch": 231.0, "grad_norm": 0.005278999917209148, "learning_rate": 0.00010883534136546186, "loss": 0.0048, "step": 231 }, { "epoch": 232.0, "grad_norm": 0.006012341473251581, "learning_rate": 0.00010843373493975903, "loss": 0.0047, "step": 232 }, { "epoch": 233.0, "grad_norm": 0.0016911854036152363, "learning_rate": 0.00010803212851405625, "loss": 0.0047, "step": 233 }, { "epoch": 234.0, "grad_norm": 0.004478626884520054, "learning_rate": 0.00010763052208835342, "loss": 0.0047, "step": 234 }, { "epoch": 235.0, "grad_norm": 0.005508603993803263, "learning_rate": 0.00010722891566265061, "loss": 0.0047, "step": 235 }, { "epoch": 236.0, "grad_norm": 0.0032062928657978773, "learning_rate": 0.00010682730923694779, "loss": 0.0047, "step": 236 }, { "epoch": 237.0, "grad_norm": 0.006789966020733118, "learning_rate": 0.00010642570281124498, "loss": 0.0048, "step": 237 }, { "epoch": 238.0, "grad_norm": 0.005682968068867922, "learning_rate": 0.00010602409638554218, "loss": 0.0047, "step": 238 }, { "epoch": 239.0, "grad_norm": 0.00296304514631629, "learning_rate": 0.00010562248995983937, "loss": 0.0047, "step": 239 }, { "epoch": 240.0, "grad_norm": 0.006762088742107153, "learning_rate": 0.00010522088353413654, "loss": 0.0047, "step": 240 }, { "epoch": 241.0, "grad_norm": 0.0028762409929186106, "learning_rate": 0.00010481927710843373, "loss": 0.0047, "step": 241 }, { "epoch": 242.0, "grad_norm": 0.009846026077866554, "learning_rate": 0.00010441767068273094, "loss": 0.0047, "step": 242 }, { "epoch": 243.0, "grad_norm": 0.01071733795106411, "learning_rate": 0.00010401606425702813, "loss": 0.0047, "step": 243 }, { "epoch": 244.0, "grad_norm": 0.001721803448162973, "learning_rate": 0.0001036144578313253, "loss": 0.0047, "step": 244 }, { "epoch": 245.0, "grad_norm": 0.015101822093129158, "learning_rate": 0.00010321285140562249, "loss": 0.0047, "step": 245 }, { "epoch": 246.0, "grad_norm": 0.012098951265215874, "learning_rate": 0.00010281124497991968, "loss": 0.0047, "step": 246 }, { "epoch": 247.0, "grad_norm": 0.006853122264146805, "learning_rate": 0.00010240963855421688, "loss": 0.0047, "step": 247 }, { "epoch": 248.0, "grad_norm": 0.016420654952526093, "learning_rate": 0.00010200803212851406, "loss": 0.0047, "step": 248 }, { "epoch": 249.0, "grad_norm": 0.00433447165414691, "learning_rate": 0.00010160642570281125, "loss": 0.0047, "step": 249 }, { "epoch": 250.0, "grad_norm": 0.012108503840863705, "learning_rate": 0.00010120481927710844, "loss": 0.0047, "step": 250 }, { "epoch": 251.0, "grad_norm": 0.008616012521088123, "learning_rate": 0.00010080321285140564, "loss": 0.0047, "step": 251 }, { "epoch": 252.0, "grad_norm": 0.008652638643980026, "learning_rate": 0.00010040160642570282, "loss": 0.0047, "step": 252 }, { "epoch": 253.0, "grad_norm": 0.01692233793437481, "learning_rate": 0.0001, "loss": 0.0047, "step": 253 }, { "epoch": 254.0, "grad_norm": 0.006091867107897997, "learning_rate": 9.95983935742972e-05, "loss": 0.0047, "step": 254 }, { "epoch": 255.0, "grad_norm": 0.011222707107663155, "learning_rate": 9.919678714859438e-05, "loss": 0.0047, "step": 255 }, { "epoch": 256.0, "grad_norm": 0.011931284330785275, "learning_rate": 9.879518072289157e-05, "loss": 0.0047, "step": 256 }, { "epoch": 257.0, "grad_norm": 0.004332751967012882, "learning_rate": 9.839357429718876e-05, "loss": 0.0048, "step": 257 }, { "epoch": 258.0, "grad_norm": 0.013347852043807507, "learning_rate": 9.799196787148595e-05, "loss": 0.0047, "step": 258 }, { "epoch": 259.0, "grad_norm": 0.006608230993151665, "learning_rate": 9.759036144578314e-05, "loss": 0.0047, "step": 259 }, { "epoch": 260.0, "grad_norm": 0.004314239602535963, "learning_rate": 9.718875502008033e-05, "loss": 0.0048, "step": 260 }, { "epoch": 261.0, "grad_norm": 0.0060504162684082985, "learning_rate": 9.678714859437752e-05, "loss": 0.0048, "step": 261 }, { "epoch": 262.0, "grad_norm": 0.0033889045007526875, "learning_rate": 9.638554216867471e-05, "loss": 0.0047, "step": 262 }, { "epoch": 263.0, "grad_norm": 0.008607766591012478, "learning_rate": 9.598393574297188e-05, "loss": 0.0048, "step": 263 }, { "epoch": 264.0, "grad_norm": 0.001759338192641735, "learning_rate": 9.558232931726909e-05, "loss": 0.0047, "step": 264 }, { "epoch": 265.0, "grad_norm": 0.008092672564089298, "learning_rate": 9.518072289156626e-05, "loss": 0.0047, "step": 265 }, { "epoch": 266.0, "grad_norm": 0.0017322164494544268, "learning_rate": 9.477911646586346e-05, "loss": 0.0047, "step": 266 }, { "epoch": 267.0, "grad_norm": 0.0017360730562359095, "learning_rate": 9.437751004016064e-05, "loss": 0.0047, "step": 267 }, { "epoch": 268.0, "grad_norm": 0.0017291605472564697, "learning_rate": 9.397590361445784e-05, "loss": 0.0047, "step": 268 }, { "epoch": 269.0, "grad_norm": 0.0016873552231118083, "learning_rate": 9.357429718875502e-05, "loss": 0.0047, "step": 269 }, { "epoch": 270.0, "grad_norm": 0.007920237258076668, "learning_rate": 9.317269076305222e-05, "loss": 0.0047, "step": 270 }, { "epoch": 271.0, "grad_norm": 0.0036651096306741238, "learning_rate": 9.27710843373494e-05, "loss": 0.0048, "step": 271 }, { "epoch": 272.0, "grad_norm": 0.01999688521027565, "learning_rate": 9.23694779116466e-05, "loss": 0.0047, "step": 272 }, { "epoch": 273.0, "grad_norm": 0.007801530417054892, "learning_rate": 9.196787148594378e-05, "loss": 0.0047, "step": 273 }, { "epoch": 274.0, "grad_norm": 0.012862344272434711, "learning_rate": 9.156626506024096e-05, "loss": 0.0048, "step": 274 }, { "epoch": 275.0, "grad_norm": 0.0016800492303445935, "learning_rate": 9.116465863453815e-05, "loss": 0.0047, "step": 275 }, { "epoch": 276.0, "grad_norm": 0.019683390855789185, "learning_rate": 9.076305220883534e-05, "loss": 0.0047, "step": 276 }, { "epoch": 277.0, "grad_norm": 0.0017172418301925063, "learning_rate": 9.036144578313253e-05, "loss": 0.0047, "step": 277 }, { "epoch": 278.0, "grad_norm": 0.030774248763918877, "learning_rate": 8.995983935742972e-05, "loss": 0.0049, "step": 278 }, { "epoch": 279.0, "grad_norm": 0.0035772966220974922, "learning_rate": 8.955823293172691e-05, "loss": 0.0048, "step": 279 }, { "epoch": 280.0, "grad_norm": 0.03694264590740204, "learning_rate": 8.91566265060241e-05, "loss": 0.0048, "step": 280 }, { "epoch": 281.0, "grad_norm": 0.0031574727036058903, "learning_rate": 8.875502008032129e-05, "loss": 0.0045, "step": 281 }, { "epoch": 282.0, "grad_norm": 0.03539184108376503, "learning_rate": 8.835341365461848e-05, "loss": 0.005, "step": 282 }, { "epoch": 283.0, "grad_norm": 0.010516048409044743, "learning_rate": 8.795180722891567e-05, "loss": 0.0047, "step": 283 }, { "epoch": 284.0, "grad_norm": 0.010487216524779797, "learning_rate": 8.755020080321286e-05, "loss": 0.0047, "step": 284 }, { "epoch": 285.0, "grad_norm": 0.010434217751026154, "learning_rate": 8.714859437751005e-05, "loss": 0.0047, "step": 285 }, { "epoch": 286.0, "grad_norm": 0.007612716872245073, "learning_rate": 8.674698795180724e-05, "loss": 0.0047, "step": 286 }, { "epoch": 287.0, "grad_norm": 0.025456130504608154, "learning_rate": 8.634538152610442e-05, "loss": 0.0048, "step": 287 }, { "epoch": 288.0, "grad_norm": 0.010371106676757336, "learning_rate": 8.594377510040161e-05, "loss": 0.0047, "step": 288 }, { "epoch": 289.0, "grad_norm": 0.028378885239362717, "learning_rate": 8.55421686746988e-05, "loss": 0.0048, "step": 289 }, { "epoch": 290.0, "grad_norm": 0.007674542721360922, "learning_rate": 8.514056224899599e-05, "loss": 0.0047, "step": 290 }, { "epoch": 291.0, "grad_norm": 0.025507695972919464, "learning_rate": 8.473895582329318e-05, "loss": 0.0048, "step": 291 }, { "epoch": 292.0, "grad_norm": 0.0009824644075706601, "learning_rate": 8.433734939759037e-05, "loss": 0.0049, "step": 292 }, { "epoch": 293.0, "grad_norm": 0.03781874477863312, "learning_rate": 8.393574297188756e-05, "loss": 0.0046, "step": 293 }, { "epoch": 294.0, "grad_norm": 0.007710340432822704, "learning_rate": 8.353413654618474e-05, "loss": 0.0047, "step": 294 }, { "epoch": 295.0, "grad_norm": 0.03508257865905762, "learning_rate": 8.313253012048194e-05, "loss": 0.005, "step": 295 }, { "epoch": 296.0, "grad_norm": 0.007686258759349585, "learning_rate": 8.273092369477911e-05, "loss": 0.0047, "step": 296 }, { "epoch": 297.0, "grad_norm": 0.04559750109910965, "learning_rate": 8.232931726907632e-05, "loss": 0.0048, "step": 297 }, { "epoch": 298.0, "grad_norm": 0.010225856676697731, "learning_rate": 8.192771084337349e-05, "loss": 0.0047, "step": 298 }, { "epoch": 299.0, "grad_norm": 0.02517218515276909, "learning_rate": 8.15261044176707e-05, "loss": 0.0048, "step": 299 }, { "epoch": 300.0, "grad_norm": 0.025102809071540833, "learning_rate": 8.112449799196787e-05, "loss": 0.0048, "step": 300 }, { "epoch": 301.0, "grad_norm": 0.0025317783001810312, "learning_rate": 8.072289156626507e-05, "loss": 0.0045, "step": 301 }, { "epoch": 302.0, "grad_norm": 0.0447322279214859, "learning_rate": 8.032128514056225e-05, "loss": 0.0048, "step": 302 }, { "epoch": 303.0, "grad_norm": 0.0074921357445418835, "learning_rate": 7.991967871485944e-05, "loss": 0.0047, "step": 303 }, { "epoch": 304.0, "grad_norm": 0.02473229542374611, "learning_rate": 7.951807228915663e-05, "loss": 0.0048, "step": 304 }, { "epoch": 305.0, "grad_norm": 0.007424222771078348, "learning_rate": 7.911646586345382e-05, "loss": 0.0047, "step": 305 }, { "epoch": 306.0, "grad_norm": 0.027306661009788513, "learning_rate": 7.8714859437751e-05, "loss": 0.0048, "step": 306 }, { "epoch": 307.0, "grad_norm": 0.007345912978053093, "learning_rate": 7.83132530120482e-05, "loss": 0.0047, "step": 307 }, { "epoch": 308.0, "grad_norm": 0.02444930374622345, "learning_rate": 7.791164658634539e-05, "loss": 0.0048, "step": 308 }, { "epoch": 309.0, "grad_norm": 0.02708006091415882, "learning_rate": 7.751004016064257e-05, "loss": 0.0048, "step": 309 }, { "epoch": 310.0, "grad_norm": 0.007334452122449875, "learning_rate": 7.710843373493976e-05, "loss": 0.0047, "step": 310 }, { "epoch": 311.0, "grad_norm": 0.009786078706383705, "learning_rate": 7.670682730923695e-05, "loss": 0.0047, "step": 311 }, { "epoch": 312.0, "grad_norm": 0.007255980744957924, "learning_rate": 7.630522088353414e-05, "loss": 0.0047, "step": 312 }, { "epoch": 313.0, "grad_norm": 0.014863966032862663, "learning_rate": 7.590361445783133e-05, "loss": 0.0045, "step": 313 }, { "epoch": 314.0, "grad_norm": 0.009730237536132336, "learning_rate": 7.550200803212851e-05, "loss": 0.0047, "step": 314 }, { "epoch": 315.0, "grad_norm": 0.00974965188652277, "learning_rate": 7.510040160642571e-05, "loss": 0.0047, "step": 315 }, { "epoch": 316.0, "grad_norm": 0.0008471576729789376, "learning_rate": 7.469879518072289e-05, "loss": 0.0049, "step": 316 }, { "epoch": 317.0, "grad_norm": 0.007247288711369038, "learning_rate": 7.429718875502009e-05, "loss": 0.0047, "step": 317 }, { "epoch": 318.0, "grad_norm": 0.00724770175293088, "learning_rate": 7.389558232931726e-05, "loss": 0.0047, "step": 318 }, { "epoch": 319.0, "grad_norm": 0.009726744145154953, "learning_rate": 7.349397590361447e-05, "loss": 0.0047, "step": 319 }, { "epoch": 320.0, "grad_norm": 0.002370405476540327, "learning_rate": 7.309236947791164e-05, "loss": 0.0045, "step": 320 }, { "epoch": 321.0, "grad_norm": 0.007258490659296513, "learning_rate": 7.269076305220885e-05, "loss": 0.0047, "step": 321 }, { "epoch": 322.0, "grad_norm": 0.009763582609593868, "learning_rate": 7.228915662650602e-05, "loss": 0.0047, "step": 322 }, { "epoch": 323.0, "grad_norm": 0.0072934250347316265, "learning_rate": 7.188755020080321e-05, "loss": 0.0047, "step": 323 }, { "epoch": 324.0, "grad_norm": 0.007325456012040377, "learning_rate": 7.14859437751004e-05, "loss": 0.0047, "step": 324 }, { "epoch": 325.0, "grad_norm": 0.02716483175754547, "learning_rate": 7.108433734939759e-05, "loss": 0.0048, "step": 325 }, { "epoch": 326.0, "grad_norm": 0.007358341012150049, "learning_rate": 7.068273092369478e-05, "loss": 0.0047, "step": 326 }, { "epoch": 327.0, "grad_norm": 0.04177234321832657, "learning_rate": 7.028112449799197e-05, "loss": 0.0048, "step": 327 }, { "epoch": 328.0, "grad_norm": 0.027355682104825974, "learning_rate": 6.987951807228917e-05, "loss": 0.0048, "step": 328 }, { "epoch": 329.0, "grad_norm": 0.02739114686846733, "learning_rate": 6.947791164658635e-05, "loss": 0.0048, "step": 329 }, { "epoch": 330.0, "grad_norm": 0.024521242827177048, "learning_rate": 6.907630522088355e-05, "loss": 0.0048, "step": 330 }, { "epoch": 331.0, "grad_norm": 0.007352576591074467, "learning_rate": 6.867469879518072e-05, "loss": 0.0047, "step": 331 }, { "epoch": 332.0, "grad_norm": 0.0023073432967066765, "learning_rate": 6.827309236947793e-05, "loss": 0.0045, "step": 332 }, { "epoch": 333.0, "grad_norm": 0.009870602749288082, "learning_rate": 6.78714859437751e-05, "loss": 0.0047, "step": 333 }, { "epoch": 334.0, "grad_norm": 0.00988033413887024, "learning_rate": 6.746987951807229e-05, "loss": 0.0047, "step": 334 }, { "epoch": 335.0, "grad_norm": 0.024536525830626488, "learning_rate": 6.706827309236948e-05, "loss": 0.0048, "step": 335 }, { "epoch": 336.0, "grad_norm": 0.009867388755083084, "learning_rate": 6.666666666666667e-05, "loss": 0.0047, "step": 336 }, { "epoch": 337.0, "grad_norm": 0.00988020095974207, "learning_rate": 6.626506024096386e-05, "loss": 0.0047, "step": 337 }, { "epoch": 338.0, "grad_norm": 0.007355022244155407, "learning_rate": 6.586345381526105e-05, "loss": 0.0047, "step": 338 }, { "epoch": 339.0, "grad_norm": 0.007368543650954962, "learning_rate": 6.546184738955824e-05, "loss": 0.0047, "step": 339 }, { "epoch": 340.0, "grad_norm": 0.0073932805098593235, "learning_rate": 6.506024096385543e-05, "loss": 0.0047, "step": 340 }, { "epoch": 341.0, "grad_norm": 0.027419747784733772, "learning_rate": 6.465863453815262e-05, "loss": 0.0048, "step": 341 }, { "epoch": 342.0, "grad_norm": 0.009928101673722267, "learning_rate": 6.42570281124498e-05, "loss": 0.0047, "step": 342 }, { "epoch": 343.0, "grad_norm": 0.03391006961464882, "learning_rate": 6.385542168674698e-05, "loss": 0.005, "step": 343 }, { "epoch": 344.0, "grad_norm": 0.024691926315426826, "learning_rate": 6.345381526104418e-05, "loss": 0.0048, "step": 344 }, { "epoch": 345.0, "grad_norm": 0.036551907658576965, "learning_rate": 6.305220883534136e-05, "loss": 0.0046, "step": 345 }, { "epoch": 346.0, "grad_norm": 0.027336876839399338, "learning_rate": 6.265060240963856e-05, "loss": 0.0048, "step": 346 }, { "epoch": 347.0, "grad_norm": 0.016788320615887642, "learning_rate": 6.224899598393574e-05, "loss": 0.005, "step": 347 }, { "epoch": 348.0, "grad_norm": 0.0243778508156538, "learning_rate": 6.184738955823294e-05, "loss": 0.0048, "step": 348 }, { "epoch": 349.0, "grad_norm": 0.007300014141947031, "learning_rate": 6.144578313253012e-05, "loss": 0.0047, "step": 349 }, { "epoch": 350.0, "grad_norm": 0.01766958087682724, "learning_rate": 6.104417670682732e-05, "loss": 0.005, "step": 350 }, { "epoch": 351.0, "grad_norm": 0.026927420869469643, "learning_rate": 6.06425702811245e-05, "loss": 0.0048, "step": 351 }, { "epoch": 352.0, "grad_norm": 0.016552148386836052, "learning_rate": 6.02409638554217e-05, "loss": 0.005, "step": 352 }, { "epoch": 353.0, "grad_norm": 0.02408471331000328, "learning_rate": 5.983935742971888e-05, "loss": 0.0048, "step": 353 }, { "epoch": 354.0, "grad_norm": 0.0007216089288704097, "learning_rate": 5.943775100401606e-05, "loss": 0.0049, "step": 354 }, { "epoch": 355.0, "grad_norm": 0.009755726903676987, "learning_rate": 5.903614457831326e-05, "loss": 0.0047, "step": 355 }, { "epoch": 356.0, "grad_norm": 0.026764798909425735, "learning_rate": 5.863453815261044e-05, "loss": 0.0048, "step": 356 }, { "epoch": 357.0, "grad_norm": 0.007179186213761568, "learning_rate": 5.823293172690764e-05, "loss": 0.0047, "step": 357 }, { "epoch": 358.0, "grad_norm": 0.023883482441306114, "learning_rate": 5.783132530120482e-05, "loss": 0.0048, "step": 358 }, { "epoch": 359.0, "grad_norm": 0.007125131320208311, "learning_rate": 5.7429718875502015e-05, "loss": 0.0047, "step": 359 }, { "epoch": 360.0, "grad_norm": 0.026448102667927742, "learning_rate": 5.70281124497992e-05, "loss": 0.0048, "step": 360 }, { "epoch": 361.0, "grad_norm": 0.009603764861822128, "learning_rate": 5.6626506024096394e-05, "loss": 0.0047, "step": 361 }, { "epoch": 362.0, "grad_norm": 0.023749757558107376, "learning_rate": 5.6224899598393576e-05, "loss": 0.0048, "step": 362 }, { "epoch": 363.0, "grad_norm": 0.007082722615450621, "learning_rate": 5.582329317269076e-05, "loss": 0.0047, "step": 363 }, { "epoch": 364.0, "grad_norm": 0.0006819483824074268, "learning_rate": 5.5421686746987955e-05, "loss": 0.0049, "step": 364 }, { "epoch": 365.0, "grad_norm": 0.026299143210053444, "learning_rate": 5.502008032128514e-05, "loss": 0.0048, "step": 365 }, { "epoch": 366.0, "grad_norm": 0.007077427115291357, "learning_rate": 5.461847389558233e-05, "loss": 0.0047, "step": 366 }, { "epoch": 367.0, "grad_norm": 0.0070725963450968266, "learning_rate": 5.4216867469879516e-05, "loss": 0.0047, "step": 367 }, { "epoch": 368.0, "grad_norm": 0.007042643614113331, "learning_rate": 5.381526104417671e-05, "loss": 0.0047, "step": 368 }, { "epoch": 369.0, "grad_norm": 0.009521468542516232, "learning_rate": 5.3413654618473894e-05, "loss": 0.0047, "step": 369 }, { "epoch": 370.0, "grad_norm": 0.0022259766701608896, "learning_rate": 5.301204819277109e-05, "loss": 0.0045, "step": 370 }, { "epoch": 371.0, "grad_norm": 0.023667145520448685, "learning_rate": 5.261044176706827e-05, "loss": 0.0048, "step": 371 }, { "epoch": 372.0, "grad_norm": 0.009567582048475742, "learning_rate": 5.220883534136547e-05, "loss": 0.0047, "step": 372 }, { "epoch": 373.0, "grad_norm": 0.00958697684109211, "learning_rate": 5.180722891566265e-05, "loss": 0.0047, "step": 373 }, { "epoch": 374.0, "grad_norm": 0.00959386583417654, "learning_rate": 5.140562248995984e-05, "loss": 0.0047, "step": 374 }, { "epoch": 375.0, "grad_norm": 0.00957813672721386, "learning_rate": 5.100401606425703e-05, "loss": 0.0047, "step": 375 }, { "epoch": 376.0, "grad_norm": 0.016183258965611458, "learning_rate": 5.060240963855422e-05, "loss": 0.005, "step": 376 }, { "epoch": 377.0, "grad_norm": 0.007078688126057386, "learning_rate": 5.020080321285141e-05, "loss": 0.0047, "step": 377 }, { "epoch": 378.0, "grad_norm": 0.007077342830598354, "learning_rate": 4.97991967871486e-05, "loss": 0.0047, "step": 378 }, { "epoch": 379.0, "grad_norm": 0.002201406517997384, "learning_rate": 4.9397590361445786e-05, "loss": 0.0045, "step": 379 }, { "epoch": 380.0, "grad_norm": 0.009655999019742012, "learning_rate": 4.8995983935742975e-05, "loss": 0.0047, "step": 380 }, { "epoch": 381.0, "grad_norm": 0.017296917736530304, "learning_rate": 4.8594377510040165e-05, "loss": 0.005, "step": 381 }, { "epoch": 382.0, "grad_norm": 0.007125664968043566, "learning_rate": 4.8192771084337354e-05, "loss": 0.0047, "step": 382 }, { "epoch": 383.0, "grad_norm": 0.0238560251891613, "learning_rate": 4.779116465863454e-05, "loss": 0.0048, "step": 383 }, { "epoch": 384.0, "grad_norm": 0.009672388434410095, "learning_rate": 4.738955823293173e-05, "loss": 0.0047, "step": 384 }, { "epoch": 385.0, "grad_norm": 0.0022134389728307724, "learning_rate": 4.698795180722892e-05, "loss": 0.0045, "step": 385 }, { "epoch": 386.0, "grad_norm": 0.00969780795276165, "learning_rate": 4.658634538152611e-05, "loss": 0.0047, "step": 386 }, { "epoch": 387.0, "grad_norm": 0.007160300388932228, "learning_rate": 4.61847389558233e-05, "loss": 0.0047, "step": 387 }, { "epoch": 388.0, "grad_norm": 0.007146279327571392, "learning_rate": 4.578313253012048e-05, "loss": 0.0047, "step": 388 }, { "epoch": 389.0, "grad_norm": 0.002192781073972583, "learning_rate": 4.538152610441767e-05, "loss": 0.0045, "step": 389 }, { "epoch": 390.0, "grad_norm": 0.009754459373652935, "learning_rate": 4.497991967871486e-05, "loss": 0.0047, "step": 390 }, { "epoch": 391.0, "grad_norm": 0.00970425084233284, "learning_rate": 4.457831325301205e-05, "loss": 0.0047, "step": 391 }, { "epoch": 392.0, "grad_norm": 0.007177860010415316, "learning_rate": 4.417670682730924e-05, "loss": 0.0047, "step": 392 }, { "epoch": 393.0, "grad_norm": 0.024075862020254135, "learning_rate": 4.377510040160643e-05, "loss": 0.0048, "step": 393 }, { "epoch": 394.0, "grad_norm": 0.009754209779202938, "learning_rate": 4.337349397590362e-05, "loss": 0.0047, "step": 394 }, { "epoch": 395.0, "grad_norm": 0.009791336953639984, "learning_rate": 4.297188755020081e-05, "loss": 0.0047, "step": 395 }, { "epoch": 396.0, "grad_norm": 0.009780234657227993, "learning_rate": 4.2570281124497996e-05, "loss": 0.0047, "step": 396 }, { "epoch": 397.0, "grad_norm": 0.007180201821029186, "learning_rate": 4.2168674698795186e-05, "loss": 0.0047, "step": 397 }, { "epoch": 398.0, "grad_norm": 0.007196042221039534, "learning_rate": 4.176706827309237e-05, "loss": 0.0047, "step": 398 }, { "epoch": 399.0, "grad_norm": 0.00722030783072114, "learning_rate": 4.136546184738956e-05, "loss": 0.0047, "step": 399 }, { "epoch": 400.0, "grad_norm": 0.009809617884457111, "learning_rate": 4.0963855421686746e-05, "loss": 0.0047, "step": 400 }, { "epoch": 401.0, "grad_norm": 0.007194210775196552, "learning_rate": 4.0562248995983936e-05, "loss": 0.0047, "step": 401 }, { "epoch": 402.0, "grad_norm": 0.00984671525657177, "learning_rate": 4.0160642570281125e-05, "loss": 0.0047, "step": 402 }, { "epoch": 403.0, "grad_norm": 0.00983081478625536, "learning_rate": 3.9759036144578314e-05, "loss": 0.0047, "step": 403 }, { "epoch": 404.0, "grad_norm": 0.007195820100605488, "learning_rate": 3.93574297188755e-05, "loss": 0.0047, "step": 404 }, { "epoch": 405.0, "grad_norm": 0.007182563189417124, "learning_rate": 3.895582329317269e-05, "loss": 0.0047, "step": 405 }, { "epoch": 406.0, "grad_norm": 0.007222824264317751, "learning_rate": 3.855421686746988e-05, "loss": 0.0047, "step": 406 }, { "epoch": 407.0, "grad_norm": 0.007211462128907442, "learning_rate": 3.815261044176707e-05, "loss": 0.0047, "step": 407 }, { "epoch": 408.0, "grad_norm": 0.009921679273247719, "learning_rate": 3.7751004016064253e-05, "loss": 0.0047, "step": 408 }, { "epoch": 409.0, "grad_norm": 0.02687370777130127, "learning_rate": 3.734939759036144e-05, "loss": 0.0048, "step": 409 }, { "epoch": 410.0, "grad_norm": 0.009853348135948181, "learning_rate": 3.694779116465863e-05, "loss": 0.0047, "step": 410 }, { "epoch": 411.0, "grad_norm": 0.024240443482995033, "learning_rate": 3.654618473895582e-05, "loss": 0.0048, "step": 411 }, { "epoch": 412.0, "grad_norm": 0.04131508618593216, "learning_rate": 3.614457831325301e-05, "loss": 0.0048, "step": 412 }, { "epoch": 413.0, "grad_norm": 0.009910643100738525, "learning_rate": 3.57429718875502e-05, "loss": 0.0047, "step": 413 }, { "epoch": 414.0, "grad_norm": 0.03583821654319763, "learning_rate": 3.534136546184739e-05, "loss": 0.0046, "step": 414 }, { "epoch": 415.0, "grad_norm": 0.017576098442077637, "learning_rate": 3.4939759036144585e-05, "loss": 0.005, "step": 415 }, { "epoch": 416.0, "grad_norm": 0.01653764583170414, "learning_rate": 3.4538152610441774e-05, "loss": 0.005, "step": 416 }, { "epoch": 417.0, "grad_norm": 0.024247560650110245, "learning_rate": 3.413654618473896e-05, "loss": 0.0048, "step": 417 }, { "epoch": 418.0, "grad_norm": 0.007164428010582924, "learning_rate": 3.3734939759036146e-05, "loss": 0.0047, "step": 418 }, { "epoch": 419.0, "grad_norm": 0.0098257539793849, "learning_rate": 3.3333333333333335e-05, "loss": 0.0047, "step": 419 }, { "epoch": 420.0, "grad_norm": 0.017520597204566002, "learning_rate": 3.2931726907630524e-05, "loss": 0.005, "step": 420 }, { "epoch": 421.0, "grad_norm": 0.018933523446321487, "learning_rate": 3.253012048192771e-05, "loss": 0.0045, "step": 421 }, { "epoch": 422.0, "grad_norm": 0.007097120396792889, "learning_rate": 3.21285140562249e-05, "loss": 0.0047, "step": 422 }, { "epoch": 423.0, "grad_norm": 0.024037552997469902, "learning_rate": 3.172690763052209e-05, "loss": 0.0048, "step": 423 }, { "epoch": 424.0, "grad_norm": 0.024066118523478508, "learning_rate": 3.132530120481928e-05, "loss": 0.0048, "step": 424 }, { "epoch": 425.0, "grad_norm": 0.0006517537985928357, "learning_rate": 3.092369477911647e-05, "loss": 0.0049, "step": 425 }, { "epoch": 426.0, "grad_norm": 0.034052085131406784, "learning_rate": 3.052208835341366e-05, "loss": 0.005, "step": 426 }, { "epoch": 427.0, "grad_norm": 0.017422359436750412, "learning_rate": 3.012048192771085e-05, "loss": 0.005, "step": 427 }, { "epoch": 428.0, "grad_norm": 0.009725161828100681, "learning_rate": 2.971887550200803e-05, "loss": 0.0047, "step": 428 }, { "epoch": 429.0, "grad_norm": 0.023919757455587387, "learning_rate": 2.931726907630522e-05, "loss": 0.0048, "step": 429 }, { "epoch": 430.0, "grad_norm": 0.04076967015862465, "learning_rate": 2.891566265060241e-05, "loss": 0.0048, "step": 430 }, { "epoch": 431.0, "grad_norm": 0.002202529925853014, "learning_rate": 2.85140562248996e-05, "loss": 0.0045, "step": 431 }, { "epoch": 432.0, "grad_norm": 0.026460327208042145, "learning_rate": 2.8112449799196788e-05, "loss": 0.0048, "step": 432 }, { "epoch": 433.0, "grad_norm": 0.026399288326501846, "learning_rate": 2.7710843373493977e-05, "loss": 0.0048, "step": 433 }, { "epoch": 434.0, "grad_norm": 0.007069241255521774, "learning_rate": 2.7309236947791167e-05, "loss": 0.0047, "step": 434 }, { "epoch": 435.0, "grad_norm": 0.023836608976125717, "learning_rate": 2.6907630522088356e-05, "loss": 0.0048, "step": 435 }, { "epoch": 436.0, "grad_norm": 0.007101175840944052, "learning_rate": 2.6506024096385545e-05, "loss": 0.0047, "step": 436 }, { "epoch": 437.0, "grad_norm": 0.009633008390665054, "learning_rate": 2.6104417670682734e-05, "loss": 0.0047, "step": 437 }, { "epoch": 438.0, "grad_norm": 0.00965973362326622, "learning_rate": 2.570281124497992e-05, "loss": 0.0047, "step": 438 }, { "epoch": 439.0, "grad_norm": 0.009659296832978725, "learning_rate": 2.530120481927711e-05, "loss": 0.0047, "step": 439 }, { "epoch": 440.0, "grad_norm": 0.00703496765345335, "learning_rate": 2.48995983935743e-05, "loss": 0.0047, "step": 440 }, { "epoch": 441.0, "grad_norm": 0.007030785549432039, "learning_rate": 2.4497991967871488e-05, "loss": 0.0047, "step": 441 }, { "epoch": 442.0, "grad_norm": 0.007043666671961546, "learning_rate": 2.4096385542168677e-05, "loss": 0.0047, "step": 442 }, { "epoch": 443.0, "grad_norm": 0.007040888071060181, "learning_rate": 2.3694779116465866e-05, "loss": 0.0047, "step": 443 }, { "epoch": 444.0, "grad_norm": 0.009699525311589241, "learning_rate": 2.3293172690763055e-05, "loss": 0.0047, "step": 444 }, { "epoch": 445.0, "grad_norm": 0.0007015722803771496, "learning_rate": 2.289156626506024e-05, "loss": 0.0049, "step": 445 }, { "epoch": 446.0, "grad_norm": 0.00966339185833931, "learning_rate": 2.248995983935743e-05, "loss": 0.0047, "step": 446 }, { "epoch": 447.0, "grad_norm": 0.009668245911598206, "learning_rate": 2.208835341365462e-05, "loss": 0.0047, "step": 447 }, { "epoch": 448.0, "grad_norm": 0.007046937942504883, "learning_rate": 2.168674698795181e-05, "loss": 0.0047, "step": 448 }, { "epoch": 449.0, "grad_norm": 0.0070527587085962296, "learning_rate": 2.1285140562248998e-05, "loss": 0.0047, "step": 449 }, { "epoch": 450.0, "grad_norm": 0.007038488052785397, "learning_rate": 2.0883534136546184e-05, "loss": 0.0047, "step": 450 }, { "epoch": 451.0, "grad_norm": 0.0021306683775037527, "learning_rate": 2.0481927710843373e-05, "loss": 0.0045, "step": 451 }, { "epoch": 452.0, "grad_norm": 0.007048788480460644, "learning_rate": 2.0080321285140562e-05, "loss": 0.0047, "step": 452 }, { "epoch": 453.0, "grad_norm": 0.0006838923436589539, "learning_rate": 1.967871485943775e-05, "loss": 0.0049, "step": 453 }, { "epoch": 454.0, "grad_norm": 0.00969866942614317, "learning_rate": 1.927710843373494e-05, "loss": 0.0047, "step": 454 }, { "epoch": 455.0, "grad_norm": 0.009712344966828823, "learning_rate": 1.8875502008032127e-05, "loss": 0.0047, "step": 455 }, { "epoch": 456.0, "grad_norm": 0.009688866324722767, "learning_rate": 1.8473895582329316e-05, "loss": 0.0047, "step": 456 }, { "epoch": 457.0, "grad_norm": 0.007071357686072588, "learning_rate": 1.8072289156626505e-05, "loss": 0.0047, "step": 457 }, { "epoch": 458.0, "grad_norm": 0.014747700653970242, "learning_rate": 1.7670682730923694e-05, "loss": 0.0045, "step": 458 }, { "epoch": 459.0, "grad_norm": 0.007095323875546455, "learning_rate": 1.7269076305220887e-05, "loss": 0.0047, "step": 459 }, { "epoch": 460.0, "grad_norm": 0.0021590902470052242, "learning_rate": 1.6867469879518073e-05, "loss": 0.0045, "step": 460 }, { "epoch": 461.0, "grad_norm": 0.0006770145264454186, "learning_rate": 1.6465863453815262e-05, "loss": 0.0049, "step": 461 }, { "epoch": 462.0, "grad_norm": 0.0006719469674862921, "learning_rate": 1.606425702811245e-05, "loss": 0.0049, "step": 462 }, { "epoch": 463.0, "grad_norm": 0.009731734171509743, "learning_rate": 1.566265060240964e-05, "loss": 0.0047, "step": 463 }, { "epoch": 464.0, "grad_norm": 0.009717794135212898, "learning_rate": 1.526104417670683e-05, "loss": 0.0047, "step": 464 }, { "epoch": 465.0, "grad_norm": 0.009746178984642029, "learning_rate": 1.4859437751004016e-05, "loss": 0.0047, "step": 465 }, { "epoch": 466.0, "grad_norm": 0.007102122530341148, "learning_rate": 1.4457831325301205e-05, "loss": 0.0047, "step": 466 }, { "epoch": 467.0, "grad_norm": 0.00708032725378871, "learning_rate": 1.4056224899598394e-05, "loss": 0.0047, "step": 467 }, { "epoch": 468.0, "grad_norm": 0.007086113095283508, "learning_rate": 1.3654618473895583e-05, "loss": 0.0047, "step": 468 }, { "epoch": 469.0, "grad_norm": 0.014743141829967499, "learning_rate": 1.3253012048192772e-05, "loss": 0.0045, "step": 469 }, { "epoch": 470.0, "grad_norm": 0.0021814049687236547, "learning_rate": 1.285140562248996e-05, "loss": 0.0045, "step": 470 }, { "epoch": 471.0, "grad_norm": 0.007091291714459658, "learning_rate": 1.244979919678715e-05, "loss": 0.0047, "step": 471 }, { "epoch": 472.0, "grad_norm": 0.0071353972889482975, "learning_rate": 1.2048192771084338e-05, "loss": 0.0047, "step": 472 }, { "epoch": 473.0, "grad_norm": 0.009766222909092903, "learning_rate": 1.1646586345381528e-05, "loss": 0.0047, "step": 473 }, { "epoch": 474.0, "grad_norm": 0.017397722229361534, "learning_rate": 1.1244979919678715e-05, "loss": 0.005, "step": 474 }, { "epoch": 475.0, "grad_norm": 0.026444077491760254, "learning_rate": 1.0843373493975904e-05, "loss": 0.0048, "step": 475 }, { "epoch": 476.0, "grad_norm": 0.009789888747036457, "learning_rate": 1.0441767068273092e-05, "loss": 0.0047, "step": 476 }, { "epoch": 477.0, "grad_norm": 0.007101455237716436, "learning_rate": 1.0040160642570281e-05, "loss": 0.0047, "step": 477 }, { "epoch": 478.0, "grad_norm": 0.007093328982591629, "learning_rate": 9.63855421686747e-06, "loss": 0.0047, "step": 478 }, { "epoch": 479.0, "grad_norm": 0.014785111881792545, "learning_rate": 9.236947791164658e-06, "loss": 0.0045, "step": 479 }, { "epoch": 480.0, "grad_norm": 0.014800012111663818, "learning_rate": 8.835341365461847e-06, "loss": 0.0045, "step": 480 }, { "epoch": 481.0, "grad_norm": 0.00710406294092536, "learning_rate": 8.433734939759036e-06, "loss": 0.0047, "step": 481 }, { "epoch": 482.0, "grad_norm": 0.0163735318928957, "learning_rate": 8.032128514056226e-06, "loss": 0.005, "step": 482 }, { "epoch": 483.0, "grad_norm": 0.00218359031714499, "learning_rate": 7.630522088353415e-06, "loss": 0.0045, "step": 483 }, { "epoch": 484.0, "grad_norm": 0.009749580174684525, "learning_rate": 7.228915662650602e-06, "loss": 0.0047, "step": 484 }, { "epoch": 485.0, "grad_norm": 0.009765205904841423, "learning_rate": 6.827309236947792e-06, "loss": 0.0047, "step": 485 }, { "epoch": 486.0, "grad_norm": 0.000685015635099262, "learning_rate": 6.42570281124498e-06, "loss": 0.0049, "step": 486 }, { "epoch": 487.0, "grad_norm": 0.00976163987070322, "learning_rate": 6.024096385542169e-06, "loss": 0.0047, "step": 487 }, { "epoch": 488.0, "grad_norm": 0.0006887281779199839, "learning_rate": 5.622489959839358e-06, "loss": 0.0049, "step": 488 }, { "epoch": 489.0, "grad_norm": 0.009809976443648338, "learning_rate": 5.220883534136546e-06, "loss": 0.0047, "step": 489 }, { "epoch": 490.0, "grad_norm": 0.009795522317290306, "learning_rate": 4.819277108433735e-06, "loss": 0.0047, "step": 490 }, { "epoch": 491.0, "grad_norm": 0.009749695658683777, "learning_rate": 4.417670682730924e-06, "loss": 0.0047, "step": 491 }, { "epoch": 492.0, "grad_norm": 0.0021824862342327833, "learning_rate": 4.016064257028113e-06, "loss": 0.0045, "step": 492 }, { "epoch": 493.0, "grad_norm": 0.007140113040804863, "learning_rate": 3.614457831325301e-06, "loss": 0.0047, "step": 493 }, { "epoch": 494.0, "grad_norm": 0.007124903611838818, "learning_rate": 3.21285140562249e-06, "loss": 0.0047, "step": 494 }, { "epoch": 495.0, "grad_norm": 0.0071158865466713905, "learning_rate": 2.811244979919679e-06, "loss": 0.0047, "step": 495 }, { "epoch": 496.0, "grad_norm": 0.007119073532521725, "learning_rate": 2.4096385542168676e-06, "loss": 0.0047, "step": 496 }, { "epoch": 497.0, "grad_norm": 0.007112898863852024, "learning_rate": 2.0080321285140564e-06, "loss": 0.0047, "step": 497 }, { "epoch": 498.0, "grad_norm": 0.014866248704493046, "learning_rate": 1.606425702811245e-06, "loss": 0.0045, "step": 498 }, { "epoch": 499.0, "grad_norm": 0.007127212826162577, "learning_rate": 1.2048192771084338e-06, "loss": 0.0047, "step": 499 }, { "epoch": 500.0, "grad_norm": 0.007142535876482725, "learning_rate": 8.032128514056225e-07, "loss": 0.0047, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 500, "total_flos": 3475984143360000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }