{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9789590254706533, "eval_steps": 113, "global_step": 902, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.428594393312656, "learning_rate": 4.000000000000001e-06, "loss": 2.1469, "step": 1 }, { "epoch": 0.0, "eval_loss": 2.179504632949829, "eval_runtime": 172.7557, "eval_samples_per_second": 13.447, "eval_steps_per_second": 0.423, "step": 1 }, { "epoch": 0.0, "grad_norm": 3.7159828982580754, "learning_rate": 8.000000000000001e-06, "loss": 2.1946, "step": 2 }, { "epoch": 0.01, "grad_norm": 3.643082734610657, "learning_rate": 1.2e-05, "loss": 2.232, "step": 3 }, { "epoch": 0.01, "grad_norm": 3.75889245136616, "learning_rate": 1.6000000000000003e-05, "loss": 2.1482, "step": 4 }, { "epoch": 0.01, "grad_norm": 3.121791718587376, "learning_rate": 2e-05, "loss": 1.9094, "step": 5 }, { "epoch": 0.01, "grad_norm": 3.357880867767518, "learning_rate": 2.4e-05, "loss": 1.5871, "step": 6 }, { "epoch": 0.02, "grad_norm": 3.946129682154882, "learning_rate": 2.8000000000000003e-05, "loss": 1.326, "step": 7 }, { "epoch": 0.02, "grad_norm": 2.108395610487625, "learning_rate": 3.2000000000000005e-05, "loss": 1.0602, "step": 8 }, { "epoch": 0.02, "grad_norm": 2.130447473758986, "learning_rate": 3.6e-05, "loss": 0.962, "step": 9 }, { "epoch": 0.02, "grad_norm": 2.250625735269271, "learning_rate": 4e-05, "loss": 0.7239, "step": 10 }, { "epoch": 0.02, "grad_norm": 2.086877732413652, "learning_rate": 4.4000000000000006e-05, "loss": 0.6204, "step": 11 }, { "epoch": 0.03, "grad_norm": 1.1321791358110234, "learning_rate": 4.8e-05, "loss": 0.5483, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.8099243715089002, "learning_rate": 5.2000000000000004e-05, "loss": 0.5086, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.8079844588753853, "learning_rate": 5.6000000000000006e-05, "loss": 0.5112, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.9169759048137539, "learning_rate": 6e-05, "loss": 0.4889, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.6917931287891316, "learning_rate": 6.400000000000001e-05, "loss": 0.4691, "step": 16 }, { "epoch": 0.04, "grad_norm": 0.6294590670553853, "learning_rate": 6.800000000000001e-05, "loss": 0.4549, "step": 17 }, { "epoch": 0.04, "grad_norm": 0.6001819771732335, "learning_rate": 7.2e-05, "loss": 0.4412, "step": 18 }, { "epoch": 0.04, "grad_norm": 0.6389927422614288, "learning_rate": 7.6e-05, "loss": 0.4077, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.5185334799357709, "learning_rate": 8e-05, "loss": 0.4219, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.5274718517549238, "learning_rate": 8.4e-05, "loss": 0.4124, "step": 21 }, { "epoch": 0.05, "grad_norm": 0.5333597700699511, "learning_rate": 8.800000000000001e-05, "loss": 0.4333, "step": 22 }, { "epoch": 0.05, "grad_norm": 0.4817408646893783, "learning_rate": 9.200000000000001e-05, "loss": 0.4022, "step": 23 }, { "epoch": 0.05, "grad_norm": 0.5041871581629092, "learning_rate": 9.6e-05, "loss": 0.3971, "step": 24 }, { "epoch": 0.06, "grad_norm": 0.5892498993207921, "learning_rate": 0.0001, "loss": 0.3769, "step": 25 }, { "epoch": 0.06, "grad_norm": 0.4448471314677223, "learning_rate": 0.00010400000000000001, "loss": 0.3905, "step": 26 }, { "epoch": 0.06, "grad_norm": 0.41460492934076704, "learning_rate": 0.00010800000000000001, "loss": 0.3814, "step": 27 }, { "epoch": 0.06, "grad_norm": 0.39759977947743247, "learning_rate": 0.00011200000000000001, "loss": 0.3675, "step": 28 }, { "epoch": 0.06, "grad_norm": 0.42656614420804834, "learning_rate": 0.000116, "loss": 0.3949, "step": 29 }, { "epoch": 0.07, "grad_norm": 0.3248894659390531, "learning_rate": 0.00012, "loss": 0.3734, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.35364315198613105, "learning_rate": 0.000124, "loss": 0.3452, "step": 31 }, { "epoch": 0.07, "grad_norm": 0.3377660731261332, "learning_rate": 0.00012800000000000002, "loss": 0.365, "step": 32 }, { "epoch": 0.07, "grad_norm": 0.3029616210555195, "learning_rate": 0.000132, "loss": 0.3502, "step": 33 }, { "epoch": 0.08, "grad_norm": 0.43234275224940705, "learning_rate": 0.00013600000000000003, "loss": 0.3728, "step": 34 }, { "epoch": 0.08, "grad_norm": 0.3831666942704003, "learning_rate": 0.00014, "loss": 0.3922, "step": 35 }, { "epoch": 0.08, "grad_norm": 0.3421076533380305, "learning_rate": 0.000144, "loss": 0.3544, "step": 36 }, { "epoch": 0.08, "grad_norm": 0.30146273904800347, "learning_rate": 0.000148, "loss": 0.3237, "step": 37 }, { "epoch": 0.08, "grad_norm": 0.3234891150012619, "learning_rate": 0.000152, "loss": 0.3771, "step": 38 }, { "epoch": 0.09, "grad_norm": 0.32536194802044366, "learning_rate": 0.00015600000000000002, "loss": 0.3719, "step": 39 }, { "epoch": 0.09, "grad_norm": 0.32489341033312524, "learning_rate": 0.00016, "loss": 0.3716, "step": 40 }, { "epoch": 0.09, "grad_norm": 0.3135978131489568, "learning_rate": 0.000164, "loss": 0.3818, "step": 41 }, { "epoch": 0.09, "grad_norm": 0.28403149178513853, "learning_rate": 0.000168, "loss": 0.3576, "step": 42 }, { "epoch": 0.1, "grad_norm": 0.30747408490186307, "learning_rate": 0.000172, "loss": 0.3801, "step": 43 }, { "epoch": 0.1, "grad_norm": 0.3491185114838396, "learning_rate": 0.00017600000000000002, "loss": 0.3612, "step": 44 }, { "epoch": 0.1, "grad_norm": 0.4062709802932099, "learning_rate": 0.00018, "loss": 0.3752, "step": 45 }, { "epoch": 0.1, "grad_norm": 0.4174448098463489, "learning_rate": 0.00018400000000000003, "loss": 0.3705, "step": 46 }, { "epoch": 0.1, "grad_norm": 0.30034892483536746, "learning_rate": 0.000188, "loss": 0.3504, "step": 47 }, { "epoch": 0.11, "grad_norm": 0.24699047743823555, "learning_rate": 0.000192, "loss": 0.3622, "step": 48 }, { "epoch": 0.11, "grad_norm": 0.2944161708858025, "learning_rate": 0.000196, "loss": 0.3462, "step": 49 }, { "epoch": 0.11, "grad_norm": 0.24968532785704994, "learning_rate": 0.0002, "loss": 0.3074, "step": 50 }, { "epoch": 0.11, "grad_norm": 0.4989866478305154, "learning_rate": 0.00019999932018605637, "loss": 0.3426, "step": 51 }, { "epoch": 0.12, "grad_norm": 0.2729667152979656, "learning_rate": 0.00019999728075346845, "loss": 0.3452, "step": 52 }, { "epoch": 0.12, "grad_norm": 0.26553719135078474, "learning_rate": 0.00019999388172996495, "loss": 0.3771, "step": 53 }, { "epoch": 0.12, "grad_norm": 0.2563892875653707, "learning_rate": 0.0001999891231617599, "loss": 0.3709, "step": 54 }, { "epoch": 0.12, "grad_norm": 0.40493567306146466, "learning_rate": 0.0001999830051135521, "loss": 0.3659, "step": 55 }, { "epoch": 0.12, "grad_norm": 0.282074263507646, "learning_rate": 0.00019997552766852432, "loss": 0.3579, "step": 56 }, { "epoch": 0.13, "grad_norm": 0.24830919197340875, "learning_rate": 0.00019996669092834193, "loss": 0.3164, "step": 57 }, { "epoch": 0.13, "grad_norm": 0.31153665113975915, "learning_rate": 0.0001999564950131517, "loss": 0.3554, "step": 58 }, { "epoch": 0.13, "grad_norm": 0.2883397422934534, "learning_rate": 0.00019994494006158017, "loss": 0.3318, "step": 59 }, { "epoch": 0.13, "grad_norm": 0.29607763624002476, "learning_rate": 0.00019993202623073172, "loss": 0.3515, "step": 60 }, { "epoch": 0.14, "grad_norm": 0.26631042457457627, "learning_rate": 0.0001999177536961863, "loss": 0.3404, "step": 61 }, { "epoch": 0.14, "grad_norm": 0.30709845993919116, "learning_rate": 0.00019990212265199738, "loss": 0.3454, "step": 62 }, { "epoch": 0.14, "grad_norm": 0.2689959836388791, "learning_rate": 0.0001998851333106889, "loss": 0.3448, "step": 63 }, { "epoch": 0.14, "grad_norm": 0.28079893446027976, "learning_rate": 0.00019986678590325273, "loss": 0.3537, "step": 64 }, { "epoch": 0.14, "grad_norm": 0.2777161046281277, "learning_rate": 0.00019984708067914532, "loss": 0.3512, "step": 65 }, { "epoch": 0.15, "grad_norm": 0.2612936496089595, "learning_rate": 0.0001998260179062844, "loss": 0.3355, "step": 66 }, { "epoch": 0.15, "grad_norm": 0.270747618988818, "learning_rate": 0.0001998035978710453, "loss": 0.3557, "step": 67 }, { "epoch": 0.15, "grad_norm": 0.2524425572972277, "learning_rate": 0.00019977982087825713, "loss": 0.324, "step": 68 }, { "epoch": 0.15, "grad_norm": 0.2655824178335376, "learning_rate": 0.00019975468725119843, "loss": 0.3274, "step": 69 }, { "epoch": 0.16, "grad_norm": 0.29931436509996834, "learning_rate": 0.000199728197331593, "loss": 0.3733, "step": 70 }, { "epoch": 0.16, "grad_norm": 0.2945779980864737, "learning_rate": 0.00019970035147960524, "loss": 0.3161, "step": 71 }, { "epoch": 0.16, "grad_norm": 0.42061603746317366, "learning_rate": 0.00019967115007383507, "loss": 0.3486, "step": 72 }, { "epoch": 0.16, "grad_norm": 0.24899746628510128, "learning_rate": 0.000199640593511313, "loss": 0.3428, "step": 73 }, { "epoch": 0.16, "grad_norm": 0.2748778960772334, "learning_rate": 0.00019960868220749448, "loss": 0.3215, "step": 74 }, { "epoch": 0.17, "grad_norm": 0.2826057447583215, "learning_rate": 0.00019957541659625458, "loss": 0.3663, "step": 75 }, { "epoch": 0.17, "grad_norm": 0.26758814883425874, "learning_rate": 0.00019954079712988183, "loss": 0.3473, "step": 76 }, { "epoch": 0.17, "grad_norm": 0.26191530390935236, "learning_rate": 0.00019950482427907211, "loss": 0.3464, "step": 77 }, { "epoch": 0.17, "grad_norm": 0.27009995213133564, "learning_rate": 0.00019946749853292232, "loss": 0.3427, "step": 78 }, { "epoch": 0.17, "grad_norm": 0.2828252836889427, "learning_rate": 0.00019942882039892377, "loss": 0.3369, "step": 79 }, { "epoch": 0.18, "grad_norm": 0.28359570095347486, "learning_rate": 0.00019938879040295508, "loss": 0.3474, "step": 80 }, { "epoch": 0.18, "grad_norm": 0.23726568176198512, "learning_rate": 0.0001993474090892753, "loss": 0.3348, "step": 81 }, { "epoch": 0.18, "grad_norm": 0.28359002617947426, "learning_rate": 0.00019930467702051628, "loss": 0.3434, "step": 82 }, { "epoch": 0.18, "grad_norm": 0.24507300294017476, "learning_rate": 0.0001992605947776752, "loss": 0.3211, "step": 83 }, { "epoch": 0.19, "grad_norm": 0.2459171433488338, "learning_rate": 0.00019921516296010644, "loss": 0.3539, "step": 84 }, { "epoch": 0.19, "grad_norm": 0.23982609581354497, "learning_rate": 0.0001991683821855137, "loss": 0.3367, "step": 85 }, { "epoch": 0.19, "grad_norm": 0.24948097532302946, "learning_rate": 0.00019912025308994148, "loss": 0.3313, "step": 86 }, { "epoch": 0.19, "grad_norm": 0.25673241215001325, "learning_rate": 0.00019907077632776632, "loss": 0.3384, "step": 87 }, { "epoch": 0.19, "grad_norm": 0.2248520328170477, "learning_rate": 0.00019901995257168807, "loss": 0.3075, "step": 88 }, { "epoch": 0.2, "grad_norm": 0.2785744560779341, "learning_rate": 0.00019896778251272078, "loss": 0.3505, "step": 89 }, { "epoch": 0.2, "grad_norm": 0.2612264615045669, "learning_rate": 0.00019891426686018305, "loss": 0.3319, "step": 90 }, { "epoch": 0.2, "grad_norm": 0.2588107693763323, "learning_rate": 0.00019885940634168864, "loss": 0.3036, "step": 91 }, { "epoch": 0.2, "grad_norm": 0.2539991795599821, "learning_rate": 0.0001988032017031364, "loss": 0.3275, "step": 92 }, { "epoch": 0.21, "grad_norm": 0.2228639788098274, "learning_rate": 0.00019874565370870038, "loss": 0.3128, "step": 93 }, { "epoch": 0.21, "grad_norm": 0.2295330254763288, "learning_rate": 0.00019868676314081904, "loss": 0.3226, "step": 94 }, { "epoch": 0.21, "grad_norm": 0.26774082666954935, "learning_rate": 0.00019862653080018506, "loss": 0.3438, "step": 95 }, { "epoch": 0.21, "grad_norm": 0.2096704558396889, "learning_rate": 0.0001985649575057341, "loss": 0.3158, "step": 96 }, { "epoch": 0.21, "grad_norm": 0.222343383957648, "learning_rate": 0.00019850204409463385, "loss": 0.3127, "step": 97 }, { "epoch": 0.22, "grad_norm": 0.24355910663526567, "learning_rate": 0.00019843779142227256, "loss": 0.3278, "step": 98 }, { "epoch": 0.22, "grad_norm": 0.2589007730313338, "learning_rate": 0.00019837220036224756, "loss": 0.3433, "step": 99 }, { "epoch": 0.22, "grad_norm": 0.2048208439355147, "learning_rate": 0.00019830527180635308, "loss": 0.3038, "step": 100 }, { "epoch": 0.22, "grad_norm": 0.22100911829011585, "learning_rate": 0.00019823700666456853, "loss": 0.3295, "step": 101 }, { "epoch": 0.23, "grad_norm": 0.2865876628348663, "learning_rate": 0.0001981674058650458, "loss": 0.3374, "step": 102 }, { "epoch": 0.23, "grad_norm": 0.23267706317431597, "learning_rate": 0.00019809647035409672, "loss": 0.3146, "step": 103 }, { "epoch": 0.23, "grad_norm": 0.22896912398059824, "learning_rate": 0.0001980242010961803, "loss": 0.3302, "step": 104 }, { "epoch": 0.23, "grad_norm": 0.21199810980682918, "learning_rate": 0.00019795059907388952, "loss": 0.3089, "step": 105 }, { "epoch": 0.23, "grad_norm": 0.2314314536279127, "learning_rate": 0.00019787566528793807, "loss": 0.3219, "step": 106 }, { "epoch": 0.24, "grad_norm": 0.22578014749699987, "learning_rate": 0.00019779940075714648, "loss": 0.3089, "step": 107 }, { "epoch": 0.24, "grad_norm": 0.26739038717214403, "learning_rate": 0.0001977218065184287, "loss": 0.3314, "step": 108 }, { "epoch": 0.24, "grad_norm": 0.27640406493967007, "learning_rate": 0.00019764288362677753, "loss": 0.315, "step": 109 }, { "epoch": 0.24, "grad_norm": 0.275170691575301, "learning_rate": 0.0001975626331552507, "loss": 0.3192, "step": 110 }, { "epoch": 0.25, "grad_norm": 0.28590415176453904, "learning_rate": 0.00019748105619495594, "loss": 0.3235, "step": 111 }, { "epoch": 0.25, "grad_norm": 0.26618886451702856, "learning_rate": 0.0001973981538550364, "loss": 0.3235, "step": 112 }, { "epoch": 0.25, "grad_norm": 0.22783361042781863, "learning_rate": 0.00019731392726265537, "loss": 0.3319, "step": 113 }, { "epoch": 0.25, "eval_loss": 0.3324408233165741, "eval_runtime": 173.7614, "eval_samples_per_second": 13.369, "eval_steps_per_second": 0.42, "step": 113 }, { "epoch": 0.25, "grad_norm": 0.25003343596979083, "learning_rate": 0.00019722837756298113, "loss": 0.3269, "step": 114 }, { "epoch": 0.25, "grad_norm": 0.2051643008869916, "learning_rate": 0.0001971415059191712, "loss": 0.3331, "step": 115 }, { "epoch": 0.26, "grad_norm": 0.26679577582874325, "learning_rate": 0.00019705331351235674, "loss": 0.319, "step": 116 }, { "epoch": 0.26, "grad_norm": 0.22959410656566043, "learning_rate": 0.0001969638015416263, "loss": 0.326, "step": 117 }, { "epoch": 0.26, "grad_norm": 0.24732040292675736, "learning_rate": 0.0001968729712240095, "loss": 0.3456, "step": 118 }, { "epoch": 0.26, "grad_norm": 0.21472772586501115, "learning_rate": 0.00019678082379446078, "loss": 0.3154, "step": 119 }, { "epoch": 0.27, "grad_norm": 0.22004349633491344, "learning_rate": 0.00019668736050584224, "loss": 0.3296, "step": 120 }, { "epoch": 0.27, "grad_norm": 0.20481829759367912, "learning_rate": 0.00019659258262890683, "loss": 0.3168, "step": 121 }, { "epoch": 0.27, "grad_norm": 0.22596082871640896, "learning_rate": 0.00019649649145228102, "loss": 0.3198, "step": 122 }, { "epoch": 0.27, "grad_norm": 0.22416198789548378, "learning_rate": 0.00019639908828244718, "loss": 0.3122, "step": 123 }, { "epoch": 0.27, "grad_norm": 0.25019709447245486, "learning_rate": 0.000196300374443726, "loss": 0.3256, "step": 124 }, { "epoch": 0.28, "grad_norm": 0.23470807250735432, "learning_rate": 0.0001962003512782584, "loss": 0.3245, "step": 125 }, { "epoch": 0.28, "grad_norm": 0.24693935678122386, "learning_rate": 0.00019609902014598718, "loss": 0.3292, "step": 126 }, { "epoch": 0.28, "grad_norm": 0.2260534058582815, "learning_rate": 0.00019599638242463868, "loss": 0.325, "step": 127 }, { "epoch": 0.28, "grad_norm": 0.23667285933893983, "learning_rate": 0.00019589243950970402, "loss": 0.3337, "step": 128 }, { "epoch": 0.29, "grad_norm": 0.21298574370036155, "learning_rate": 0.00019578719281442003, "loss": 0.3092, "step": 129 }, { "epoch": 0.29, "grad_norm": 0.23372716379048458, "learning_rate": 0.00019568064376975012, "loss": 0.317, "step": 130 }, { "epoch": 0.29, "grad_norm": 0.22334413345013637, "learning_rate": 0.0001955727938243648, "loss": 0.2834, "step": 131 }, { "epoch": 0.29, "grad_norm": 0.2248381817587477, "learning_rate": 0.00019546364444462207, "loss": 0.302, "step": 132 }, { "epoch": 0.29, "grad_norm": 0.2360215129073668, "learning_rate": 0.00019535319711454728, "loss": 0.3293, "step": 133 }, { "epoch": 0.3, "grad_norm": 0.22623309491159815, "learning_rate": 0.00019524145333581317, "loss": 0.3273, "step": 134 }, { "epoch": 0.3, "grad_norm": 0.2070555672669519, "learning_rate": 0.00019512841462771924, "loss": 0.2972, "step": 135 }, { "epoch": 0.3, "grad_norm": 0.22460201914808312, "learning_rate": 0.00019501408252717138, "loss": 0.3019, "step": 136 }, { "epoch": 0.3, "grad_norm": 0.2225333175600629, "learning_rate": 0.00019489845858866066, "loss": 0.2983, "step": 137 }, { "epoch": 0.31, "grad_norm": 0.21017634852282913, "learning_rate": 0.0001947815443842424, "loss": 0.3002, "step": 138 }, { "epoch": 0.31, "grad_norm": 0.24382065213677345, "learning_rate": 0.00019466334150351476, "loss": 0.3061, "step": 139 }, { "epoch": 0.31, "grad_norm": 0.22222229848717998, "learning_rate": 0.00019454385155359702, "loss": 0.3189, "step": 140 }, { "epoch": 0.31, "grad_norm": 0.21144991906498956, "learning_rate": 0.00019442307615910793, "loss": 0.3093, "step": 141 }, { "epoch": 0.31, "grad_norm": 0.20966684015092005, "learning_rate": 0.00019430101696214336, "loss": 0.2975, "step": 142 }, { "epoch": 0.32, "grad_norm": 0.2285163015371888, "learning_rate": 0.0001941776756222542, "loss": 0.3291, "step": 143 }, { "epoch": 0.32, "grad_norm": 0.22173438888731478, "learning_rate": 0.00019405305381642375, "loss": 0.3052, "step": 144 }, { "epoch": 0.32, "grad_norm": 0.23472615797287633, "learning_rate": 0.00019392715323904481, "loss": 0.3158, "step": 145 }, { "epoch": 0.32, "grad_norm": 0.19206597044565454, "learning_rate": 0.00019379997560189675, "loss": 0.3047, "step": 146 }, { "epoch": 0.33, "grad_norm": 0.21928888599234125, "learning_rate": 0.00019367152263412217, "loss": 0.3196, "step": 147 }, { "epoch": 0.33, "grad_norm": 0.21469742386937682, "learning_rate": 0.00019354179608220348, "loss": 0.2981, "step": 148 }, { "epoch": 0.33, "grad_norm": 0.39037984784670976, "learning_rate": 0.000193410797709939, "loss": 0.2962, "step": 149 }, { "epoch": 0.33, "grad_norm": 0.3224683875107671, "learning_rate": 0.00019327852929841916, "loss": 0.3268, "step": 150 }, { "epoch": 0.33, "grad_norm": 0.22377973835731155, "learning_rate": 0.0001931449926460022, "loss": 0.3093, "step": 151 }, { "epoch": 0.34, "grad_norm": 0.21525051451055552, "learning_rate": 0.00019301018956828964, "loss": 0.3173, "step": 152 }, { "epoch": 0.34, "grad_norm": 0.19693490433987484, "learning_rate": 0.00019287412189810172, "loss": 0.3069, "step": 153 }, { "epoch": 0.34, "grad_norm": 0.1985719221741534, "learning_rate": 0.00019273679148545245, "loss": 0.3239, "step": 154 }, { "epoch": 0.34, "grad_norm": 0.19043619193681682, "learning_rate": 0.00019259820019752443, "loss": 0.3041, "step": 155 }, { "epoch": 0.35, "grad_norm": 0.2029259146228578, "learning_rate": 0.0001924583499186434, "loss": 0.3049, "step": 156 }, { "epoch": 0.35, "grad_norm": 0.1913024173068621, "learning_rate": 0.00019231724255025284, "loss": 0.3263, "step": 157 }, { "epoch": 0.35, "grad_norm": 0.20418535912509125, "learning_rate": 0.00019217488001088784, "loss": 0.3093, "step": 158 }, { "epoch": 0.35, "grad_norm": 0.21506493193668452, "learning_rate": 0.00019203126423614916, "loss": 0.3386, "step": 159 }, { "epoch": 0.35, "grad_norm": 0.2078781708947467, "learning_rate": 0.00019188639717867696, "loss": 0.3098, "step": 160 }, { "epoch": 0.36, "grad_norm": 0.21203565844547, "learning_rate": 0.00019174028080812415, "loss": 0.3245, "step": 161 }, { "epoch": 0.36, "grad_norm": 0.22944231505967116, "learning_rate": 0.0001915929171111296, "loss": 0.3174, "step": 162 }, { "epoch": 0.36, "grad_norm": 0.19389795184150505, "learning_rate": 0.00019144430809129128, "loss": 0.2985, "step": 163 }, { "epoch": 0.36, "grad_norm": 0.2294568217196969, "learning_rate": 0.00019129445576913888, "loss": 0.2916, "step": 164 }, { "epoch": 0.37, "grad_norm": 0.21471312626755512, "learning_rate": 0.00019114336218210634, "loss": 0.3203, "step": 165 }, { "epoch": 0.37, "grad_norm": 0.200256096520887, "learning_rate": 0.00019099102938450416, "loss": 0.314, "step": 166 }, { "epoch": 0.37, "grad_norm": 0.20286230376549838, "learning_rate": 0.00019083745944749162, "loss": 0.2953, "step": 167 }, { "epoch": 0.37, "grad_norm": 0.22752630009694177, "learning_rate": 0.00019068265445904836, "loss": 0.3098, "step": 168 }, { "epoch": 0.37, "grad_norm": 0.1969764402722988, "learning_rate": 0.00019052661652394618, "loss": 0.2798, "step": 169 }, { "epoch": 0.38, "grad_norm": 0.21419927863312113, "learning_rate": 0.0001903693477637204, "loss": 0.315, "step": 170 }, { "epoch": 0.38, "grad_norm": 0.1978129640154948, "learning_rate": 0.00019021085031664087, "loss": 0.2769, "step": 171 }, { "epoch": 0.38, "grad_norm": 0.1832605646042805, "learning_rate": 0.00019005112633768313, "loss": 0.2787, "step": 172 }, { "epoch": 0.38, "grad_norm": 0.25130865190581947, "learning_rate": 0.00018989017799849896, "loss": 0.3042, "step": 173 }, { "epoch": 0.39, "grad_norm": 0.22333188194958617, "learning_rate": 0.0001897280074873868, "loss": 0.3366, "step": 174 }, { "epoch": 0.39, "grad_norm": 0.23303955610127655, "learning_rate": 0.00018956461700926215, "loss": 0.3069, "step": 175 }, { "epoch": 0.39, "grad_norm": 0.21946538873928723, "learning_rate": 0.00018940000878562758, "loss": 0.3026, "step": 176 }, { "epoch": 0.39, "grad_norm": 0.22034016868322132, "learning_rate": 0.00018923418505454237, "loss": 0.3031, "step": 177 }, { "epoch": 0.39, "grad_norm": 0.1918273691770089, "learning_rate": 0.00018906714807059218, "loss": 0.287, "step": 178 }, { "epoch": 0.4, "grad_norm": 0.1984010069624954, "learning_rate": 0.00018889890010485847, "loss": 0.3039, "step": 179 }, { "epoch": 0.4, "grad_norm": 0.19139523377149348, "learning_rate": 0.00018872944344488747, "loss": 0.3152, "step": 180 }, { "epoch": 0.4, "grad_norm": 0.18841860181390324, "learning_rate": 0.0001885587803946592, "loss": 0.3171, "step": 181 }, { "epoch": 0.4, "grad_norm": 0.19736334769102976, "learning_rate": 0.0001883869132745561, "loss": 0.2851, "step": 182 }, { "epoch": 0.41, "grad_norm": 0.19610383893082234, "learning_rate": 0.00018821384442133145, "loss": 0.307, "step": 183 }, { "epoch": 0.41, "grad_norm": 0.19693236953217128, "learning_rate": 0.00018803957618807764, "loss": 0.3219, "step": 184 }, { "epoch": 0.41, "grad_norm": 0.1978461815572098, "learning_rate": 0.0001878641109441942, "loss": 0.2936, "step": 185 }, { "epoch": 0.41, "grad_norm": 0.22453262012783057, "learning_rate": 0.00018768745107535542, "loss": 0.3225, "step": 186 }, { "epoch": 0.41, "grad_norm": 0.1951859636493649, "learning_rate": 0.00018750959898347825, "loss": 0.2892, "step": 187 }, { "epoch": 0.42, "grad_norm": 0.19051492201816908, "learning_rate": 0.00018733055708668926, "loss": 0.2922, "step": 188 }, { "epoch": 0.42, "grad_norm": 0.19631810708829814, "learning_rate": 0.00018715032781929208, "loss": 0.2928, "step": 189 }, { "epoch": 0.42, "grad_norm": 0.20533375929006709, "learning_rate": 0.00018696891363173405, "loss": 0.3212, "step": 190 }, { "epoch": 0.42, "grad_norm": 0.21698315667726276, "learning_rate": 0.00018678631699057302, "loss": 0.3419, "step": 191 }, { "epoch": 0.43, "grad_norm": 0.23742145444149265, "learning_rate": 0.00018660254037844388, "loss": 0.3081, "step": 192 }, { "epoch": 0.43, "grad_norm": 0.22429285552735956, "learning_rate": 0.00018641758629402467, "loss": 0.3132, "step": 193 }, { "epoch": 0.43, "grad_norm": 0.20957107721564286, "learning_rate": 0.00018623145725200278, "loss": 0.3176, "step": 194 }, { "epoch": 0.43, "grad_norm": 0.23119517784848204, "learning_rate": 0.0001860441557830405, "loss": 0.3174, "step": 195 }, { "epoch": 0.43, "grad_norm": 0.22081382473360098, "learning_rate": 0.00018585568443374087, "loss": 0.3029, "step": 196 }, { "epoch": 0.44, "grad_norm": 0.20857278454177905, "learning_rate": 0.00018566604576661288, "loss": 0.2803, "step": 197 }, { "epoch": 0.44, "grad_norm": 0.20277792365487685, "learning_rate": 0.00018547524236003674, "loss": 0.3032, "step": 198 }, { "epoch": 0.44, "grad_norm": 0.22068595819466755, "learning_rate": 0.0001852832768082288, "loss": 0.3196, "step": 199 }, { "epoch": 0.44, "grad_norm": 0.22826508701764245, "learning_rate": 0.00018509015172120621, "loss": 0.307, "step": 200 }, { "epoch": 0.45, "grad_norm": 0.23318100222090005, "learning_rate": 0.00018489586972475155, "loss": 0.3243, "step": 201 }, { "epoch": 0.45, "grad_norm": 0.20790305208366794, "learning_rate": 0.00018470043346037698, "loss": 0.3026, "step": 202 }, { "epoch": 0.45, "grad_norm": 0.23045901908374763, "learning_rate": 0.00018450384558528845, "loss": 0.3215, "step": 203 }, { "epoch": 0.45, "grad_norm": 0.1895969081217627, "learning_rate": 0.0001843061087723496, "loss": 0.2827, "step": 204 }, { "epoch": 0.45, "grad_norm": 0.21471234551245583, "learning_rate": 0.00018410722571004522, "loss": 0.2758, "step": 205 }, { "epoch": 0.46, "grad_norm": 0.19174460263968052, "learning_rate": 0.00018390719910244487, "loss": 0.2935, "step": 206 }, { "epoch": 0.46, "grad_norm": 0.2042082548764658, "learning_rate": 0.00018370603166916616, "loss": 0.3219, "step": 207 }, { "epoch": 0.46, "grad_norm": 0.20862412820843024, "learning_rate": 0.00018350372614533753, "loss": 0.3079, "step": 208 }, { "epoch": 0.46, "grad_norm": 0.19365588486473745, "learning_rate": 0.00018330028528156138, "loss": 0.2878, "step": 209 }, { "epoch": 0.47, "grad_norm": 0.20923759592087007, "learning_rate": 0.0001830957118438764, "loss": 0.3052, "step": 210 }, { "epoch": 0.47, "grad_norm": 0.19346679246935292, "learning_rate": 0.00018289000861372007, "loss": 0.2873, "step": 211 }, { "epoch": 0.47, "grad_norm": 0.20707125547144337, "learning_rate": 0.00018268317838789088, "loss": 0.2888, "step": 212 }, { "epoch": 0.47, "grad_norm": 0.210272509030541, "learning_rate": 0.00018247522397851028, "loss": 0.3063, "step": 213 }, { "epoch": 0.47, "grad_norm": 0.22635533463893145, "learning_rate": 0.0001822661482129844, "loss": 0.2844, "step": 214 }, { "epoch": 0.48, "grad_norm": 0.20691107915045812, "learning_rate": 0.00018205595393396568, "loss": 0.2986, "step": 215 }, { "epoch": 0.48, "grad_norm": 0.21549907787627992, "learning_rate": 0.00018184464399931412, "loss": 0.3098, "step": 216 }, { "epoch": 0.48, "grad_norm": 0.20061134281496176, "learning_rate": 0.00018163222128205853, "loss": 0.2871, "step": 217 }, { "epoch": 0.48, "grad_norm": 0.20502352214692726, "learning_rate": 0.00018141868867035745, "loss": 0.294, "step": 218 }, { "epoch": 0.49, "grad_norm": 0.2291358128642933, "learning_rate": 0.00018120404906745973, "loss": 0.2757, "step": 219 }, { "epoch": 0.49, "grad_norm": 0.1972835362678214, "learning_rate": 0.00018098830539166536, "loss": 0.3084, "step": 220 }, { "epoch": 0.49, "grad_norm": 0.21611715417590088, "learning_rate": 0.00018077146057628545, "loss": 0.2816, "step": 221 }, { "epoch": 0.49, "grad_norm": 0.21599858891116713, "learning_rate": 0.00018055351756960262, "loss": 0.3085, "step": 222 }, { "epoch": 0.49, "grad_norm": 0.19318954639097616, "learning_rate": 0.00018033447933483076, "loss": 0.2557, "step": 223 }, { "epoch": 0.5, "grad_norm": 0.18823294633329682, "learning_rate": 0.00018011434885007482, "loss": 0.2902, "step": 224 }, { "epoch": 0.5, "grad_norm": 0.21452443929587184, "learning_rate": 0.00017989312910829023, "loss": 0.311, "step": 225 }, { "epoch": 0.5, "grad_norm": 0.20666447125090429, "learning_rate": 0.00017967082311724227, "loss": 0.2883, "step": 226 }, { "epoch": 0.5, "eval_loss": 0.297645628452301, "eval_runtime": 173.6984, "eval_samples_per_second": 13.374, "eval_steps_per_second": 0.42, "step": 226 }, { "epoch": 0.5, "grad_norm": 0.21336924271339083, "learning_rate": 0.00017944743389946524, "loss": 0.3026, "step": 227 }, { "epoch": 0.5, "grad_norm": 0.20792583408257218, "learning_rate": 0.0001792229644922212, "loss": 0.2843, "step": 228 }, { "epoch": 0.51, "grad_norm": 0.2129371399765845, "learning_rate": 0.0001789974179474588, "loss": 0.3091, "step": 229 }, { "epoch": 0.51, "grad_norm": 0.19527091537284477, "learning_rate": 0.00017877079733177184, "loss": 0.297, "step": 230 }, { "epoch": 0.51, "grad_norm": 0.20516345655708346, "learning_rate": 0.00017854310572635733, "loss": 0.2935, "step": 231 }, { "epoch": 0.51, "grad_norm": 0.1969309791573733, "learning_rate": 0.00017831434622697385, "loss": 0.2898, "step": 232 }, { "epoch": 0.52, "grad_norm": 0.2696175229862278, "learning_rate": 0.0001780845219438994, "loss": 0.2924, "step": 233 }, { "epoch": 0.52, "grad_norm": 0.234973041162424, "learning_rate": 0.00017785363600188894, "loss": 0.3179, "step": 234 }, { "epoch": 0.52, "grad_norm": 0.23939104068785497, "learning_rate": 0.00017762169154013216, "loss": 0.2796, "step": 235 }, { "epoch": 0.52, "grad_norm": 0.20378572036784126, "learning_rate": 0.00017738869171221068, "loss": 0.2784, "step": 236 }, { "epoch": 0.52, "grad_norm": 0.19159802735904477, "learning_rate": 0.0001771546396860551, "loss": 0.2834, "step": 237 }, { "epoch": 0.53, "grad_norm": 0.23302725481391803, "learning_rate": 0.00017691953864390207, "loss": 0.2997, "step": 238 }, { "epoch": 0.53, "grad_norm": 0.23471887316578038, "learning_rate": 0.0001766833917822509, "loss": 0.3, "step": 239 }, { "epoch": 0.53, "grad_norm": 0.1796225243725005, "learning_rate": 0.00017644620231182015, "loss": 0.2901, "step": 240 }, { "epoch": 0.53, "grad_norm": 0.20037902218428075, "learning_rate": 0.00017620797345750403, "loss": 0.294, "step": 241 }, { "epoch": 0.54, "grad_norm": 0.20415485179217374, "learning_rate": 0.0001759687084583285, "loss": 0.3162, "step": 242 }, { "epoch": 0.54, "grad_norm": 0.18182779738483645, "learning_rate": 0.00017572841056740722, "loss": 0.275, "step": 243 }, { "epoch": 0.54, "grad_norm": 0.17275776985467062, "learning_rate": 0.00017548708305189722, "loss": 0.2592, "step": 244 }, { "epoch": 0.54, "grad_norm": 0.18388792575433974, "learning_rate": 0.00017524472919295487, "loss": 0.2998, "step": 245 }, { "epoch": 0.54, "grad_norm": 0.1821375376929017, "learning_rate": 0.00017500135228569068, "loss": 0.2586, "step": 246 }, { "epoch": 0.55, "grad_norm": 0.18937126829543396, "learning_rate": 0.00017475695563912505, "loss": 0.2858, "step": 247 }, { "epoch": 0.55, "grad_norm": 0.1833938565560613, "learning_rate": 0.00017451154257614287, "loss": 0.271, "step": 248 }, { "epoch": 0.55, "grad_norm": 0.20382927797400074, "learning_rate": 0.0001742651164334486, "loss": 0.2931, "step": 249 }, { "epoch": 0.55, "grad_norm": 0.23510651870506527, "learning_rate": 0.00017401768056152085, "loss": 0.3078, "step": 250 }, { "epoch": 0.56, "grad_norm": 0.2191619740250893, "learning_rate": 0.00017376923832456665, "loss": 0.3111, "step": 251 }, { "epoch": 0.56, "grad_norm": 0.20648235757385622, "learning_rate": 0.00017351979310047602, "loss": 0.2816, "step": 252 }, { "epoch": 0.56, "grad_norm": 0.19445561313531604, "learning_rate": 0.00017326934828077573, "loss": 0.2894, "step": 253 }, { "epoch": 0.56, "grad_norm": 0.19169798824869538, "learning_rate": 0.00017301790727058345, "loss": 0.2802, "step": 254 }, { "epoch": 0.56, "grad_norm": 0.2006703950203184, "learning_rate": 0.0001727654734885612, "loss": 0.2749, "step": 255 }, { "epoch": 0.57, "grad_norm": 0.21061482297381626, "learning_rate": 0.0001725120503668691, "loss": 0.3042, "step": 256 }, { "epoch": 0.57, "grad_norm": 0.20740853426573325, "learning_rate": 0.00017225764135111868, "loss": 0.3025, "step": 257 }, { "epoch": 0.57, "grad_norm": 0.1945016623980029, "learning_rate": 0.00017200224990032576, "loss": 0.2964, "step": 258 }, { "epoch": 0.57, "grad_norm": 0.21095668740779128, "learning_rate": 0.00017174587948686374, "loss": 0.3047, "step": 259 }, { "epoch": 0.58, "grad_norm": 0.18724414137242903, "learning_rate": 0.00017148853359641626, "loss": 0.2678, "step": 260 }, { "epoch": 0.58, "grad_norm": 0.18247589724576602, "learning_rate": 0.00017123021572792982, "loss": 0.2796, "step": 261 }, { "epoch": 0.58, "grad_norm": 0.20187877784359254, "learning_rate": 0.00017097092939356623, "loss": 0.2819, "step": 262 }, { "epoch": 0.58, "grad_norm": 0.21960455156021205, "learning_rate": 0.00017071067811865476, "loss": 0.2849, "step": 263 }, { "epoch": 0.58, "grad_norm": 0.21920804195394356, "learning_rate": 0.00017044946544164433, "loss": 0.286, "step": 264 }, { "epoch": 0.59, "grad_norm": 0.2153094965416609, "learning_rate": 0.00017018729491405536, "loss": 0.2728, "step": 265 }, { "epoch": 0.59, "grad_norm": 0.18899974451413623, "learning_rate": 0.00016992417010043142, "loss": 0.2643, "step": 266 }, { "epoch": 0.59, "grad_norm": 0.20452371771386804, "learning_rate": 0.00016966009457829086, "loss": 0.2805, "step": 267 }, { "epoch": 0.59, "grad_norm": 0.19546419928152936, "learning_rate": 0.0001693950719380782, "loss": 0.2749, "step": 268 }, { "epoch": 0.6, "grad_norm": 0.19627215184487343, "learning_rate": 0.00016912910578311503, "loss": 0.273, "step": 269 }, { "epoch": 0.6, "grad_norm": 0.18828419766156645, "learning_rate": 0.00016886219972955146, "loss": 0.273, "step": 270 }, { "epoch": 0.6, "grad_norm": 0.19027059707604657, "learning_rate": 0.00016859435740631658, "loss": 0.3046, "step": 271 }, { "epoch": 0.6, "grad_norm": 0.18426233664144975, "learning_rate": 0.00016832558245506935, "loss": 0.2643, "step": 272 }, { "epoch": 0.6, "grad_norm": 0.18658729778193736, "learning_rate": 0.00016805587853014895, "loss": 0.285, "step": 273 }, { "epoch": 0.61, "grad_norm": 0.1796032355554721, "learning_rate": 0.00016778524929852512, "loss": 0.261, "step": 274 }, { "epoch": 0.61, "grad_norm": 0.19790847765561645, "learning_rate": 0.0001675136984397484, "loss": 0.3036, "step": 275 }, { "epoch": 0.61, "grad_norm": 0.2013507251688789, "learning_rate": 0.0001672412296459, "loss": 0.2929, "step": 276 }, { "epoch": 0.61, "grad_norm": 0.20727255991158675, "learning_rate": 0.00016696784662154163, "loss": 0.28, "step": 277 }, { "epoch": 0.62, "grad_norm": 0.22175173491924988, "learning_rate": 0.0001666935530836651, "loss": 0.2953, "step": 278 }, { "epoch": 0.62, "grad_norm": 0.2110368794764076, "learning_rate": 0.00016641835276164183, "loss": 0.3012, "step": 279 }, { "epoch": 0.62, "grad_norm": 0.1839097327175649, "learning_rate": 0.00016614224939717217, "loss": 0.2985, "step": 280 }, { "epoch": 0.62, "grad_norm": 0.18075701410157072, "learning_rate": 0.00016586524674423446, "loss": 0.2614, "step": 281 }, { "epoch": 0.62, "grad_norm": 0.19006328055997038, "learning_rate": 0.00016558734856903404, "loss": 0.2741, "step": 282 }, { "epoch": 0.63, "grad_norm": 0.1936501426398552, "learning_rate": 0.00016530855864995195, "loss": 0.2486, "step": 283 }, { "epoch": 0.63, "grad_norm": 0.2146360520459362, "learning_rate": 0.0001650288807774937, "loss": 0.2949, "step": 284 }, { "epoch": 0.63, "grad_norm": 0.1934454081094374, "learning_rate": 0.00016474831875423767, "loss": 0.25, "step": 285 }, { "epoch": 0.63, "grad_norm": 0.20138529225313043, "learning_rate": 0.0001644668763947833, "loss": 0.2826, "step": 286 }, { "epoch": 0.64, "grad_norm": 0.20336180418412392, "learning_rate": 0.00016418455752569943, "loss": 0.281, "step": 287 }, { "epoch": 0.64, "grad_norm": 0.2366660122305481, "learning_rate": 0.00016390136598547217, "loss": 0.2665, "step": 288 }, { "epoch": 0.64, "grad_norm": 0.18478597928089585, "learning_rate": 0.00016361730562445263, "loss": 0.3022, "step": 289 }, { "epoch": 0.64, "grad_norm": 0.18679948920602824, "learning_rate": 0.0001633323803048047, "loss": 0.2844, "step": 290 }, { "epoch": 0.64, "grad_norm": 0.19949957744833288, "learning_rate": 0.00016304659390045252, "loss": 0.2912, "step": 291 }, { "epoch": 0.65, "grad_norm": 0.20044341182168152, "learning_rate": 0.0001627599502970277, "loss": 0.2729, "step": 292 }, { "epoch": 0.65, "grad_norm": 0.18558206726419454, "learning_rate": 0.00016247245339181662, "loss": 0.2693, "step": 293 }, { "epoch": 0.65, "grad_norm": 0.2083655759577377, "learning_rate": 0.00016218410709370736, "loss": 0.3022, "step": 294 }, { "epoch": 0.65, "grad_norm": 0.21456219690969847, "learning_rate": 0.00016189491532313664, "loss": 0.2933, "step": 295 }, { "epoch": 0.66, "grad_norm": 0.2316084982608295, "learning_rate": 0.00016160488201203644, "loss": 0.2631, "step": 296 }, { "epoch": 0.66, "grad_norm": 0.20943766303868608, "learning_rate": 0.00016131401110378043, "loss": 0.2847, "step": 297 }, { "epoch": 0.66, "grad_norm": 0.2346281512353159, "learning_rate": 0.00016102230655313076, "loss": 0.2898, "step": 298 }, { "epoch": 0.66, "grad_norm": 0.18690479698056445, "learning_rate": 0.0001607297723261837, "loss": 0.2761, "step": 299 }, { "epoch": 0.66, "grad_norm": 0.22244239877246086, "learning_rate": 0.00016043641240031623, "loss": 0.2794, "step": 300 }, { "epoch": 0.67, "grad_norm": 0.19513520777123738, "learning_rate": 0.00016014223076413173, "loss": 0.2757, "step": 301 }, { "epoch": 0.67, "grad_norm": 0.1954004471181246, "learning_rate": 0.00015984723141740576, "loss": 0.2713, "step": 302 }, { "epoch": 0.67, "grad_norm": 0.21399036326669596, "learning_rate": 0.00015955141837103168, "loss": 0.2767, "step": 303 }, { "epoch": 0.67, "grad_norm": 0.2045546981741005, "learning_rate": 0.0001592547956469662, "loss": 0.2807, "step": 304 }, { "epoch": 0.68, "grad_norm": 0.21321779886240727, "learning_rate": 0.00015895736727817455, "loss": 0.2899, "step": 305 }, { "epoch": 0.68, "grad_norm": 0.20121770170139855, "learning_rate": 0.00015865913730857582, "loss": 0.2706, "step": 306 }, { "epoch": 0.68, "grad_norm": 0.21901656836549904, "learning_rate": 0.00015836010979298782, "loss": 0.3043, "step": 307 }, { "epoch": 0.68, "grad_norm": 0.19476889300368513, "learning_rate": 0.0001580602887970721, "loss": 0.294, "step": 308 }, { "epoch": 0.68, "grad_norm": 0.1864916489760236, "learning_rate": 0.00015775967839727842, "loss": 0.2789, "step": 309 }, { "epoch": 0.69, "grad_norm": 0.2005263777787887, "learning_rate": 0.0001574582826807897, "loss": 0.2786, "step": 310 }, { "epoch": 0.69, "grad_norm": 0.20330910409013883, "learning_rate": 0.0001571561057454661, "loss": 0.2856, "step": 311 }, { "epoch": 0.69, "grad_norm": 0.2075041911121307, "learning_rate": 0.00015685315169978954, "loss": 0.301, "step": 312 }, { "epoch": 0.69, "grad_norm": 0.198545224318664, "learning_rate": 0.0001565494246628077, "loss": 0.2667, "step": 313 }, { "epoch": 0.7, "grad_norm": 0.18262873322406858, "learning_rate": 0.0001562449287640781, "loss": 0.2761, "step": 314 }, { "epoch": 0.7, "grad_norm": 0.20682904427400386, "learning_rate": 0.0001559396681436118, "loss": 0.2798, "step": 315 }, { "epoch": 0.7, "grad_norm": 0.17100330949740156, "learning_rate": 0.00015563364695181741, "loss": 0.2638, "step": 316 }, { "epoch": 0.7, "grad_norm": 0.19147233406100644, "learning_rate": 0.00015532686934944438, "loss": 0.2772, "step": 317 }, { "epoch": 0.7, "grad_norm": 0.20859821686266017, "learning_rate": 0.00015501933950752656, "loss": 0.2899, "step": 318 }, { "epoch": 0.71, "grad_norm": 0.22660943071408993, "learning_rate": 0.00015471106160732542, "loss": 0.2809, "step": 319 }, { "epoch": 0.71, "grad_norm": 0.1970595992988207, "learning_rate": 0.00015440203984027324, "loss": 0.2664, "step": 320 }, { "epoch": 0.71, "grad_norm": 0.21441465355834557, "learning_rate": 0.00015409227840791617, "loss": 0.2872, "step": 321 }, { "epoch": 0.71, "grad_norm": 0.1928160770028818, "learning_rate": 0.000153781781521857, "loss": 0.2932, "step": 322 }, { "epoch": 0.72, "grad_norm": 0.18232874606722474, "learning_rate": 0.00015347055340369804, "loss": 0.2865, "step": 323 }, { "epoch": 0.72, "grad_norm": 0.18944518082470313, "learning_rate": 0.00015315859828498354, "loss": 0.2895, "step": 324 }, { "epoch": 0.72, "grad_norm": 0.18688411692002266, "learning_rate": 0.00015284592040714227, "loss": 0.3068, "step": 325 }, { "epoch": 0.72, "grad_norm": 0.15360887670295487, "learning_rate": 0.00015253252402142988, "loss": 0.2541, "step": 326 }, { "epoch": 0.72, "grad_norm": 0.17886321037354558, "learning_rate": 0.00015221841338887104, "loss": 0.2735, "step": 327 }, { "epoch": 0.73, "grad_norm": 0.18210509607368616, "learning_rate": 0.0001519035927802015, "loss": 0.2674, "step": 328 }, { "epoch": 0.73, "grad_norm": 0.2113800439853146, "learning_rate": 0.00015158806647581002, "loss": 0.2611, "step": 329 }, { "epoch": 0.73, "grad_norm": 0.21664373470568593, "learning_rate": 0.00015127183876568022, "loss": 0.2734, "step": 330 }, { "epoch": 0.73, "grad_norm": 0.21152226314764647, "learning_rate": 0.0001509549139493323, "loss": 0.2605, "step": 331 }, { "epoch": 0.74, "grad_norm": 0.20787608314066813, "learning_rate": 0.0001506372963357644, "loss": 0.2829, "step": 332 }, { "epoch": 0.74, "grad_norm": 0.20890612340451087, "learning_rate": 0.00015031899024339415, "loss": 0.2761, "step": 333 }, { "epoch": 0.74, "grad_norm": 0.18123961862890825, "learning_rate": 0.00015000000000000001, "loss": 0.2625, "step": 334 }, { "epoch": 0.74, "grad_norm": 0.20005061805451332, "learning_rate": 0.00014968032994266224, "loss": 0.2739, "step": 335 }, { "epoch": 0.74, "grad_norm": 0.22011995491515385, "learning_rate": 0.00014935998441770407, "loss": 0.2769, "step": 336 }, { "epoch": 0.75, "grad_norm": 0.1759458248293069, "learning_rate": 0.00014903896778063267, "loss": 0.2751, "step": 337 }, { "epoch": 0.75, "grad_norm": 0.17963701036394614, "learning_rate": 0.00014871728439607966, "loss": 0.2861, "step": 338 }, { "epoch": 0.75, "grad_norm": 0.22395035633400334, "learning_rate": 0.00014839493863774212, "loss": 0.2748, "step": 339 }, { "epoch": 0.75, "eval_loss": 0.27847620844841003, "eval_runtime": 174.097, "eval_samples_per_second": 13.343, "eval_steps_per_second": 0.419, "step": 339 }, { "epoch": 0.75, "grad_norm": 0.19931473943900235, "learning_rate": 0.00014807193488832282, "loss": 0.261, "step": 340 }, { "epoch": 0.76, "grad_norm": 0.1895068181039295, "learning_rate": 0.00014774827753947088, "loss": 0.2666, "step": 341 }, { "epoch": 0.76, "grad_norm": 0.19682173311096884, "learning_rate": 0.00014742397099172183, "loss": 0.2564, "step": 342 }, { "epoch": 0.76, "grad_norm": 0.2183356533233642, "learning_rate": 0.00014709901965443794, "loss": 0.2904, "step": 343 }, { "epoch": 0.76, "grad_norm": 0.2113562507034691, "learning_rate": 0.00014677342794574817, "loss": 0.2915, "step": 344 }, { "epoch": 0.76, "grad_norm": 0.2034784414842117, "learning_rate": 0.00014644720029248829, "loss": 0.2717, "step": 345 }, { "epoch": 0.77, "grad_norm": 0.1967572374697097, "learning_rate": 0.00014612034113014035, "loss": 0.2887, "step": 346 }, { "epoch": 0.77, "grad_norm": 0.1899953535101574, "learning_rate": 0.00014579285490277274, "loss": 0.2922, "step": 347 }, { "epoch": 0.77, "grad_norm": 0.19441384522232566, "learning_rate": 0.0001454647460629795, "loss": 0.2785, "step": 348 }, { "epoch": 0.77, "grad_norm": 0.20710674744690838, "learning_rate": 0.00014513601907181992, "loss": 0.2929, "step": 349 }, { "epoch": 0.78, "grad_norm": 0.18640646337351455, "learning_rate": 0.00014480667839875786, "loss": 0.261, "step": 350 }, { "epoch": 0.78, "grad_norm": 0.1754057901024035, "learning_rate": 0.00014447672852160095, "loss": 0.267, "step": 351 }, { "epoch": 0.78, "grad_norm": 0.1845116395890587, "learning_rate": 0.0001441461739264397, "loss": 0.2608, "step": 352 }, { "epoch": 0.78, "grad_norm": 0.1832842348380918, "learning_rate": 0.00014381501910758662, "loss": 0.264, "step": 353 }, { "epoch": 0.78, "grad_norm": 0.21078762275275081, "learning_rate": 0.00014348326856751496, "loss": 0.2903, "step": 354 }, { "epoch": 0.79, "grad_norm": 0.2040423439668467, "learning_rate": 0.00014315092681679755, "loss": 0.2866, "step": 355 }, { "epoch": 0.79, "grad_norm": 0.20916646529152538, "learning_rate": 0.00014281799837404552, "loss": 0.2669, "step": 356 }, { "epoch": 0.79, "grad_norm": 0.20829199241699958, "learning_rate": 0.00014248448776584688, "loss": 0.2773, "step": 357 }, { "epoch": 0.79, "grad_norm": 0.22863270132432637, "learning_rate": 0.0001421503995267048, "loss": 0.2691, "step": 358 }, { "epoch": 0.8, "grad_norm": 0.19107899619014693, "learning_rate": 0.00014181573819897617, "loss": 0.2854, "step": 359 }, { "epoch": 0.8, "grad_norm": 0.18310920202899897, "learning_rate": 0.00014148050833280977, "loss": 0.2523, "step": 360 }, { "epoch": 0.8, "grad_norm": 0.19142180009852441, "learning_rate": 0.00014114471448608426, "loss": 0.2668, "step": 361 }, { "epoch": 0.8, "grad_norm": 0.17916118207750328, "learning_rate": 0.0001408083612243465, "loss": 0.2826, "step": 362 }, { "epoch": 0.8, "grad_norm": 0.1795532305403206, "learning_rate": 0.0001404714531207492, "loss": 0.2876, "step": 363 }, { "epoch": 0.81, "grad_norm": 0.19812580507096825, "learning_rate": 0.0001401339947559889, "loss": 0.2813, "step": 364 }, { "epoch": 0.81, "grad_norm": 0.16985158453017796, "learning_rate": 0.00013979599071824362, "loss": 0.2824, "step": 365 }, { "epoch": 0.81, "grad_norm": 0.20447909651191687, "learning_rate": 0.00013945744560311057, "loss": 0.2567, "step": 366 }, { "epoch": 0.81, "grad_norm": 0.20178806044954373, "learning_rate": 0.0001391183640135435, "loss": 0.2755, "step": 367 }, { "epoch": 0.82, "grad_norm": 0.2265042443851448, "learning_rate": 0.00013877875055979023, "loss": 0.2905, "step": 368 }, { "epoch": 0.82, "grad_norm": 0.22127748544952333, "learning_rate": 0.00013843860985933003, "loss": 0.2624, "step": 369 }, { "epoch": 0.82, "grad_norm": 0.20805077265871227, "learning_rate": 0.00013809794653681074, "loss": 0.2396, "step": 370 }, { "epoch": 0.82, "grad_norm": 0.21236429860308073, "learning_rate": 0.00013775676522398588, "loss": 0.2916, "step": 371 }, { "epoch": 0.82, "grad_norm": 0.19893678645018886, "learning_rate": 0.00013741507055965168, "loss": 0.2551, "step": 372 }, { "epoch": 0.83, "grad_norm": 0.18256634499151977, "learning_rate": 0.00013707286718958413, "loss": 0.2633, "step": 373 }, { "epoch": 0.83, "grad_norm": 0.18221655160895703, "learning_rate": 0.00013673015976647568, "loss": 0.2672, "step": 374 }, { "epoch": 0.83, "grad_norm": 0.19429764298431815, "learning_rate": 0.00013638695294987204, "loss": 0.2417, "step": 375 }, { "epoch": 0.83, "grad_norm": 0.18793583918739298, "learning_rate": 0.0001360432514061087, "loss": 0.2613, "step": 376 }, { "epoch": 0.83, "grad_norm": 0.2011435035031983, "learning_rate": 0.00013569905980824788, "loss": 0.2685, "step": 377 }, { "epoch": 0.84, "grad_norm": 0.2052637622415086, "learning_rate": 0.00013535438283601435, "loss": 0.2959, "step": 378 }, { "epoch": 0.84, "grad_norm": 0.1945089020971851, "learning_rate": 0.00013500922517573245, "loss": 0.2482, "step": 379 }, { "epoch": 0.84, "grad_norm": 0.20812822716018067, "learning_rate": 0.00013466359152026195, "loss": 0.2741, "step": 380 }, { "epoch": 0.84, "grad_norm": 0.22384533508071883, "learning_rate": 0.0001343174865689344, "loss": 0.2914, "step": 381 }, { "epoch": 0.85, "grad_norm": 0.19177959970339475, "learning_rate": 0.0001339709150274893, "loss": 0.2694, "step": 382 }, { "epoch": 0.85, "grad_norm": 0.18196106404878304, "learning_rate": 0.0001336238816080099, "loss": 0.2649, "step": 383 }, { "epoch": 0.85, "grad_norm": 0.2526481416447456, "learning_rate": 0.00013327639102885937, "loss": 0.2813, "step": 384 }, { "epoch": 0.85, "grad_norm": 0.26250954621985834, "learning_rate": 0.0001329284480146166, "loss": 0.2793, "step": 385 }, { "epoch": 0.85, "grad_norm": 0.19318862080071506, "learning_rate": 0.00013258005729601177, "loss": 0.2441, "step": 386 }, { "epoch": 0.86, "grad_norm": 0.21447364833995924, "learning_rate": 0.00013223122360986225, "loss": 0.2887, "step": 387 }, { "epoch": 0.86, "grad_norm": 0.18021814111984105, "learning_rate": 0.00013188195169900813, "loss": 0.2722, "step": 388 }, { "epoch": 0.86, "grad_norm": 0.1986609859985159, "learning_rate": 0.0001315322463122477, "loss": 0.2638, "step": 389 }, { "epoch": 0.86, "grad_norm": 0.24674716367898283, "learning_rate": 0.00013118211220427298, "loss": 0.2753, "step": 390 }, { "epoch": 0.87, "grad_norm": 0.22439353357611744, "learning_rate": 0.0001308315541356049, "loss": 0.2853, "step": 391 }, { "epoch": 0.87, "grad_norm": 0.17776179888083857, "learning_rate": 0.00013048057687252865, "loss": 0.2441, "step": 392 }, { "epoch": 0.87, "grad_norm": 0.21002490821212472, "learning_rate": 0.00013012918518702914, "loss": 0.2882, "step": 393 }, { "epoch": 0.87, "grad_norm": 0.18514107951857695, "learning_rate": 0.00012977738385672557, "loss": 0.2643, "step": 394 }, { "epoch": 0.87, "grad_norm": 0.18577497415553973, "learning_rate": 0.000129425177664807, "loss": 0.2588, "step": 395 }, { "epoch": 0.88, "grad_norm": 0.21465706083350347, "learning_rate": 0.00012907257139996704, "loss": 0.287, "step": 396 }, { "epoch": 0.88, "grad_norm": 0.1742663966229961, "learning_rate": 0.0001287195698563388, "loss": 0.2668, "step": 397 }, { "epoch": 0.88, "grad_norm": 0.20553702826350395, "learning_rate": 0.0001283661778334297, "loss": 0.2739, "step": 398 }, { "epoch": 0.88, "grad_norm": 0.16765771945192084, "learning_rate": 0.0001280124001360562, "loss": 0.2594, "step": 399 }, { "epoch": 0.89, "grad_norm": 0.1943563901020515, "learning_rate": 0.0001276582415742786, "loss": 0.2708, "step": 400 }, { "epoch": 0.89, "grad_norm": 0.1896178424172107, "learning_rate": 0.0001273037069633354, "loss": 0.2639, "step": 401 }, { "epoch": 0.89, "grad_norm": 1.0712870595212587, "learning_rate": 0.00012694880112357808, "loss": 0.2765, "step": 402 }, { "epoch": 0.89, "grad_norm": 0.2092081458414992, "learning_rate": 0.00012659352888040547, "loss": 0.2589, "step": 403 }, { "epoch": 0.89, "grad_norm": 0.21211010351459872, "learning_rate": 0.0001262378950641979, "loss": 0.285, "step": 404 }, { "epoch": 0.9, "grad_norm": 0.23377789093835202, "learning_rate": 0.00012588190451025207, "loss": 0.3038, "step": 405 }, { "epoch": 0.9, "grad_norm": 0.24727367430206143, "learning_rate": 0.00012552556205871478, "loss": 0.2577, "step": 406 }, { "epoch": 0.9, "grad_norm": 0.1800512798597796, "learning_rate": 0.00012516887255451735, "loss": 0.2392, "step": 407 }, { "epoch": 0.9, "grad_norm": 0.2114523591213948, "learning_rate": 0.00012481184084730976, "loss": 0.27, "step": 408 }, { "epoch": 0.91, "grad_norm": 0.22674225384993477, "learning_rate": 0.0001244544717913947, "loss": 0.2372, "step": 409 }, { "epoch": 0.91, "grad_norm": 0.17864125878915538, "learning_rate": 0.00012409677024566144, "loss": 0.2601, "step": 410 }, { "epoch": 0.91, "grad_norm": 0.1904684467485375, "learning_rate": 0.00012373874107352004, "loss": 0.2647, "step": 411 }, { "epoch": 0.91, "grad_norm": 0.17298512767214175, "learning_rate": 0.0001233803891428349, "loss": 0.2438, "step": 412 }, { "epoch": 0.91, "grad_norm": 0.17544690229663243, "learning_rate": 0.00012302171932585885, "loss": 0.2585, "step": 413 }, { "epoch": 0.92, "grad_norm": 0.20074139062085292, "learning_rate": 0.0001226627364991667, "loss": 0.2575, "step": 414 }, { "epoch": 0.92, "grad_norm": 0.18591883669880138, "learning_rate": 0.0001223034455435891, "loss": 0.2593, "step": 415 }, { "epoch": 0.92, "grad_norm": 0.20298533039339473, "learning_rate": 0.00012194385134414608, "loss": 0.2779, "step": 416 }, { "epoch": 0.92, "grad_norm": 0.19947012767946487, "learning_rate": 0.00012158395878998063, "loss": 0.2776, "step": 417 }, { "epoch": 0.93, "grad_norm": 0.18367461189103365, "learning_rate": 0.00012122377277429231, "loss": 0.2934, "step": 418 }, { "epoch": 0.93, "grad_norm": 0.18639410238342366, "learning_rate": 0.00012086329819427065, "loss": 0.2848, "step": 419 }, { "epoch": 0.93, "grad_norm": 0.18355116451736245, "learning_rate": 0.00012050253995102854, "loss": 0.2864, "step": 420 }, { "epoch": 0.93, "grad_norm": 0.16618761241876043, "learning_rate": 0.00012014150294953563, "loss": 0.2722, "step": 421 }, { "epoch": 0.93, "grad_norm": 0.1679981782550472, "learning_rate": 0.00011978019209855174, "loss": 0.2417, "step": 422 }, { "epoch": 0.94, "grad_norm": 0.17373986628880816, "learning_rate": 0.00011941861231055994, "loss": 0.2464, "step": 423 }, { "epoch": 0.94, "grad_norm": 0.16872090115264807, "learning_rate": 0.0001190567685016998, "loss": 0.2541, "step": 424 }, { "epoch": 0.94, "grad_norm": 0.17551340253440462, "learning_rate": 0.00011869466559170073, "loss": 0.2521, "step": 425 }, { "epoch": 0.94, "grad_norm": 0.18233468138593362, "learning_rate": 0.00011833230850381487, "loss": 0.2712, "step": 426 }, { "epoch": 0.95, "grad_norm": 0.18504780357931186, "learning_rate": 0.00011796970216475018, "loss": 0.2754, "step": 427 }, { "epoch": 0.95, "grad_norm": 0.18478336853897961, "learning_rate": 0.00011760685150460362, "loss": 0.2592, "step": 428 }, { "epoch": 0.95, "grad_norm": 0.19917097640828008, "learning_rate": 0.00011724376145679394, "loss": 0.2855, "step": 429 }, { "epoch": 0.95, "grad_norm": 0.19144241808727208, "learning_rate": 0.00011688043695799468, "loss": 0.2502, "step": 430 }, { "epoch": 0.95, "grad_norm": 0.18405239160044018, "learning_rate": 0.00011651688294806706, "loss": 0.2477, "step": 431 }, { "epoch": 0.96, "grad_norm": 0.1796721265270086, "learning_rate": 0.00011615310436999279, "loss": 0.249, "step": 432 }, { "epoch": 0.96, "grad_norm": 0.198923200273133, "learning_rate": 0.00011578910616980683, "loss": 0.2559, "step": 433 }, { "epoch": 0.96, "grad_norm": 0.2056661440897021, "learning_rate": 0.00011542489329653024, "loss": 0.2645, "step": 434 }, { "epoch": 0.96, "grad_norm": 0.21059350822537876, "learning_rate": 0.00011506047070210282, "loss": 0.2747, "step": 435 }, { "epoch": 0.97, "grad_norm": 0.19445339084850757, "learning_rate": 0.00011469584334131578, "loss": 0.2732, "step": 436 }, { "epoch": 0.97, "grad_norm": 0.1978300160069901, "learning_rate": 0.0001143310161717444, "loss": 0.2653, "step": 437 }, { "epoch": 0.97, "grad_norm": 0.1992099703929807, "learning_rate": 0.00011396599415368061, "loss": 0.2775, "step": 438 }, { "epoch": 0.97, "grad_norm": 0.20064532246856512, "learning_rate": 0.00011360078225006562, "loss": 0.2642, "step": 439 }, { "epoch": 0.97, "grad_norm": 0.19513907362313387, "learning_rate": 0.00011323538542642227, "loss": 0.2554, "step": 440 }, { "epoch": 0.98, "grad_norm": 0.191943659814926, "learning_rate": 0.00011286980865078763, "loss": 0.2778, "step": 441 }, { "epoch": 0.98, "grad_norm": 0.1921885618320068, "learning_rate": 0.0001125040568936456, "loss": 0.2734, "step": 442 }, { "epoch": 0.98, "grad_norm": 0.19140570504247198, "learning_rate": 0.00011213813512785898, "loss": 0.2809, "step": 443 }, { "epoch": 0.98, "grad_norm": 0.1772706721475436, "learning_rate": 0.00011177204832860213, "loss": 0.2558, "step": 444 }, { "epoch": 0.99, "grad_norm": 0.17875694766580855, "learning_rate": 0.00011140580147329338, "loss": 0.2569, "step": 445 }, { "epoch": 0.99, "grad_norm": 0.1891065652583873, "learning_rate": 0.000111039399541527, "loss": 0.271, "step": 446 }, { "epoch": 0.99, "grad_norm": 0.17377167224369017, "learning_rate": 0.00011067284751500583, "loss": 0.2711, "step": 447 }, { "epoch": 0.99, "grad_norm": 0.16281001225472738, "learning_rate": 0.00011030615037747353, "loss": 0.2464, "step": 448 }, { "epoch": 0.99, "grad_norm": 0.17949964198592142, "learning_rate": 0.0001099393131146466, "loss": 0.2514, "step": 449 }, { "epoch": 1.0, "grad_norm": 0.18463419257880492, "learning_rate": 0.00010957234071414674, "loss": 0.2799, "step": 450 }, { "epoch": 1.0, "grad_norm": 0.18137088353673406, "learning_rate": 0.00010920523816543309, "loss": 0.2615, "step": 451 }, { "epoch": 1.0, "grad_norm": 0.18259357360917502, "learning_rate": 0.00010883801045973425, "loss": 0.2812, "step": 452 }, { "epoch": 1.0, "eval_loss": 0.2611839771270752, "eval_runtime": 174.0726, "eval_samples_per_second": 13.345, "eval_steps_per_second": 0.419, "step": 452 }, { "epoch": 1.0, "grad_norm": 0.20671514454621873, "learning_rate": 0.00010847066258998053, "loss": 0.2973, "step": 453 }, { "epoch": 1.01, "grad_norm": 0.18947687688376835, "learning_rate": 0.00010810319955073601, "loss": 0.2752, "step": 454 }, { "epoch": 1.01, "grad_norm": 0.20255312746628473, "learning_rate": 0.00010773562633813061, "loss": 0.2548, "step": 455 }, { "epoch": 1.01, "grad_norm": 0.18423910547834596, "learning_rate": 0.00010736794794979227, "loss": 0.2553, "step": 456 }, { "epoch": 1.01, "grad_norm": 0.19708052534061443, "learning_rate": 0.00010700016938477891, "loss": 0.2494, "step": 457 }, { "epoch": 1.01, "grad_norm": 0.17998530753425868, "learning_rate": 0.00010663229564351041, "loss": 0.2546, "step": 458 }, { "epoch": 1.02, "grad_norm": 0.2085635770627596, "learning_rate": 0.00010626433172770078, "loss": 0.2596, "step": 459 }, { "epoch": 1.02, "grad_norm": 0.18909283426176846, "learning_rate": 0.00010589628264029006, "loss": 0.2384, "step": 460 }, { "epoch": 1.0, "grad_norm": 0.20378232017976675, "learning_rate": 0.0001055281533853762, "loss": 0.2283, "step": 461 }, { "epoch": 1.0, "grad_norm": 0.208268642315818, "learning_rate": 0.00010515994896814731, "loss": 0.2436, "step": 462 }, { "epoch": 1.01, "grad_norm": 0.1809796643912464, "learning_rate": 0.00010479167439481328, "loss": 0.2476, "step": 463 }, { "epoch": 1.01, "grad_norm": 0.1954805890621439, "learning_rate": 0.00010442333467253789, "loss": 0.2412, "step": 464 }, { "epoch": 1.01, "grad_norm": 0.1812710951170547, "learning_rate": 0.00010405493480937076, "loss": 0.2325, "step": 465 }, { "epoch": 1.01, "grad_norm": 0.20086044909183742, "learning_rate": 0.00010368647981417916, "loss": 0.2552, "step": 466 }, { "epoch": 1.02, "grad_norm": 0.18278992994480658, "learning_rate": 0.00010331797469657992, "loss": 0.2381, "step": 467 }, { "epoch": 1.02, "grad_norm": 0.17652404986801007, "learning_rate": 0.00010294942446687146, "loss": 0.2528, "step": 468 }, { "epoch": 1.02, "grad_norm": 0.19696889409419022, "learning_rate": 0.00010258083413596537, "loss": 0.2251, "step": 469 }, { "epoch": 1.02, "grad_norm": 0.20649006123097716, "learning_rate": 0.00010221220871531869, "loss": 0.241, "step": 470 }, { "epoch": 1.02, "grad_norm": 0.2048554346722386, "learning_rate": 0.00010184355321686538, "loss": 0.2507, "step": 471 }, { "epoch": 1.03, "grad_norm": 0.19936391739850573, "learning_rate": 0.00010147487265294844, "loss": 0.2483, "step": 472 }, { "epoch": 1.03, "grad_norm": 0.20014649703171655, "learning_rate": 0.0001011061720362516, "loss": 0.243, "step": 473 }, { "epoch": 1.03, "grad_norm": 0.19698954610511737, "learning_rate": 0.00010073745637973124, "loss": 0.2655, "step": 474 }, { "epoch": 1.03, "grad_norm": 0.18526644345070792, "learning_rate": 0.00010036873069654832, "loss": 0.2372, "step": 475 }, { "epoch": 1.04, "grad_norm": 0.19481973475599917, "learning_rate": 0.0001, "loss": 0.2446, "step": 476 }, { "epoch": 1.04, "grad_norm": 0.20394580318594288, "learning_rate": 9.96312693034517e-05, "loss": 0.2511, "step": 477 }, { "epoch": 1.04, "grad_norm": 0.220457159395997, "learning_rate": 9.926254362026875e-05, "loss": 0.2445, "step": 478 }, { "epoch": 1.04, "grad_norm": 0.18633719328240075, "learning_rate": 9.889382796374844e-05, "loss": 0.2377, "step": 479 }, { "epoch": 1.04, "grad_norm": 0.1927353006687828, "learning_rate": 9.852512734705159e-05, "loss": 0.2528, "step": 480 }, { "epoch": 1.05, "grad_norm": 0.22794683697537815, "learning_rate": 9.815644678313462e-05, "loss": 0.2592, "step": 481 }, { "epoch": 1.05, "grad_norm": 0.20639017103117685, "learning_rate": 9.778779128468132e-05, "loss": 0.2433, "step": 482 }, { "epoch": 1.05, "grad_norm": 0.20823587084943018, "learning_rate": 9.741916586403465e-05, "loss": 0.2411, "step": 483 }, { "epoch": 1.05, "grad_norm": 0.20402677890788284, "learning_rate": 9.705057553312856e-05, "loss": 0.2577, "step": 484 }, { "epoch": 1.06, "grad_norm": 0.19085117547328076, "learning_rate": 9.668202530342009e-05, "loss": 0.2209, "step": 485 }, { "epoch": 1.06, "grad_norm": 0.194053406187504, "learning_rate": 9.631352018582089e-05, "loss": 0.241, "step": 486 }, { "epoch": 1.06, "grad_norm": 0.20333052287999634, "learning_rate": 9.594506519062925e-05, "loss": 0.2526, "step": 487 }, { "epoch": 1.06, "grad_norm": 0.1889581802336701, "learning_rate": 9.557666532746213e-05, "loss": 0.2118, "step": 488 }, { "epoch": 1.06, "grad_norm": 0.1947056880617223, "learning_rate": 9.520832560518673e-05, "loss": 0.2245, "step": 489 }, { "epoch": 1.07, "grad_norm": 0.19441196737143498, "learning_rate": 9.48400510318527e-05, "loss": 0.2498, "step": 490 }, { "epoch": 1.07, "grad_norm": 0.19134129181023776, "learning_rate": 9.447184661462382e-05, "loss": 0.2186, "step": 491 }, { "epoch": 1.07, "grad_norm": 0.1932180061270486, "learning_rate": 9.410371735970997e-05, "loss": 0.2376, "step": 492 }, { "epoch": 1.07, "grad_norm": 0.21806461386450074, "learning_rate": 9.373566827229923e-05, "loss": 0.2519, "step": 493 }, { "epoch": 1.08, "grad_norm": 0.19951555170904356, "learning_rate": 9.336770435648964e-05, "loss": 0.2467, "step": 494 }, { "epoch": 1.08, "grad_norm": 0.22251564739691396, "learning_rate": 9.29998306152211e-05, "loss": 0.2661, "step": 495 }, { "epoch": 1.08, "grad_norm": 0.21386539336545002, "learning_rate": 9.263205205020775e-05, "loss": 0.243, "step": 496 }, { "epoch": 1.08, "grad_norm": 0.2097758786130228, "learning_rate": 9.226437366186941e-05, "loss": 0.2278, "step": 497 }, { "epoch": 1.08, "grad_norm": 0.227474113330377, "learning_rate": 9.1896800449264e-05, "loss": 0.2381, "step": 498 }, { "epoch": 1.09, "grad_norm": 0.2127655256693725, "learning_rate": 9.15293374100195e-05, "loss": 0.2156, "step": 499 }, { "epoch": 1.09, "grad_norm": 0.21595500178149832, "learning_rate": 9.116198954026577e-05, "loss": 0.2534, "step": 500 }, { "epoch": 1.09, "grad_norm": 0.20638365496977454, "learning_rate": 9.07947618345669e-05, "loss": 0.2252, "step": 501 }, { "epoch": 1.09, "grad_norm": 0.2111994035780588, "learning_rate": 9.042765928585327e-05, "loss": 0.2427, "step": 502 }, { "epoch": 1.1, "grad_norm": 0.2015346478285064, "learning_rate": 9.006068688535342e-05, "loss": 0.2434, "step": 503 }, { "epoch": 1.1, "grad_norm": 0.20144655863058586, "learning_rate": 8.969384962252646e-05, "loss": 0.2324, "step": 504 }, { "epoch": 1.1, "grad_norm": 0.1966831938081588, "learning_rate": 8.932715248499418e-05, "loss": 0.2342, "step": 505 }, { "epoch": 1.1, "grad_norm": 0.2077508601935168, "learning_rate": 8.896060045847304e-05, "loss": 0.2462, "step": 506 }, { "epoch": 1.1, "grad_norm": 0.21104976527581248, "learning_rate": 8.859419852670664e-05, "loss": 0.2381, "step": 507 }, { "epoch": 1.11, "grad_norm": 0.22776314339414724, "learning_rate": 8.822795167139788e-05, "loss": 0.2531, "step": 508 }, { "epoch": 1.11, "grad_norm": 0.2094206489789516, "learning_rate": 8.786186487214107e-05, "loss": 0.2416, "step": 509 }, { "epoch": 1.11, "grad_norm": 0.21100736273500442, "learning_rate": 8.749594310635443e-05, "loss": 0.2582, "step": 510 }, { "epoch": 1.11, "grad_norm": 0.19053851221493054, "learning_rate": 8.713019134921238e-05, "loss": 0.2238, "step": 511 }, { "epoch": 1.12, "grad_norm": 0.19389337918079758, "learning_rate": 8.676461457357776e-05, "loss": 0.2338, "step": 512 }, { "epoch": 1.12, "grad_norm": 0.19404522798986354, "learning_rate": 8.639921774993439e-05, "loss": 0.2391, "step": 513 }, { "epoch": 1.12, "grad_norm": 0.21274678156633864, "learning_rate": 8.60340058463194e-05, "loss": 0.2604, "step": 514 }, { "epoch": 1.12, "grad_norm": 0.1969735372029067, "learning_rate": 8.566898382825559e-05, "loss": 0.2219, "step": 515 }, { "epoch": 1.12, "grad_norm": 0.1909564045414605, "learning_rate": 8.530415665868424e-05, "loss": 0.2379, "step": 516 }, { "epoch": 1.13, "grad_norm": 0.2076961618333992, "learning_rate": 8.49395292978972e-05, "loss": 0.2343, "step": 517 }, { "epoch": 1.13, "grad_norm": 0.2275056200459885, "learning_rate": 8.457510670346976e-05, "loss": 0.2558, "step": 518 }, { "epoch": 1.13, "grad_norm": 0.20774675383309715, "learning_rate": 8.421089383019319e-05, "loss": 0.2549, "step": 519 }, { "epoch": 1.13, "grad_norm": 0.2221943514742673, "learning_rate": 8.384689563000723e-05, "loss": 0.2464, "step": 520 }, { "epoch": 1.14, "grad_norm": 0.22707800753207244, "learning_rate": 8.348311705193293e-05, "loss": 0.243, "step": 521 }, { "epoch": 1.14, "grad_norm": 0.2447932865549153, "learning_rate": 8.311956304200533e-05, "loss": 0.264, "step": 522 }, { "epoch": 1.14, "grad_norm": 0.2072307507148345, "learning_rate": 8.275623854320608e-05, "loss": 0.2449, "step": 523 }, { "epoch": 1.14, "grad_norm": 0.19366898457969708, "learning_rate": 8.239314849539638e-05, "loss": 0.2402, "step": 524 }, { "epoch": 1.14, "grad_norm": 0.20056495036693472, "learning_rate": 8.203029783524984e-05, "loss": 0.2257, "step": 525 }, { "epoch": 1.15, "grad_norm": 0.2084543244379053, "learning_rate": 8.166769149618517e-05, "loss": 0.2493, "step": 526 }, { "epoch": 1.15, "grad_norm": 0.207683866170019, "learning_rate": 8.130533440829928e-05, "loss": 0.2435, "step": 527 }, { "epoch": 1.15, "grad_norm": 0.20125088482715547, "learning_rate": 8.094323149830023e-05, "loss": 0.2413, "step": 528 }, { "epoch": 1.15, "grad_norm": 0.18512657144962646, "learning_rate": 8.058138768944013e-05, "loss": 0.2272, "step": 529 }, { "epoch": 1.16, "grad_norm": 0.19840383281608243, "learning_rate": 8.021980790144827e-05, "loss": 0.2301, "step": 530 }, { "epoch": 1.16, "grad_norm": 0.19751451861458907, "learning_rate": 7.985849705046439e-05, "loss": 0.2392, "step": 531 }, { "epoch": 1.16, "grad_norm": 0.21690288735429328, "learning_rate": 7.949746004897151e-05, "loss": 0.2441, "step": 532 }, { "epoch": 1.16, "grad_norm": 0.21086255260898568, "learning_rate": 7.913670180572936e-05, "loss": 0.2533, "step": 533 }, { "epoch": 1.16, "grad_norm": 0.21659430069441296, "learning_rate": 7.877622722570771e-05, "loss": 0.2451, "step": 534 }, { "epoch": 1.17, "grad_norm": 0.21073430482336133, "learning_rate": 7.84160412100194e-05, "loss": 0.2354, "step": 535 }, { "epoch": 1.17, "grad_norm": 0.19999698025688695, "learning_rate": 7.805614865585396e-05, "loss": 0.2274, "step": 536 }, { "epoch": 1.17, "grad_norm": 0.21741693336883022, "learning_rate": 7.769655445641093e-05, "loss": 0.243, "step": 537 }, { "epoch": 1.17, "grad_norm": 0.19692922935201004, "learning_rate": 7.733726350083332e-05, "loss": 0.237, "step": 538 }, { "epoch": 1.17, "grad_norm": 0.20151776142559946, "learning_rate": 7.697828067414119e-05, "loss": 0.2363, "step": 539 }, { "epoch": 1.18, "grad_norm": 0.22663937283958677, "learning_rate": 7.661961085716512e-05, "loss": 0.2426, "step": 540 }, { "epoch": 1.18, "grad_norm": 0.21759095693447697, "learning_rate": 7.626125892647997e-05, "loss": 0.2675, "step": 541 }, { "epoch": 1.18, "grad_norm": 0.20761753236557173, "learning_rate": 7.590322975433857e-05, "loss": 0.237, "step": 542 }, { "epoch": 1.18, "grad_norm": 0.18838661893932568, "learning_rate": 7.554552820860534e-05, "loss": 0.2258, "step": 543 }, { "epoch": 1.19, "grad_norm": 0.2110298449100788, "learning_rate": 7.518815915269024e-05, "loss": 0.2496, "step": 544 }, { "epoch": 1.19, "grad_norm": 0.20854600045531033, "learning_rate": 7.483112744548268e-05, "loss": 0.2626, "step": 545 }, { "epoch": 1.19, "grad_norm": 0.19208651929110873, "learning_rate": 7.447443794128525e-05, "loss": 0.2335, "step": 546 }, { "epoch": 1.19, "grad_norm": 0.18632455778406, "learning_rate": 7.411809548974792e-05, "loss": 0.2247, "step": 547 }, { "epoch": 1.19, "grad_norm": 0.20794441259868549, "learning_rate": 7.376210493580212e-05, "loss": 0.2479, "step": 548 }, { "epoch": 1.2, "grad_norm": 0.20925285349145628, "learning_rate": 7.34064711195946e-05, "loss": 0.2438, "step": 549 }, { "epoch": 1.2, "grad_norm": 0.19745634320778846, "learning_rate": 7.305119887642191e-05, "loss": 0.2622, "step": 550 }, { "epoch": 1.2, "grad_norm": 0.1856897506744385, "learning_rate": 7.269629303666463e-05, "loss": 0.2251, "step": 551 }, { "epoch": 1.2, "grad_norm": 0.19811525650313866, "learning_rate": 7.234175842572145e-05, "loss": 0.2464, "step": 552 }, { "epoch": 1.21, "grad_norm": 0.20082814748751002, "learning_rate": 7.198759986394381e-05, "loss": 0.2406, "step": 553 }, { "epoch": 1.21, "grad_norm": 0.20448100943924474, "learning_rate": 7.163382216657034e-05, "loss": 0.2536, "step": 554 }, { "epoch": 1.21, "grad_norm": 0.2087548947902547, "learning_rate": 7.128043014366123e-05, "loss": 0.2432, "step": 555 }, { "epoch": 1.21, "grad_norm": 0.18979837479724626, "learning_rate": 7.092742860003297e-05, "loss": 0.2286, "step": 556 }, { "epoch": 1.21, "grad_norm": 0.1938685062447375, "learning_rate": 7.057482233519302e-05, "loss": 0.2332, "step": 557 }, { "epoch": 1.22, "grad_norm": 0.19389222639217057, "learning_rate": 7.022261614327448e-05, "loss": 0.2259, "step": 558 }, { "epoch": 1.22, "grad_norm": 0.2028721556190883, "learning_rate": 6.98708148129709e-05, "loss": 0.2522, "step": 559 }, { "epoch": 1.22, "grad_norm": 0.2030667349807296, "learning_rate": 6.951942312747134e-05, "loss": 0.2414, "step": 560 }, { "epoch": 1.22, "grad_norm": 0.2118463451987899, "learning_rate": 6.916844586439513e-05, "loss": 0.2521, "step": 561 }, { "epoch": 1.23, "grad_norm": 0.21226696216160676, "learning_rate": 6.881788779572705e-05, "loss": 0.2443, "step": 562 }, { "epoch": 1.23, "grad_norm": 0.20461446026763713, "learning_rate": 6.84677536877523e-05, "loss": 0.2323, "step": 563 }, { "epoch": 1.23, "grad_norm": 0.19401289577735006, "learning_rate": 6.811804830099187e-05, "loss": 0.2452, "step": 564 }, { "epoch": 1.23, "grad_norm": 0.19494475075646883, "learning_rate": 6.776877639013777e-05, "loss": 0.2276, "step": 565 }, { "epoch": 1.23, "eval_loss": 0.25228264927864075, "eval_runtime": 174.1389, "eval_samples_per_second": 13.34, "eval_steps_per_second": 0.419, "step": 565 }, { "epoch": 1.23, "grad_norm": 0.18369247947269843, "learning_rate": 6.741994270398826e-05, "loss": 0.2327, "step": 566 }, { "epoch": 1.24, "grad_norm": 0.19844990451218875, "learning_rate": 6.70715519853834e-05, "loss": 0.2121, "step": 567 }, { "epoch": 1.24, "grad_norm": 0.2000478247742355, "learning_rate": 6.672360897114062e-05, "loss": 0.216, "step": 568 }, { "epoch": 1.24, "grad_norm": 0.2066389631842727, "learning_rate": 6.637611839199013e-05, "loss": 0.2245, "step": 569 }, { "epoch": 1.24, "grad_norm": 0.19561029619648587, "learning_rate": 6.602908497251072e-05, "loss": 0.2269, "step": 570 }, { "epoch": 1.25, "grad_norm": 0.17858369393459886, "learning_rate": 6.568251343106562e-05, "loss": 0.2371, "step": 571 }, { "epoch": 1.25, "grad_norm": 0.2042913512728603, "learning_rate": 6.533640847973808e-05, "loss": 0.2347, "step": 572 }, { "epoch": 1.25, "grad_norm": 0.19680273871868867, "learning_rate": 6.499077482426756e-05, "loss": 0.2246, "step": 573 }, { "epoch": 1.25, "grad_norm": 0.2019623623327301, "learning_rate": 6.464561716398565e-05, "loss": 0.225, "step": 574 }, { "epoch": 1.25, "grad_norm": 0.22138397726642448, "learning_rate": 6.430094019175217e-05, "loss": 0.249, "step": 575 }, { "epoch": 1.26, "grad_norm": 0.20441012808266243, "learning_rate": 6.395674859389127e-05, "loss": 0.2226, "step": 576 }, { "epoch": 1.26, "grad_norm": 0.22123583080158427, "learning_rate": 6.361304705012798e-05, "loss": 0.242, "step": 577 }, { "epoch": 1.26, "grad_norm": 0.20417002392136502, "learning_rate": 6.326984023352435e-05, "loss": 0.2214, "step": 578 }, { "epoch": 1.26, "grad_norm": 0.20372731516937811, "learning_rate": 6.292713281041588e-05, "loss": 0.2308, "step": 579 }, { "epoch": 1.27, "grad_norm": 0.19749044128114374, "learning_rate": 6.258492944034833e-05, "loss": 0.2331, "step": 580 }, { "epoch": 1.27, "grad_norm": 0.19648293746816736, "learning_rate": 6.224323477601417e-05, "loss": 0.2325, "step": 581 }, { "epoch": 1.27, "grad_norm": 0.19271942162617067, "learning_rate": 6.190205346318927e-05, "loss": 0.2312, "step": 582 }, { "epoch": 1.27, "grad_norm": 0.21072725131725892, "learning_rate": 6.156139014066999e-05, "loss": 0.2423, "step": 583 }, { "epoch": 1.27, "grad_norm": 0.19198580990324404, "learning_rate": 6.122124944020977e-05, "loss": 0.2347, "step": 584 }, { "epoch": 1.28, "grad_norm": 0.21149357156024817, "learning_rate": 6.0881635986456555e-05, "loss": 0.2246, "step": 585 }, { "epoch": 1.28, "grad_norm": 0.19079790950210723, "learning_rate": 6.054255439688947e-05, "loss": 0.2266, "step": 586 }, { "epoch": 1.28, "grad_norm": 0.19482559547798342, "learning_rate": 6.020400928175637e-05, "loss": 0.2265, "step": 587 }, { "epoch": 1.28, "grad_norm": 0.19971007179240052, "learning_rate": 5.986600524401112e-05, "loss": 0.2355, "step": 588 }, { "epoch": 1.29, "grad_norm": 0.19343926484407545, "learning_rate": 5.9528546879250825e-05, "loss": 0.2218, "step": 589 }, { "epoch": 1.29, "grad_norm": 0.19930073976975177, "learning_rate": 5.91916387756535e-05, "loss": 0.2403, "step": 590 }, { "epoch": 1.29, "grad_norm": 0.20394273446146363, "learning_rate": 5.885528551391576e-05, "loss": 0.2105, "step": 591 }, { "epoch": 1.29, "grad_norm": 0.22138051923326063, "learning_rate": 5.8519491667190264e-05, "loss": 0.2329, "step": 592 }, { "epoch": 1.29, "grad_norm": 0.20190957372704954, "learning_rate": 5.818426180102382e-05, "loss": 0.2486, "step": 593 }, { "epoch": 1.3, "grad_norm": 0.20629696816954632, "learning_rate": 5.784960047329519e-05, "loss": 0.2228, "step": 594 }, { "epoch": 1.3, "grad_norm": 0.20235775048560617, "learning_rate": 5.751551223415313e-05, "loss": 0.24, "step": 595 }, { "epoch": 1.3, "grad_norm": 0.1922696841143549, "learning_rate": 5.718200162595449e-05, "loss": 0.2286, "step": 596 }, { "epoch": 1.3, "grad_norm": 0.19801052068324668, "learning_rate": 5.684907318320247e-05, "loss": 0.2328, "step": 597 }, { "epoch": 1.31, "grad_norm": 0.22189721177634955, "learning_rate": 5.651673143248508e-05, "loss": 0.2366, "step": 598 }, { "epoch": 1.31, "grad_norm": 0.20510345484208217, "learning_rate": 5.61849808924134e-05, "loss": 0.2445, "step": 599 }, { "epoch": 1.31, "grad_norm": 0.20437222653878181, "learning_rate": 5.5853826073560314e-05, "loss": 0.2283, "step": 600 }, { "epoch": 1.31, "grad_norm": 0.18788556351853072, "learning_rate": 5.5523271478399084e-05, "loss": 0.2264, "step": 601 }, { "epoch": 1.31, "grad_norm": 0.1947377617771126, "learning_rate": 5.5193321601242156e-05, "loss": 0.2369, "step": 602 }, { "epoch": 1.32, "grad_norm": 0.19219173101094308, "learning_rate": 5.48639809281801e-05, "loss": 0.2267, "step": 603 }, { "epoch": 1.32, "grad_norm": 0.21019734225509798, "learning_rate": 5.4535253937020526e-05, "loss": 0.2342, "step": 604 }, { "epoch": 1.32, "grad_norm": 0.19871629669119603, "learning_rate": 5.420714509722729e-05, "loss": 0.2412, "step": 605 }, { "epoch": 1.32, "grad_norm": 0.18878853628731276, "learning_rate": 5.387965886985966e-05, "loss": 0.2125, "step": 606 }, { "epoch": 1.33, "grad_norm": 0.19185339023163583, "learning_rate": 5.355279970751174e-05, "loss": 0.2335, "step": 607 }, { "epoch": 1.33, "grad_norm": 0.19314964556917028, "learning_rate": 5.322657205425183e-05, "loss": 0.2172, "step": 608 }, { "epoch": 1.33, "grad_norm": 0.19994293680338385, "learning_rate": 5.29009803455621e-05, "loss": 0.2315, "step": 609 }, { "epoch": 1.33, "grad_norm": 0.19241810973460463, "learning_rate": 5.257602900827821e-05, "loss": 0.2172, "step": 610 }, { "epoch": 1.33, "grad_norm": 0.22168479611060188, "learning_rate": 5.225172246052914e-05, "loss": 0.2308, "step": 611 }, { "epoch": 1.34, "grad_norm": 0.21457620781731124, "learning_rate": 5.192806511167718e-05, "loss": 0.2531, "step": 612 }, { "epoch": 1.34, "grad_norm": 0.2202429827455574, "learning_rate": 5.160506136225787e-05, "loss": 0.2587, "step": 613 }, { "epoch": 1.34, "grad_norm": 0.1981490531845343, "learning_rate": 5.1282715603920374e-05, "loss": 0.2374, "step": 614 }, { "epoch": 1.34, "grad_norm": 0.20755864391862947, "learning_rate": 5.0961032219367374e-05, "loss": 0.2328, "step": 615 }, { "epoch": 1.35, "grad_norm": 0.2050356684741965, "learning_rate": 5.064001558229591e-05, "loss": 0.234, "step": 616 }, { "epoch": 1.35, "grad_norm": 0.19524780155524296, "learning_rate": 5.03196700573378e-05, "loss": 0.226, "step": 617 }, { "epoch": 1.35, "grad_norm": 0.2207091761071787, "learning_rate": 5.000000000000002e-05, "loss": 0.2284, "step": 618 }, { "epoch": 1.35, "grad_norm": 0.207367533948898, "learning_rate": 4.968100975660582e-05, "loss": 0.2275, "step": 619 }, { "epoch": 1.35, "grad_norm": 0.20040401651948256, "learning_rate": 4.936270366423563e-05, "loss": 0.2349, "step": 620 }, { "epoch": 1.36, "grad_norm": 0.19722408386051055, "learning_rate": 4.904508605066773e-05, "loss": 0.2266, "step": 621 }, { "epoch": 1.36, "grad_norm": 0.21069454400535126, "learning_rate": 4.872816123431977e-05, "loss": 0.2399, "step": 622 }, { "epoch": 1.36, "grad_norm": 0.27442431176210424, "learning_rate": 4.841193352419003e-05, "loss": 0.255, "step": 623 }, { "epoch": 1.36, "grad_norm": 0.21268499575308422, "learning_rate": 4.809640721979855e-05, "loss": 0.2536, "step": 624 }, { "epoch": 1.37, "grad_norm": 0.1958296129113393, "learning_rate": 4.778158661112896e-05, "loss": 0.2227, "step": 625 }, { "epoch": 1.37, "grad_norm": 0.2037287597321259, "learning_rate": 4.746747597857014e-05, "loss": 0.264, "step": 626 }, { "epoch": 1.37, "grad_norm": 0.21857028629384495, "learning_rate": 4.715407959285776e-05, "loss": 0.2539, "step": 627 }, { "epoch": 1.37, "grad_norm": 0.17653319803507914, "learning_rate": 4.6841401715016463e-05, "loss": 0.2173, "step": 628 }, { "epoch": 1.37, "grad_norm": 0.2088118045650298, "learning_rate": 4.6529446596301994e-05, "loss": 0.2181, "step": 629 }, { "epoch": 1.38, "grad_norm": 0.20018277943155008, "learning_rate": 4.6218218478142984e-05, "loss": 0.2451, "step": 630 }, { "epoch": 1.38, "grad_norm": 0.19840613289975464, "learning_rate": 4.590772159208384e-05, "loss": 0.2222, "step": 631 }, { "epoch": 1.38, "grad_norm": 0.1995822171363863, "learning_rate": 4.559796015972677e-05, "loss": 0.241, "step": 632 }, { "epoch": 1.38, "grad_norm": 0.19895719674336662, "learning_rate": 4.52889383926746e-05, "loss": 0.2371, "step": 633 }, { "epoch": 1.39, "grad_norm": 0.18187801681389232, "learning_rate": 4.498066049247344e-05, "loss": 0.2218, "step": 634 }, { "epoch": 1.39, "grad_norm": 0.2177749086338868, "learning_rate": 4.4673130650555605e-05, "loss": 0.2486, "step": 635 }, { "epoch": 1.39, "grad_norm": 0.21347066621586283, "learning_rate": 4.436635304818258e-05, "loss": 0.2262, "step": 636 }, { "epoch": 1.39, "grad_norm": 0.20310412813205128, "learning_rate": 4.40603318563882e-05, "loss": 0.227, "step": 637 }, { "epoch": 1.39, "grad_norm": 0.20257831329266338, "learning_rate": 4.375507123592194e-05, "loss": 0.2071, "step": 638 }, { "epoch": 1.4, "grad_norm": 0.21375954762824598, "learning_rate": 4.34505753371923e-05, "loss": 0.2316, "step": 639 }, { "epoch": 1.4, "grad_norm": 0.21010933001283658, "learning_rate": 4.314684830021046e-05, "loss": 0.2269, "step": 640 }, { "epoch": 1.4, "grad_norm": 0.23866662146097914, "learning_rate": 4.2843894254533904e-05, "loss": 0.2621, "step": 641 }, { "epoch": 1.4, "grad_norm": 0.202396176516729, "learning_rate": 4.2541717319210315e-05, "loss": 0.2229, "step": 642 }, { "epoch": 1.41, "grad_norm": 0.2114385200169175, "learning_rate": 4.224032160272159e-05, "loss": 0.2244, "step": 643 }, { "epoch": 1.41, "grad_norm": 0.19545378343350944, "learning_rate": 4.1939711202927936e-05, "loss": 0.2084, "step": 644 }, { "epoch": 1.41, "grad_norm": 0.2037569083756898, "learning_rate": 4.1639890207012176e-05, "loss": 0.2336, "step": 645 }, { "epoch": 1.41, "grad_norm": 0.21516998837226808, "learning_rate": 4.1340862691424184e-05, "loss": 0.2236, "step": 646 }, { "epoch": 1.41, "grad_norm": 0.20361533038162538, "learning_rate": 4.104263272182546e-05, "loss": 0.2298, "step": 647 }, { "epoch": 1.42, "grad_norm": 0.2299271783012964, "learning_rate": 4.074520435303383e-05, "loss": 0.2436, "step": 648 }, { "epoch": 1.42, "grad_norm": 0.20022559518426747, "learning_rate": 4.044858162896834e-05, "loss": 0.2399, "step": 649 }, { "epoch": 1.42, "grad_norm": 0.212614510340799, "learning_rate": 4.015276858259427e-05, "loss": 0.2287, "step": 650 }, { "epoch": 1.42, "grad_norm": 0.20401412514620756, "learning_rate": 3.985776923586829e-05, "loss": 0.2314, "step": 651 }, { "epoch": 1.43, "grad_norm": 0.1987739253038705, "learning_rate": 3.9563587599683795e-05, "loss": 0.2133, "step": 652 }, { "epoch": 1.43, "grad_norm": 0.18748268405111385, "learning_rate": 3.927022767381634e-05, "loss": 0.2099, "step": 653 }, { "epoch": 1.43, "grad_norm": 0.19769569764762104, "learning_rate": 3.897769344686929e-05, "loss": 0.2294, "step": 654 }, { "epoch": 1.43, "grad_norm": 0.23235953320355546, "learning_rate": 3.868598889621957e-05, "loss": 0.2433, "step": 655 }, { "epoch": 1.43, "grad_norm": 0.19438645171196242, "learning_rate": 3.839511798796357e-05, "loss": 0.2317, "step": 656 }, { "epoch": 1.44, "grad_norm": 0.23669503631409483, "learning_rate": 3.810508467686337e-05, "loss": 0.2455, "step": 657 }, { "epoch": 1.44, "grad_norm": 0.21634566116480686, "learning_rate": 3.781589290629266e-05, "loss": 0.2343, "step": 658 }, { "epoch": 1.44, "grad_norm": 0.20952846432796216, "learning_rate": 3.752754660818339e-05, "loss": 0.2135, "step": 659 }, { "epoch": 1.44, "grad_norm": 0.20360318165711974, "learning_rate": 3.7240049702972345e-05, "loss": 0.2068, "step": 660 }, { "epoch": 1.45, "grad_norm": 0.21078292163104015, "learning_rate": 3.6953406099547514e-05, "loss": 0.2365, "step": 661 }, { "epoch": 1.45, "grad_norm": 0.1948997332089136, "learning_rate": 3.6667619695195285e-05, "loss": 0.2232, "step": 662 }, { "epoch": 1.45, "grad_norm": 0.20947930741143364, "learning_rate": 3.638269437554741e-05, "loss": 0.2324, "step": 663 }, { "epoch": 1.45, "grad_norm": 0.2012585295919145, "learning_rate": 3.609863401452787e-05, "loss": 0.2135, "step": 664 }, { "epoch": 1.45, "grad_norm": 0.2118635337112509, "learning_rate": 3.581544247430056e-05, "loss": 0.2296, "step": 665 }, { "epoch": 1.46, "grad_norm": 0.20679430628191067, "learning_rate": 3.553312360521675e-05, "loss": 0.2228, "step": 666 }, { "epoch": 1.46, "grad_norm": 0.20244338222777755, "learning_rate": 3.525168124576239e-05, "loss": 0.2214, "step": 667 }, { "epoch": 1.46, "grad_norm": 0.21511489923748392, "learning_rate": 3.49711192225063e-05, "loss": 0.2461, "step": 668 }, { "epoch": 1.46, "grad_norm": 0.20512373280874766, "learning_rate": 3.469144135004809e-05, "loss": 0.224, "step": 669 }, { "epoch": 1.47, "grad_norm": 0.2298619558228232, "learning_rate": 3.4412651430966e-05, "loss": 0.2572, "step": 670 }, { "epoch": 1.47, "grad_norm": 0.19862016808278388, "learning_rate": 3.413475325576554e-05, "loss": 0.2403, "step": 671 }, { "epoch": 1.47, "grad_norm": 0.22555024826650924, "learning_rate": 3.385775060282783e-05, "loss": 0.2282, "step": 672 }, { "epoch": 1.47, "grad_norm": 0.19215563610758657, "learning_rate": 3.3581647238358215e-05, "loss": 0.2144, "step": 673 }, { "epoch": 1.47, "grad_norm": 0.20476097833745566, "learning_rate": 3.330644691633492e-05, "loss": 0.225, "step": 674 }, { "epoch": 1.48, "grad_norm": 0.19750951099732988, "learning_rate": 3.3032153378458376e-05, "loss": 0.2287, "step": 675 }, { "epoch": 1.48, "grad_norm": 0.21089379422961432, "learning_rate": 3.275877035410002e-05, "loss": 0.2321, "step": 676 }, { "epoch": 1.48, "grad_norm": 0.2078264905766209, "learning_rate": 3.248630156025158e-05, "loss": 0.2381, "step": 677 }, { "epoch": 1.48, "grad_norm": 0.2179758901349511, "learning_rate": 3.2214750701474874e-05, "loss": 0.2483, "step": 678 }, { "epoch": 1.48, "eval_loss": 0.2440394163131714, "eval_runtime": 174.4083, "eval_samples_per_second": 13.319, "eval_steps_per_second": 0.419, "step": 678 }, { "epoch": 1.49, "grad_norm": 0.2131438455471886, "learning_rate": 3.194412146985106e-05, "loss": 0.2348, "step": 679 }, { "epoch": 1.49, "grad_norm": 0.20229002928013867, "learning_rate": 3.167441754493066e-05, "loss": 0.2186, "step": 680 }, { "epoch": 1.49, "grad_norm": 0.20234425592543342, "learning_rate": 3.1405642593683425e-05, "loss": 0.2273, "step": 681 }, { "epoch": 1.49, "grad_norm": 0.22859532849176858, "learning_rate": 3.113780027044856e-05, "loss": 0.2254, "step": 682 }, { "epoch": 1.49, "grad_norm": 0.20559865195742957, "learning_rate": 3.0870894216884995e-05, "loss": 0.2291, "step": 683 }, { "epoch": 1.5, "grad_norm": 0.21685988567299588, "learning_rate": 3.060492806192185e-05, "loss": 0.2164, "step": 684 }, { "epoch": 1.5, "grad_norm": 0.19802233527020463, "learning_rate": 3.0339905421709137e-05, "loss": 0.234, "step": 685 }, { "epoch": 1.5, "grad_norm": 0.19515973792765512, "learning_rate": 3.0075829899568597e-05, "loss": 0.2232, "step": 686 }, { "epoch": 1.5, "grad_norm": 0.19173680197953472, "learning_rate": 2.981270508594467e-05, "loss": 0.2308, "step": 687 }, { "epoch": 1.5, "grad_norm": 0.19991596550087715, "learning_rate": 2.955053455835568e-05, "loss": 0.218, "step": 688 }, { "epoch": 1.51, "grad_norm": 0.18665703388000335, "learning_rate": 2.9289321881345254e-05, "loss": 0.2366, "step": 689 }, { "epoch": 1.51, "grad_norm": 0.18942165113318024, "learning_rate": 2.9029070606433785e-05, "loss": 0.2132, "step": 690 }, { "epoch": 1.51, "grad_norm": 0.20485337976012102, "learning_rate": 2.8769784272070188e-05, "loss": 0.2335, "step": 691 }, { "epoch": 1.51, "grad_norm": 0.20246216943076425, "learning_rate": 2.8511466403583766e-05, "loss": 0.2352, "step": 692 }, { "epoch": 1.52, "grad_norm": 0.19311011267813682, "learning_rate": 2.8254120513136296e-05, "loss": 0.2067, "step": 693 }, { "epoch": 1.52, "grad_norm": 0.210224208337888, "learning_rate": 2.799775009967428e-05, "loss": 0.2333, "step": 694 }, { "epoch": 1.52, "grad_norm": 0.21970393594166632, "learning_rate": 2.7742358648881352e-05, "loss": 0.229, "step": 695 }, { "epoch": 1.52, "grad_norm": 0.19295096373345785, "learning_rate": 2.7487949633130882e-05, "loss": 0.2204, "step": 696 }, { "epoch": 1.52, "grad_norm": 0.19297425613578706, "learning_rate": 2.7234526511438818e-05, "loss": 0.2307, "step": 697 }, { "epoch": 1.53, "grad_norm": 0.21270027750348675, "learning_rate": 2.6982092729416587e-05, "loss": 0.2304, "step": 698 }, { "epoch": 1.53, "grad_norm": 0.1821212153306606, "learning_rate": 2.6730651719224285e-05, "loss": 0.2217, "step": 699 }, { "epoch": 1.53, "grad_norm": 0.20641341779566383, "learning_rate": 2.6480206899524007e-05, "loss": 0.2297, "step": 700 }, { "epoch": 1.53, "grad_norm": 0.19895365786093475, "learning_rate": 2.623076167543337e-05, "loss": 0.2294, "step": 701 }, { "epoch": 1.54, "grad_norm": 0.211723910335701, "learning_rate": 2.5982319438479164e-05, "loss": 0.2279, "step": 702 }, { "epoch": 1.54, "grad_norm": 0.20694547351763354, "learning_rate": 2.5734883566551414e-05, "loss": 0.2176, "step": 703 }, { "epoch": 1.54, "grad_norm": 0.20794623990405123, "learning_rate": 2.548845742385717e-05, "loss": 0.241, "step": 704 }, { "epoch": 1.54, "grad_norm": 0.18729243114097596, "learning_rate": 2.5243044360874968e-05, "loss": 0.2268, "step": 705 }, { "epoch": 1.54, "grad_norm": 0.2096570854756017, "learning_rate": 2.4998647714309342e-05, "loss": 0.2233, "step": 706 }, { "epoch": 1.55, "grad_norm": 0.20227926508122493, "learning_rate": 2.4755270807045174e-05, "loss": 0.2287, "step": 707 }, { "epoch": 1.55, "grad_norm": 0.20682124517055636, "learning_rate": 2.451291694810275e-05, "loss": 0.2315, "step": 708 }, { "epoch": 1.55, "grad_norm": 0.19809452439226158, "learning_rate": 2.42715894325928e-05, "loss": 0.2219, "step": 709 }, { "epoch": 1.55, "grad_norm": 0.2095130134600125, "learning_rate": 2.403129154167153e-05, "loss": 0.2291, "step": 710 }, { "epoch": 1.56, "grad_norm": 0.20788084999909145, "learning_rate": 2.379202654249596e-05, "loss": 0.2428, "step": 711 }, { "epoch": 1.56, "grad_norm": 0.2172405068460128, "learning_rate": 2.3553797688179856e-05, "loss": 0.2372, "step": 712 }, { "epoch": 1.56, "grad_norm": 0.19461394318035657, "learning_rate": 2.3316608217749147e-05, "loss": 0.2145, "step": 713 }, { "epoch": 1.56, "grad_norm": 0.18940028210697493, "learning_rate": 2.3080461356097937e-05, "loss": 0.221, "step": 714 }, { "epoch": 1.56, "grad_norm": 0.1994412806184569, "learning_rate": 2.2845360313944885e-05, "loss": 0.2192, "step": 715 }, { "epoch": 1.57, "grad_norm": 0.22225917578862417, "learning_rate": 2.2611308287789344e-05, "loss": 0.2326, "step": 716 }, { "epoch": 1.57, "grad_norm": 0.22182577419896116, "learning_rate": 2.2378308459867825e-05, "loss": 0.2233, "step": 717 }, { "epoch": 1.57, "grad_norm": 0.20974195726874859, "learning_rate": 2.2146363998111076e-05, "loss": 0.2075, "step": 718 }, { "epoch": 1.57, "grad_norm": 0.20126278684107707, "learning_rate": 2.191547805610066e-05, "loss": 0.2272, "step": 719 }, { "epoch": 1.58, "grad_norm": 0.2115414591515168, "learning_rate": 2.168565377302615e-05, "loss": 0.2011, "step": 720 }, { "epoch": 1.58, "grad_norm": 0.22823267316076018, "learning_rate": 2.145689427364268e-05, "loss": 0.2346, "step": 721 }, { "epoch": 1.58, "grad_norm": 0.20563465172572046, "learning_rate": 2.1229202668228197e-05, "loss": 0.223, "step": 722 }, { "epoch": 1.58, "grad_norm": 0.2125772917653009, "learning_rate": 2.1002582052541188e-05, "loss": 0.2351, "step": 723 }, { "epoch": 1.58, "grad_norm": 0.20342137649326203, "learning_rate": 2.077703550777882e-05, "loss": 0.2214, "step": 724 }, { "epoch": 1.59, "grad_norm": 0.2182233891977982, "learning_rate": 2.055256610053479e-05, "loss": 0.2355, "step": 725 }, { "epoch": 1.59, "grad_norm": 0.20932056445296426, "learning_rate": 2.032917688275774e-05, "loss": 0.233, "step": 726 }, { "epoch": 1.59, "grad_norm": 0.19761955501105657, "learning_rate": 2.0106870891709795e-05, "loss": 0.2166, "step": 727 }, { "epoch": 1.59, "grad_norm": 0.20268753517625793, "learning_rate": 1.988565114992519e-05, "loss": 0.2322, "step": 728 }, { "epoch": 1.6, "grad_norm": 0.21415195057982833, "learning_rate": 1.9665520665169235e-05, "loss": 0.2333, "step": 729 }, { "epoch": 1.6, "grad_norm": 0.18762518756686802, "learning_rate": 1.9446482430397385e-05, "loss": 0.2183, "step": 730 }, { "epoch": 1.6, "grad_norm": 0.17327672319544696, "learning_rate": 1.9228539423714565e-05, "loss": 0.1868, "step": 731 }, { "epoch": 1.6, "grad_norm": 0.20722565430523956, "learning_rate": 1.9011694608334675e-05, "loss": 0.2233, "step": 732 }, { "epoch": 1.6, "grad_norm": 0.21924150970052983, "learning_rate": 1.8795950932540284e-05, "loss": 0.2208, "step": 733 }, { "epoch": 1.61, "grad_norm": 0.20907553351335467, "learning_rate": 1.858131132964259e-05, "loss": 0.2223, "step": 734 }, { "epoch": 1.61, "grad_norm": 0.19056857008632616, "learning_rate": 1.8367778717941474e-05, "loss": 0.2101, "step": 735 }, { "epoch": 1.61, "grad_norm": 0.21084097137078958, "learning_rate": 1.8155356000685895e-05, "loss": 0.2453, "step": 736 }, { "epoch": 1.61, "grad_norm": 0.20113307615771472, "learning_rate": 1.794404606603434e-05, "loss": 0.2285, "step": 737 }, { "epoch": 1.62, "grad_norm": 0.1977232826916787, "learning_rate": 1.773385178701561e-05, "loss": 0.2066, "step": 738 }, { "epoch": 1.62, "grad_norm": 0.22423101983076504, "learning_rate": 1.752477602148973e-05, "loss": 0.2358, "step": 739 }, { "epoch": 1.62, "grad_norm": 0.1879155996937992, "learning_rate": 1.7316821612109136e-05, "loss": 0.2272, "step": 740 }, { "epoch": 1.62, "grad_norm": 0.19578435167746402, "learning_rate": 1.7109991386279966e-05, "loss": 0.2179, "step": 741 }, { "epoch": 1.62, "grad_norm": 0.19857068398335803, "learning_rate": 1.6904288156123636e-05, "loss": 0.2261, "step": 742 }, { "epoch": 1.63, "grad_norm": 0.20885572306052286, "learning_rate": 1.6699714718438642e-05, "loss": 0.2323, "step": 743 }, { "epoch": 1.63, "grad_norm": 0.20555096566027478, "learning_rate": 1.649627385466248e-05, "loss": 0.2159, "step": 744 }, { "epoch": 1.63, "grad_norm": 0.22223342099052376, "learning_rate": 1.6293968330833874e-05, "loss": 0.2445, "step": 745 }, { "epoch": 1.63, "grad_norm": 0.20268806818030535, "learning_rate": 1.609280089755515e-05, "loss": 0.2336, "step": 746 }, { "epoch": 1.64, "grad_norm": 0.21879721029918314, "learning_rate": 1.5892774289954814e-05, "loss": 0.2326, "step": 747 }, { "epoch": 1.64, "grad_norm": 0.1979735202861903, "learning_rate": 1.5693891227650426e-05, "loss": 0.2444, "step": 748 }, { "epoch": 1.64, "grad_norm": 0.19636639452399096, "learning_rate": 1.549615441471153e-05, "loss": 0.2229, "step": 749 }, { "epoch": 1.64, "grad_norm": 0.19526467691493388, "learning_rate": 1.5299566539623046e-05, "loss": 0.2212, "step": 750 }, { "epoch": 1.64, "grad_norm": 0.21208951867669978, "learning_rate": 1.5104130275248452e-05, "loss": 0.2186, "step": 751 }, { "epoch": 1.65, "grad_norm": 0.20045769407509645, "learning_rate": 1.4909848278793782e-05, "loss": 0.2129, "step": 752 }, { "epoch": 1.65, "grad_norm": 0.2147971015918436, "learning_rate": 1.4716723191771231e-05, "loss": 0.2262, "step": 753 }, { "epoch": 1.65, "grad_norm": 0.206242119031671, "learning_rate": 1.452475763996326e-05, "loss": 0.2236, "step": 754 }, { "epoch": 1.65, "grad_norm": 0.20912939904357944, "learning_rate": 1.4333954233387136e-05, "loss": 0.2123, "step": 755 }, { "epoch": 1.66, "grad_norm": 0.20917641203494783, "learning_rate": 1.414431556625917e-05, "loss": 0.2104, "step": 756 }, { "epoch": 1.66, "grad_norm": 0.2084574875835098, "learning_rate": 1.3955844216959502e-05, "loss": 0.2313, "step": 757 }, { "epoch": 1.66, "grad_norm": 0.23628135092170213, "learning_rate": 1.3768542747997215e-05, "loss": 0.2307, "step": 758 }, { "epoch": 1.66, "grad_norm": 0.21532128754886334, "learning_rate": 1.3582413705975327e-05, "loss": 0.229, "step": 759 }, { "epoch": 1.66, "grad_norm": 0.19053700675294885, "learning_rate": 1.339745962155613e-05, "loss": 0.2185, "step": 760 }, { "epoch": 1.67, "grad_norm": 0.20258235891664356, "learning_rate": 1.3213683009427003e-05, "loss": 0.2267, "step": 761 }, { "epoch": 1.67, "grad_norm": 0.20530653449904004, "learning_rate": 1.3031086368265998e-05, "loss": 0.24, "step": 762 }, { "epoch": 1.67, "grad_norm": 0.2065814367954602, "learning_rate": 1.2849672180707928e-05, "loss": 0.2405, "step": 763 }, { "epoch": 1.67, "grad_norm": 0.20567628728139853, "learning_rate": 1.2669442913310725e-05, "loss": 0.2144, "step": 764 }, { "epoch": 1.68, "grad_norm": 0.21176419914000316, "learning_rate": 1.249040101652178e-05, "loss": 0.2173, "step": 765 }, { "epoch": 1.68, "grad_norm": 0.21884981500734577, "learning_rate": 1.2312548924644585e-05, "loss": 0.2405, "step": 766 }, { "epoch": 1.68, "grad_norm": 0.19687803356921588, "learning_rate": 1.2135889055805837e-05, "loss": 0.2204, "step": 767 }, { "epoch": 1.68, "grad_norm": 0.20109138915685001, "learning_rate": 1.196042381192236e-05, "loss": 0.2297, "step": 768 }, { "epoch": 1.68, "grad_norm": 0.19234466174457304, "learning_rate": 1.1786155578668545e-05, "loss": 0.2258, "step": 769 }, { "epoch": 1.69, "grad_norm": 0.1978258826111612, "learning_rate": 1.161308672544389e-05, "loss": 0.2038, "step": 770 }, { "epoch": 1.69, "grad_norm": 0.21048922943923923, "learning_rate": 1.1441219605340792e-05, "loss": 0.2283, "step": 771 }, { "epoch": 1.69, "grad_norm": 0.22257127311445948, "learning_rate": 1.1270556555112532e-05, "loss": 0.2283, "step": 772 }, { "epoch": 1.69, "grad_norm": 0.19149094284354706, "learning_rate": 1.1101099895141542e-05, "loss": 0.215, "step": 773 }, { "epoch": 1.7, "grad_norm": 0.20549949771292994, "learning_rate": 1.0932851929407827e-05, "loss": 0.2385, "step": 774 }, { "epoch": 1.7, "grad_norm": 0.1758290014045967, "learning_rate": 1.0765814945457653e-05, "loss": 0.1882, "step": 775 }, { "epoch": 1.7, "grad_norm": 0.1898211512831452, "learning_rate": 1.059999121437244e-05, "loss": 0.2049, "step": 776 }, { "epoch": 1.7, "grad_norm": 0.20222538943850363, "learning_rate": 1.0435382990737853e-05, "loss": 0.2175, "step": 777 }, { "epoch": 1.7, "grad_norm": 0.22196981794940074, "learning_rate": 1.0271992512613237e-05, "loss": 0.2463, "step": 778 }, { "epoch": 1.71, "grad_norm": 0.2027360971190236, "learning_rate": 1.0109822001501079e-05, "loss": 0.2145, "step": 779 }, { "epoch": 1.71, "grad_norm": 0.20140001638042115, "learning_rate": 9.948873662316883e-06, "loss": 0.2106, "step": 780 }, { "epoch": 1.71, "grad_norm": 0.21591974789953125, "learning_rate": 9.789149683359144e-06, "loss": 0.2421, "step": 781 }, { "epoch": 1.71, "grad_norm": 0.20965313194879998, "learning_rate": 9.630652236279625e-06, "loss": 0.2313, "step": 782 }, { "epoch": 1.72, "grad_norm": 0.1989390148317295, "learning_rate": 9.473383476053821e-06, "loss": 0.2319, "step": 783 }, { "epoch": 1.72, "grad_norm": 0.17721856904495678, "learning_rate": 9.31734554095165e-06, "loss": 0.2086, "step": 784 }, { "epoch": 1.72, "grad_norm": 0.18710003353683868, "learning_rate": 9.162540552508392e-06, "loss": 0.2135, "step": 785 }, { "epoch": 1.72, "grad_norm": 0.19769078883988017, "learning_rate": 9.008970615495826e-06, "loss": 0.2203, "step": 786 }, { "epoch": 1.72, "grad_norm": 0.19064162760273085, "learning_rate": 8.856637817893688e-06, "loss": 0.2022, "step": 787 }, { "epoch": 1.73, "grad_norm": 0.1926418728967082, "learning_rate": 8.70554423086114e-06, "loss": 0.1937, "step": 788 }, { "epoch": 1.73, "grad_norm": 0.21141489190301602, "learning_rate": 8.555691908708707e-06, "loss": 0.2266, "step": 789 }, { "epoch": 1.73, "grad_norm": 0.20876156881963107, "learning_rate": 8.40708288887042e-06, "loss": 0.2343, "step": 790 }, { "epoch": 1.73, "grad_norm": 0.18416645127890868, "learning_rate": 8.259719191875892e-06, "loss": 0.1982, "step": 791 }, { "epoch": 1.73, "eval_loss": 0.23803040385246277, "eval_runtime": 174.4554, "eval_samples_per_second": 13.316, "eval_steps_per_second": 0.418, "step": 791 }, { "epoch": 1.74, "grad_norm": 0.2040866955904897, "learning_rate": 8.113602821323041e-06, "loss": 0.2136, "step": 792 }, { "epoch": 1.74, "grad_norm": 0.1982756149597501, "learning_rate": 7.968735763850854e-06, "loss": 0.2167, "step": 793 }, { "epoch": 1.74, "grad_norm": 0.21789914937542643, "learning_rate": 7.825119989112173e-06, "loss": 0.2397, "step": 794 }, { "epoch": 1.74, "grad_norm": 0.2147093525916075, "learning_rate": 7.682757449747135e-06, "loss": 0.2358, "step": 795 }, { "epoch": 1.74, "grad_norm": 0.20816472900077765, "learning_rate": 7.541650081356588e-06, "loss": 0.2234, "step": 796 }, { "epoch": 1.75, "grad_norm": 0.21148961713513456, "learning_rate": 7.401799802475573e-06, "loss": 0.2319, "step": 797 }, { "epoch": 1.75, "grad_norm": 0.1956708576913421, "learning_rate": 7.263208514547548e-06, "loss": 0.2137, "step": 798 }, { "epoch": 1.75, "grad_norm": 0.20721619820091447, "learning_rate": 7.125878101898298e-06, "loss": 0.2188, "step": 799 }, { "epoch": 1.75, "grad_norm": 0.22116900534807907, "learning_rate": 6.989810431710375e-06, "loss": 0.2267, "step": 800 }, { "epoch": 1.76, "grad_norm": 0.19978803749852295, "learning_rate": 6.855007353997822e-06, "loss": 0.2101, "step": 801 }, { "epoch": 1.76, "grad_norm": 0.21381752099366957, "learning_rate": 6.7214707015808565e-06, "loss": 0.2261, "step": 802 }, { "epoch": 1.76, "grad_norm": 0.19219567545117702, "learning_rate": 6.589202290061014e-06, "loss": 0.2034, "step": 803 }, { "epoch": 1.76, "grad_norm": 0.18779844136231152, "learning_rate": 6.4582039177965455e-06, "loss": 0.2061, "step": 804 }, { "epoch": 1.76, "grad_norm": 0.20174766286042234, "learning_rate": 6.328477365877849e-06, "loss": 0.2233, "step": 805 }, { "epoch": 1.77, "grad_norm": 0.21231582457977347, "learning_rate": 6.200024398103255e-06, "loss": 0.2128, "step": 806 }, { "epoch": 1.77, "grad_norm": 0.18391901989584938, "learning_rate": 6.072846760955186e-06, "loss": 0.1992, "step": 807 }, { "epoch": 1.77, "grad_norm": 0.20294386783873802, "learning_rate": 5.946946183576241e-06, "loss": 0.222, "step": 808 }, { "epoch": 1.77, "grad_norm": 0.20469202298707184, "learning_rate": 5.822324377745791e-06, "loss": 0.2235, "step": 809 }, { "epoch": 1.78, "grad_norm": 0.22515194087302773, "learning_rate": 5.698983037856665e-06, "loss": 0.2438, "step": 810 }, { "epoch": 1.78, "grad_norm": 0.20643509196206, "learning_rate": 5.57692384089209e-06, "loss": 0.2131, "step": 811 }, { "epoch": 1.78, "grad_norm": 0.2108231693828425, "learning_rate": 5.456148446402976e-06, "loss": 0.2078, "step": 812 }, { "epoch": 1.78, "grad_norm": 0.21597167356066369, "learning_rate": 5.336658496485248e-06, "loss": 0.2222, "step": 813 }, { "epoch": 1.78, "grad_norm": 0.1933627658122827, "learning_rate": 5.2184556157576e-06, "loss": 0.2157, "step": 814 }, { "epoch": 1.79, "grad_norm": 0.22222814897657234, "learning_rate": 5.10154141133935e-06, "loss": 0.2533, "step": 815 }, { "epoch": 1.79, "grad_norm": 0.20557125384851288, "learning_rate": 4.985917472828639e-06, "loss": 0.2264, "step": 816 }, { "epoch": 1.79, "grad_norm": 0.21870759144189225, "learning_rate": 4.871585372280774e-06, "loss": 0.2208, "step": 817 }, { "epoch": 1.79, "grad_norm": 0.20383293324564078, "learning_rate": 4.758546664186869e-06, "loss": 0.2128, "step": 818 }, { "epoch": 1.8, "grad_norm": 0.2207651135648432, "learning_rate": 4.646802885452739e-06, "loss": 0.2331, "step": 819 }, { "epoch": 1.8, "grad_norm": 0.20935706303755758, "learning_rate": 4.536355555377947e-06, "loss": 0.2056, "step": 820 }, { "epoch": 1.8, "grad_norm": 0.19500728601207568, "learning_rate": 4.427206175635202e-06, "loss": 0.2216, "step": 821 }, { "epoch": 1.8, "grad_norm": 0.1831341530266785, "learning_rate": 4.319356230249905e-06, "loss": 0.2185, "step": 822 }, { "epoch": 1.8, "grad_norm": 0.20328065520250121, "learning_rate": 4.212807185579992e-06, "loss": 0.22, "step": 823 }, { "epoch": 1.81, "grad_norm": 0.197367783334187, "learning_rate": 4.107560490295992e-06, "loss": 0.2088, "step": 824 }, { "epoch": 1.81, "grad_norm": 0.199986101949123, "learning_rate": 4.003617575361318e-06, "loss": 0.2252, "step": 825 }, { "epoch": 1.81, "grad_norm": 0.20378685375125896, "learning_rate": 3.90097985401282e-06, "loss": 0.2177, "step": 826 }, { "epoch": 1.81, "grad_norm": 0.2089863714675373, "learning_rate": 3.7996487217416223e-06, "loss": 0.2281, "step": 827 }, { "epoch": 1.82, "grad_norm": 0.22174346947327125, "learning_rate": 3.699625556274e-06, "loss": 0.2298, "step": 828 }, { "epoch": 1.82, "grad_norm": 0.20182626270268705, "learning_rate": 3.600911717552824e-06, "loss": 0.2239, "step": 829 }, { "epoch": 1.82, "grad_norm": 0.20313048355330246, "learning_rate": 3.5035085477190143e-06, "loss": 0.225, "step": 830 }, { "epoch": 1.82, "grad_norm": 0.2123857689013769, "learning_rate": 3.40741737109318e-06, "loss": 0.2285, "step": 831 }, { "epoch": 1.82, "grad_norm": 0.20877332815435723, "learning_rate": 3.3126394941577656e-06, "loss": 0.2179, "step": 832 }, { "epoch": 1.83, "grad_norm": 0.20261893136881134, "learning_rate": 3.2191762055392428e-06, "loss": 0.2053, "step": 833 }, { "epoch": 1.83, "grad_norm": 0.2165479347703477, "learning_rate": 3.127028775990515e-06, "loss": 0.234, "step": 834 }, { "epoch": 1.83, "grad_norm": 0.20854006087046636, "learning_rate": 3.03619845837374e-06, "loss": 0.2194, "step": 835 }, { "epoch": 1.83, "grad_norm": 0.20839858622666318, "learning_rate": 2.94668648764328e-06, "loss": 0.2011, "step": 836 }, { "epoch": 1.83, "grad_norm": 0.2001464877253074, "learning_rate": 2.8584940808288107e-06, "loss": 0.2074, "step": 837 }, { "epoch": 1.84, "grad_norm": 0.2003840882274367, "learning_rate": 2.7716224370188836e-06, "loss": 0.2102, "step": 838 }, { "epoch": 1.84, "grad_norm": 0.18643967510262804, "learning_rate": 2.6860727373446472e-06, "loss": 0.2039, "step": 839 }, { "epoch": 1.84, "grad_norm": 0.21745550785786122, "learning_rate": 2.6018461449636265e-06, "loss": 0.2225, "step": 840 }, { "epoch": 1.84, "grad_norm": 0.22552267236084578, "learning_rate": 2.5189438050440607e-06, "loss": 0.233, "step": 841 }, { "epoch": 1.85, "grad_norm": 0.21764248387495064, "learning_rate": 2.4373668447493224e-06, "loss": 0.2214, "step": 842 }, { "epoch": 1.85, "grad_norm": 0.1980950415461294, "learning_rate": 2.357116373222479e-06, "loss": 0.2098, "step": 843 }, { "epoch": 1.85, "grad_norm": 0.1880075122855425, "learning_rate": 2.2781934815713225e-06, "loss": 0.216, "step": 844 }, { "epoch": 1.85, "grad_norm": 0.21062925563667326, "learning_rate": 2.2005992428535184e-06, "loss": 0.2247, "step": 845 }, { "epoch": 1.85, "grad_norm": 0.2119014226422388, "learning_rate": 2.1243347120619595e-06, "loss": 0.2329, "step": 846 }, { "epoch": 1.86, "grad_norm": 0.20411445722959737, "learning_rate": 2.0494009261104785e-06, "loss": 0.1941, "step": 847 }, { "epoch": 1.86, "grad_norm": 0.19915577884332822, "learning_rate": 1.9757989038197146e-06, "loss": 0.2151, "step": 848 }, { "epoch": 1.86, "grad_norm": 0.21265487566748392, "learning_rate": 1.9035296459033125e-06, "loss": 0.2401, "step": 849 }, { "epoch": 1.86, "grad_norm": 0.1996989913105502, "learning_rate": 1.8325941349542353e-06, "loss": 0.2146, "step": 850 }, { "epoch": 1.87, "grad_norm": 0.1933371213186986, "learning_rate": 1.7629933354314733e-06, "loss": 0.2002, "step": 851 }, { "epoch": 1.87, "grad_norm": 0.20045617018763978, "learning_rate": 1.6947281936469218e-06, "loss": 0.2293, "step": 852 }, { "epoch": 1.87, "grad_norm": 0.19920942808294725, "learning_rate": 1.6277996377524695e-06, "loss": 0.2183, "step": 853 }, { "epoch": 1.87, "grad_norm": 0.1936935465614516, "learning_rate": 1.562208577727442e-06, "loss": 0.1957, "step": 854 }, { "epoch": 1.87, "grad_norm": 0.18683523311557979, "learning_rate": 1.4979559053661663e-06, "loss": 0.2009, "step": 855 }, { "epoch": 1.88, "grad_norm": 0.18917173941955473, "learning_rate": 1.4350424942659146e-06, "loss": 0.2109, "step": 856 }, { "epoch": 1.88, "grad_norm": 0.19682753422160834, "learning_rate": 1.3734691998149474e-06, "loss": 0.2088, "step": 857 }, { "epoch": 1.88, "grad_norm": 0.2048854194962544, "learning_rate": 1.3132368591809552e-06, "loss": 0.2291, "step": 858 }, { "epoch": 1.88, "grad_norm": 0.20142459735505794, "learning_rate": 1.2543462912996463e-06, "loss": 0.2278, "step": 859 }, { "epoch": 1.89, "grad_norm": 0.18935162489689705, "learning_rate": 1.1967982968635993e-06, "loss": 0.2113, "step": 860 }, { "epoch": 1.89, "grad_norm": 0.20159497258222603, "learning_rate": 1.1405936583113841e-06, "loss": 0.2266, "step": 861 }, { "epoch": 1.89, "grad_norm": 0.19452768000808426, "learning_rate": 1.0857331398169579e-06, "loss": 0.2053, "step": 862 }, { "epoch": 1.89, "grad_norm": 0.2014716516584308, "learning_rate": 1.032217487279219e-06, "loss": 0.1975, "step": 863 }, { "epoch": 1.89, "grad_norm": 0.19846952960822706, "learning_rate": 9.800474283119144e-07, "loss": 0.2062, "step": 864 }, { "epoch": 1.9, "grad_norm": 0.20327559720564944, "learning_rate": 9.292236722337033e-07, "loss": 0.2108, "step": 865 }, { "epoch": 1.9, "grad_norm": 0.19290238138729213, "learning_rate": 8.797469100585431e-07, "loss": 0.2039, "step": 866 }, { "epoch": 1.9, "grad_norm": 0.21287614440666355, "learning_rate": 8.316178144862963e-07, "loss": 0.2307, "step": 867 }, { "epoch": 1.9, "grad_norm": 0.22567801639890173, "learning_rate": 7.848370398935711e-07, "loss": 0.2573, "step": 868 }, { "epoch": 1.91, "grad_norm": 0.21560397460264963, "learning_rate": 7.394052223248182e-07, "loss": 0.2241, "step": 869 }, { "epoch": 1.91, "grad_norm": 0.1957878241687257, "learning_rate": 6.953229794837146e-07, "loss": 0.2193, "step": 870 }, { "epoch": 1.91, "grad_norm": 0.20291967284647494, "learning_rate": 6.525909107247041e-07, "loss": 0.2064, "step": 871 }, { "epoch": 1.91, "grad_norm": 0.18847141357662414, "learning_rate": 6.11209597044926e-07, "loss": 0.2105, "step": 872 }, { "epoch": 1.91, "grad_norm": 0.2044881782029401, "learning_rate": 5.71179601076266e-07, "loss": 0.2177, "step": 873 }, { "epoch": 1.92, "grad_norm": 0.21142542004279102, "learning_rate": 5.325014670776951e-07, "loss": 0.233, "step": 874 }, { "epoch": 1.92, "grad_norm": 0.19576617003967908, "learning_rate": 4.951757209279095e-07, "loss": 0.2214, "step": 875 }, { "epoch": 1.92, "grad_norm": 0.20534907108193845, "learning_rate": 4.5920287011818054e-07, "loss": 0.2122, "step": 876 }, { "epoch": 1.92, "grad_norm": 0.2059415081571872, "learning_rate": 4.245834037454155e-07, "loss": 0.213, "step": 877 }, { "epoch": 1.93, "grad_norm": 0.19398152799817778, "learning_rate": 3.913177925055189e-07, "loss": 0.2219, "step": 878 }, { "epoch": 1.93, "grad_norm": 0.2070317988118509, "learning_rate": 3.5940648868703073e-07, "loss": 0.2244, "step": 879 }, { "epoch": 1.93, "grad_norm": 0.8623179589154261, "learning_rate": 3.288499261649314e-07, "loss": 0.2217, "step": 880 }, { "epoch": 1.93, "grad_norm": 0.215386416256104, "learning_rate": 2.9964852039476854e-07, "loss": 0.2229, "step": 881 }, { "epoch": 1.93, "grad_norm": 0.20755620295808855, "learning_rate": 2.718026684070063e-07, "loss": 0.2208, "step": 882 }, { "epoch": 1.94, "grad_norm": 0.20071657749706617, "learning_rate": 2.4531274880160713e-07, "loss": 0.212, "step": 883 }, { "epoch": 1.94, "grad_norm": 0.20451448277112147, "learning_rate": 2.201791217428917e-07, "loss": 0.2293, "step": 884 }, { "epoch": 1.94, "grad_norm": 0.18693359575878427, "learning_rate": 1.964021289546869e-07, "loss": 0.2067, "step": 885 }, { "epoch": 1.94, "grad_norm": 0.1878379319628547, "learning_rate": 1.739820937155967e-07, "loss": 0.2023, "step": 886 }, { "epoch": 1.95, "grad_norm": 0.183214719604713, "learning_rate": 1.5291932085468307e-07, "loss": 0.1914, "step": 887 }, { "epoch": 1.95, "grad_norm": 0.20983709187003516, "learning_rate": 1.3321409674728057e-07, "loss": 0.2458, "step": 888 }, { "epoch": 1.95, "grad_norm": 0.18418752748665865, "learning_rate": 1.1486668931111056e-07, "loss": 0.2111, "step": 889 }, { "epoch": 1.95, "grad_norm": 0.21472529700497753, "learning_rate": 9.78773480026396e-08, "loss": 0.2597, "step": 890 }, { "epoch": 1.95, "grad_norm": 0.19925573417089712, "learning_rate": 8.224630381369335e-08, "loss": 0.2274, "step": 891 }, { "epoch": 1.96, "grad_norm": 0.288190699780251, "learning_rate": 6.797376926829246e-08, "loss": 0.226, "step": 892 }, { "epoch": 1.96, "grad_norm": 0.20512892684223066, "learning_rate": 5.5059938419821377e-08, "loss": 0.209, "step": 893 }, { "epoch": 1.96, "grad_norm": 0.21184481530963395, "learning_rate": 4.350498684829729e-08, "loss": 0.223, "step": 894 }, { "epoch": 1.96, "grad_norm": 0.20194019888677378, "learning_rate": 3.330907165809416e-08, "loss": 0.219, "step": 895 }, { "epoch": 1.97, "grad_norm": 0.21240034431305688, "learning_rate": 2.447233147570005e-08, "loss": 0.2052, "step": 896 }, { "epoch": 1.97, "grad_norm": 0.2164638642279727, "learning_rate": 1.6994886447896376e-08, "loss": 0.2264, "step": 897 }, { "epoch": 1.97, "grad_norm": 0.21008301667873402, "learning_rate": 1.0876838240125863e-08, "loss": 0.2419, "step": 898 }, { "epoch": 1.97, "grad_norm": 0.20392637628490956, "learning_rate": 6.1182700350714825e-09, "loss": 0.2225, "step": 899 }, { "epoch": 1.97, "grad_norm": 0.21551655412638365, "learning_rate": 2.719246531535102e-09, "loss": 0.2395, "step": 900 }, { "epoch": 1.98, "grad_norm": 0.1985114294041871, "learning_rate": 6.798139436159367e-10, "loss": 0.2104, "step": 901 }, { "epoch": 1.98, "grad_norm": 0.2212692948861265, "learning_rate": 0.0, "loss": 0.2445, "step": 902 } ], "logging_steps": 1, "max_steps": 902, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 451, "total_flos": 2.635365103453392e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }