{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988925802879292, "eval_steps": 113, "global_step": 451, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.428594393312656, "learning_rate": 4.000000000000001e-06, "loss": 2.1469, "step": 1 }, { "epoch": 0.0, "eval_loss": 2.179504632949829, "eval_runtime": 172.7557, "eval_samples_per_second": 13.447, "eval_steps_per_second": 0.423, "step": 1 }, { "epoch": 0.0, "grad_norm": 3.7159828982580754, "learning_rate": 8.000000000000001e-06, "loss": 2.1946, "step": 2 }, { "epoch": 0.01, "grad_norm": 3.643082734610657, "learning_rate": 1.2e-05, "loss": 2.232, "step": 3 }, { "epoch": 0.01, "grad_norm": 3.75889245136616, "learning_rate": 1.6000000000000003e-05, "loss": 2.1482, "step": 4 }, { "epoch": 0.01, "grad_norm": 3.121791718587376, "learning_rate": 2e-05, "loss": 1.9094, "step": 5 }, { "epoch": 0.01, "grad_norm": 3.357880867767518, "learning_rate": 2.4e-05, "loss": 1.5871, "step": 6 }, { "epoch": 0.02, "grad_norm": 3.946129682154882, "learning_rate": 2.8000000000000003e-05, "loss": 1.326, "step": 7 }, { "epoch": 0.02, "grad_norm": 2.108395610487625, "learning_rate": 3.2000000000000005e-05, "loss": 1.0602, "step": 8 }, { "epoch": 0.02, "grad_norm": 2.130447473758986, "learning_rate": 3.6e-05, "loss": 0.962, "step": 9 }, { "epoch": 0.02, "grad_norm": 2.250625735269271, "learning_rate": 4e-05, "loss": 0.7239, "step": 10 }, { "epoch": 0.02, "grad_norm": 2.086877732413652, "learning_rate": 4.4000000000000006e-05, "loss": 0.6204, "step": 11 }, { "epoch": 0.03, "grad_norm": 1.1321791358110234, "learning_rate": 4.8e-05, "loss": 0.5483, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.8099243715089002, "learning_rate": 5.2000000000000004e-05, "loss": 0.5086, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.8079844588753853, "learning_rate": 5.6000000000000006e-05, "loss": 0.5112, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.9169759048137539, "learning_rate": 6e-05, "loss": 0.4889, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.6917931287891316, "learning_rate": 6.400000000000001e-05, "loss": 0.4691, "step": 16 }, { "epoch": 0.04, "grad_norm": 0.6294590670553853, "learning_rate": 6.800000000000001e-05, "loss": 0.4549, "step": 17 }, { "epoch": 0.04, "grad_norm": 0.6001819771732335, "learning_rate": 7.2e-05, "loss": 0.4412, "step": 18 }, { "epoch": 0.04, "grad_norm": 0.6389927422614288, "learning_rate": 7.6e-05, "loss": 0.4077, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.5185334799357709, "learning_rate": 8e-05, "loss": 0.4219, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.5274718517549238, "learning_rate": 8.4e-05, "loss": 0.4124, "step": 21 }, { "epoch": 0.05, "grad_norm": 0.5333597700699511, "learning_rate": 8.800000000000001e-05, "loss": 0.4333, "step": 22 }, { "epoch": 0.05, "grad_norm": 0.4817408646893783, "learning_rate": 9.200000000000001e-05, "loss": 0.4022, "step": 23 }, { "epoch": 0.05, "grad_norm": 0.5041871581629092, "learning_rate": 9.6e-05, "loss": 0.3971, "step": 24 }, { "epoch": 0.06, "grad_norm": 0.5892498993207921, "learning_rate": 0.0001, "loss": 0.3769, "step": 25 }, { "epoch": 0.06, "grad_norm": 0.4448471314677223, "learning_rate": 0.00010400000000000001, "loss": 0.3905, "step": 26 }, { "epoch": 0.06, "grad_norm": 0.41460492934076704, "learning_rate": 0.00010800000000000001, "loss": 0.3814, "step": 27 }, { "epoch": 0.06, "grad_norm": 0.39759977947743247, "learning_rate": 0.00011200000000000001, "loss": 0.3675, "step": 28 }, { "epoch": 
0.06, "grad_norm": 0.42656614420804834, "learning_rate": 0.000116, "loss": 0.3949, "step": 29 }, { "epoch": 0.07, "grad_norm": 0.3248894659390531, "learning_rate": 0.00012, "loss": 0.3734, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.35364315198613105, "learning_rate": 0.000124, "loss": 0.3452, "step": 31 }, { "epoch": 0.07, "grad_norm": 0.3377660731261332, "learning_rate": 0.00012800000000000002, "loss": 0.365, "step": 32 }, { "epoch": 0.07, "grad_norm": 0.3029616210555195, "learning_rate": 0.000132, "loss": 0.3502, "step": 33 }, { "epoch": 0.08, "grad_norm": 0.43234275224940705, "learning_rate": 0.00013600000000000003, "loss": 0.3728, "step": 34 }, { "epoch": 0.08, "grad_norm": 0.3831666942704003, "learning_rate": 0.00014, "loss": 0.3922, "step": 35 }, { "epoch": 0.08, "grad_norm": 0.3421076533380305, "learning_rate": 0.000144, "loss": 0.3544, "step": 36 }, { "epoch": 0.08, "grad_norm": 0.30146273904800347, "learning_rate": 0.000148, "loss": 0.3237, "step": 37 }, { "epoch": 0.08, "grad_norm": 0.3234891150012619, "learning_rate": 0.000152, "loss": 0.3771, "step": 38 }, { "epoch": 0.09, "grad_norm": 0.32536194802044366, "learning_rate": 0.00015600000000000002, "loss": 0.3719, "step": 39 }, { "epoch": 0.09, "grad_norm": 0.32489341033312524, "learning_rate": 0.00016, "loss": 0.3716, "step": 40 }, { "epoch": 0.09, "grad_norm": 0.3135978131489568, "learning_rate": 0.000164, "loss": 0.3818, "step": 41 }, { "epoch": 0.09, "grad_norm": 0.28403149178513853, "learning_rate": 0.000168, "loss": 0.3576, "step": 42 }, { "epoch": 0.1, "grad_norm": 0.30747408490186307, "learning_rate": 0.000172, "loss": 0.3801, "step": 43 }, { "epoch": 0.1, "grad_norm": 0.3491185114838396, "learning_rate": 0.00017600000000000002, "loss": 0.3612, "step": 44 }, { "epoch": 0.1, "grad_norm": 0.4062709802932099, "learning_rate": 0.00018, "loss": 0.3752, "step": 45 }, { "epoch": 0.1, "grad_norm": 0.4174448098463489, "learning_rate": 0.00018400000000000003, "loss": 0.3705, "step": 46 }, { "epoch": 0.1, "grad_norm": 0.30034892483536746, "learning_rate": 0.000188, "loss": 0.3504, "step": 47 }, { "epoch": 0.11, "grad_norm": 0.24699047743823555, "learning_rate": 0.000192, "loss": 0.3622, "step": 48 }, { "epoch": 0.11, "grad_norm": 0.2944161708858025, "learning_rate": 0.000196, "loss": 0.3462, "step": 49 }, { "epoch": 0.11, "grad_norm": 0.24968532785704994, "learning_rate": 0.0002, "loss": 0.3074, "step": 50 }, { "epoch": 0.11, "grad_norm": 0.4989866478305154, "learning_rate": 0.00019999932018605637, "loss": 0.3426, "step": 51 }, { "epoch": 0.12, "grad_norm": 0.2729667152979656, "learning_rate": 0.00019999728075346845, "loss": 0.3452, "step": 52 }, { "epoch": 0.12, "grad_norm": 0.26553719135078474, "learning_rate": 0.00019999388172996495, "loss": 0.3771, "step": 53 }, { "epoch": 0.12, "grad_norm": 0.2563892875653707, "learning_rate": 0.0001999891231617599, "loss": 0.3709, "step": 54 }, { "epoch": 0.12, "grad_norm": 0.40493567306146466, "learning_rate": 0.0001999830051135521, "loss": 0.3659, "step": 55 }, { "epoch": 0.12, "grad_norm": 0.282074263507646, "learning_rate": 0.00019997552766852432, "loss": 0.3579, "step": 56 }, { "epoch": 0.13, "grad_norm": 0.24830919197340875, "learning_rate": 0.00019996669092834193, "loss": 0.3164, "step": 57 }, { "epoch": 0.13, "grad_norm": 0.31153665113975915, "learning_rate": 0.0001999564950131517, "loss": 0.3554, "step": 58 }, { "epoch": 0.13, "grad_norm": 0.2883397422934534, "learning_rate": 0.00019994494006158017, "loss": 0.3318, "step": 59 }, { "epoch": 0.13, "grad_norm": 0.29607763624002476, 
"learning_rate": 0.00019993202623073172, "loss": 0.3515, "step": 60 }, { "epoch": 0.14, "grad_norm": 0.26631042457457627, "learning_rate": 0.0001999177536961863, "loss": 0.3404, "step": 61 }, { "epoch": 0.14, "grad_norm": 0.30709845993919116, "learning_rate": 0.00019990212265199738, "loss": 0.3454, "step": 62 }, { "epoch": 0.14, "grad_norm": 0.2689959836388791, "learning_rate": 0.0001998851333106889, "loss": 0.3448, "step": 63 }, { "epoch": 0.14, "grad_norm": 0.28079893446027976, "learning_rate": 0.00019986678590325273, "loss": 0.3537, "step": 64 }, { "epoch": 0.14, "grad_norm": 0.2777161046281277, "learning_rate": 0.00019984708067914532, "loss": 0.3512, "step": 65 }, { "epoch": 0.15, "grad_norm": 0.2612936496089595, "learning_rate": 0.0001998260179062844, "loss": 0.3355, "step": 66 }, { "epoch": 0.15, "grad_norm": 0.270747618988818, "learning_rate": 0.0001998035978710453, "loss": 0.3557, "step": 67 }, { "epoch": 0.15, "grad_norm": 0.2524425572972277, "learning_rate": 0.00019977982087825713, "loss": 0.324, "step": 68 }, { "epoch": 0.15, "grad_norm": 0.2655824178335376, "learning_rate": 0.00019975468725119843, "loss": 0.3274, "step": 69 }, { "epoch": 0.16, "grad_norm": 0.29931436509996834, "learning_rate": 0.000199728197331593, "loss": 0.3733, "step": 70 }, { "epoch": 0.16, "grad_norm": 0.2945779980864737, "learning_rate": 0.00019970035147960524, "loss": 0.3161, "step": 71 }, { "epoch": 0.16, "grad_norm": 0.42061603746317366, "learning_rate": 0.00019967115007383507, "loss": 0.3486, "step": 72 }, { "epoch": 0.16, "grad_norm": 0.24899746628510128, "learning_rate": 0.000199640593511313, "loss": 0.3428, "step": 73 }, { "epoch": 0.16, "grad_norm": 0.2748778960772334, "learning_rate": 0.00019960868220749448, "loss": 0.3215, "step": 74 }, { "epoch": 0.17, "grad_norm": 0.2826057447583215, "learning_rate": 0.00019957541659625458, "loss": 0.3663, "step": 75 }, { "epoch": 0.17, "grad_norm": 0.26758814883425874, "learning_rate": 0.00019954079712988183, "loss": 0.3473, "step": 76 }, { "epoch": 0.17, "grad_norm": 0.26191530390935236, "learning_rate": 0.00019950482427907211, "loss": 0.3464, "step": 77 }, { "epoch": 0.17, "grad_norm": 0.27009995213133564, "learning_rate": 0.00019946749853292232, "loss": 0.3427, "step": 78 }, { "epoch": 0.17, "grad_norm": 0.2828252836889427, "learning_rate": 0.00019942882039892377, "loss": 0.3369, "step": 79 }, { "epoch": 0.18, "grad_norm": 0.28359570095347486, "learning_rate": 0.00019938879040295508, "loss": 0.3474, "step": 80 }, { "epoch": 0.18, "grad_norm": 0.23726568176198512, "learning_rate": 0.0001993474090892753, "loss": 0.3348, "step": 81 }, { "epoch": 0.18, "grad_norm": 0.28359002617947426, "learning_rate": 0.00019930467702051628, "loss": 0.3434, "step": 82 }, { "epoch": 0.18, "grad_norm": 0.24507300294017476, "learning_rate": 0.0001992605947776752, "loss": 0.3211, "step": 83 }, { "epoch": 0.19, "grad_norm": 0.2459171433488338, "learning_rate": 0.00019921516296010644, "loss": 0.3539, "step": 84 }, { "epoch": 0.19, "grad_norm": 0.23982609581354497, "learning_rate": 0.0001991683821855137, "loss": 0.3367, "step": 85 }, { "epoch": 0.19, "grad_norm": 0.24948097532302946, "learning_rate": 0.00019912025308994148, "loss": 0.3313, "step": 86 }, { "epoch": 0.19, "grad_norm": 0.25673241215001325, "learning_rate": 0.00019907077632776632, "loss": 0.3384, "step": 87 }, { "epoch": 0.19, "grad_norm": 0.2248520328170477, "learning_rate": 0.00019901995257168807, "loss": 0.3075, "step": 88 }, { "epoch": 0.2, "grad_norm": 0.2785744560779341, "learning_rate": 0.00019896778251272078, 
"loss": 0.3505, "step": 89 }, { "epoch": 0.2, "grad_norm": 0.2612264615045669, "learning_rate": 0.00019891426686018305, "loss": 0.3319, "step": 90 }, { "epoch": 0.2, "grad_norm": 0.2588107693763323, "learning_rate": 0.00019885940634168864, "loss": 0.3036, "step": 91 }, { "epoch": 0.2, "grad_norm": 0.2539991795599821, "learning_rate": 0.0001988032017031364, "loss": 0.3275, "step": 92 }, { "epoch": 0.21, "grad_norm": 0.2228639788098274, "learning_rate": 0.00019874565370870038, "loss": 0.3128, "step": 93 }, { "epoch": 0.21, "grad_norm": 0.2295330254763288, "learning_rate": 0.00019868676314081904, "loss": 0.3226, "step": 94 }, { "epoch": 0.21, "grad_norm": 0.26774082666954935, "learning_rate": 0.00019862653080018506, "loss": 0.3438, "step": 95 }, { "epoch": 0.21, "grad_norm": 0.2096704558396889, "learning_rate": 0.0001985649575057341, "loss": 0.3158, "step": 96 }, { "epoch": 0.21, "grad_norm": 0.222343383957648, "learning_rate": 0.00019850204409463385, "loss": 0.3127, "step": 97 }, { "epoch": 0.22, "grad_norm": 0.24355910663526567, "learning_rate": 0.00019843779142227256, "loss": 0.3278, "step": 98 }, { "epoch": 0.22, "grad_norm": 0.2589007730313338, "learning_rate": 0.00019837220036224756, "loss": 0.3433, "step": 99 }, { "epoch": 0.22, "grad_norm": 0.2048208439355147, "learning_rate": 0.00019830527180635308, "loss": 0.3038, "step": 100 }, { "epoch": 0.22, "grad_norm": 0.22100911829011585, "learning_rate": 0.00019823700666456853, "loss": 0.3295, "step": 101 }, { "epoch": 0.23, "grad_norm": 0.2865876628348663, "learning_rate": 0.0001981674058650458, "loss": 0.3374, "step": 102 }, { "epoch": 0.23, "grad_norm": 0.23267706317431597, "learning_rate": 0.00019809647035409672, "loss": 0.3146, "step": 103 }, { "epoch": 0.23, "grad_norm": 0.22896912398059824, "learning_rate": 0.0001980242010961803, "loss": 0.3302, "step": 104 }, { "epoch": 0.23, "grad_norm": 0.21199810980682918, "learning_rate": 0.00019795059907388952, "loss": 0.3089, "step": 105 }, { "epoch": 0.23, "grad_norm": 0.2314314536279127, "learning_rate": 0.00019787566528793807, "loss": 0.3219, "step": 106 }, { "epoch": 0.24, "grad_norm": 0.22578014749699987, "learning_rate": 0.00019779940075714648, "loss": 0.3089, "step": 107 }, { "epoch": 0.24, "grad_norm": 0.26739038717214403, "learning_rate": 0.0001977218065184287, "loss": 0.3314, "step": 108 }, { "epoch": 0.24, "grad_norm": 0.27640406493967007, "learning_rate": 0.00019764288362677753, "loss": 0.315, "step": 109 }, { "epoch": 0.24, "grad_norm": 0.275170691575301, "learning_rate": 0.0001975626331552507, "loss": 0.3192, "step": 110 }, { "epoch": 0.25, "grad_norm": 0.28590415176453904, "learning_rate": 0.00019748105619495594, "loss": 0.3235, "step": 111 }, { "epoch": 0.25, "grad_norm": 0.26618886451702856, "learning_rate": 0.0001973981538550364, "loss": 0.3235, "step": 112 }, { "epoch": 0.25, "grad_norm": 0.22783361042781863, "learning_rate": 0.00019731392726265537, "loss": 0.3319, "step": 113 }, { "epoch": 0.25, "eval_loss": 0.3324408233165741, "eval_runtime": 173.7614, "eval_samples_per_second": 13.369, "eval_steps_per_second": 0.42, "step": 113 }, { "epoch": 0.25, "grad_norm": 0.25003343596979083, "learning_rate": 0.00019722837756298113, "loss": 0.3269, "step": 114 }, { "epoch": 0.25, "grad_norm": 0.2051643008869916, "learning_rate": 0.0001971415059191712, "loss": 0.3331, "step": 115 }, { "epoch": 0.26, "grad_norm": 0.26679577582874325, "learning_rate": 0.00019705331351235674, "loss": 0.319, "step": 116 }, { "epoch": 0.26, "grad_norm": 0.22959410656566043, "learning_rate": 
0.0001969638015416263, "loss": 0.326, "step": 117 }, { "epoch": 0.26, "grad_norm": 0.24732040292675736, "learning_rate": 0.0001968729712240095, "loss": 0.3456, "step": 118 }, { "epoch": 0.26, "grad_norm": 0.21472772586501115, "learning_rate": 0.00019678082379446078, "loss": 0.3154, "step": 119 }, { "epoch": 0.27, "grad_norm": 0.22004349633491344, "learning_rate": 0.00019668736050584224, "loss": 0.3296, "step": 120 }, { "epoch": 0.27, "grad_norm": 0.20481829759367912, "learning_rate": 0.00019659258262890683, "loss": 0.3168, "step": 121 }, { "epoch": 0.27, "grad_norm": 0.22596082871640896, "learning_rate": 0.00019649649145228102, "loss": 0.3198, "step": 122 }, { "epoch": 0.27, "grad_norm": 0.22416198789548378, "learning_rate": 0.00019639908828244718, "loss": 0.3122, "step": 123 }, { "epoch": 0.27, "grad_norm": 0.25019709447245486, "learning_rate": 0.000196300374443726, "loss": 0.3256, "step": 124 }, { "epoch": 0.28, "grad_norm": 0.23470807250735432, "learning_rate": 0.0001962003512782584, "loss": 0.3245, "step": 125 }, { "epoch": 0.28, "grad_norm": 0.24693935678122386, "learning_rate": 0.00019609902014598718, "loss": 0.3292, "step": 126 }, { "epoch": 0.28, "grad_norm": 0.2260534058582815, "learning_rate": 0.00019599638242463868, "loss": 0.325, "step": 127 }, { "epoch": 0.28, "grad_norm": 0.23667285933893983, "learning_rate": 0.00019589243950970402, "loss": 0.3337, "step": 128 }, { "epoch": 0.29, "grad_norm": 0.21298574370036155, "learning_rate": 0.00019578719281442003, "loss": 0.3092, "step": 129 }, { "epoch": 0.29, "grad_norm": 0.23372716379048458, "learning_rate": 0.00019568064376975012, "loss": 0.317, "step": 130 }, { "epoch": 0.29, "grad_norm": 0.22334413345013637, "learning_rate": 0.0001955727938243648, "loss": 0.2834, "step": 131 }, { "epoch": 0.29, "grad_norm": 0.2248381817587477, "learning_rate": 0.00019546364444462207, "loss": 0.302, "step": 132 }, { "epoch": 0.29, "grad_norm": 0.2360215129073668, "learning_rate": 0.00019535319711454728, "loss": 0.3293, "step": 133 }, { "epoch": 0.3, "grad_norm": 0.22623309491159815, "learning_rate": 0.00019524145333581317, "loss": 0.3273, "step": 134 }, { "epoch": 0.3, "grad_norm": 0.2070555672669519, "learning_rate": 0.00019512841462771924, "loss": 0.2972, "step": 135 }, { "epoch": 0.3, "grad_norm": 0.22460201914808312, "learning_rate": 0.00019501408252717138, "loss": 0.3019, "step": 136 }, { "epoch": 0.3, "grad_norm": 0.2225333175600629, "learning_rate": 0.00019489845858866066, "loss": 0.2983, "step": 137 }, { "epoch": 0.31, "grad_norm": 0.21017634852282913, "learning_rate": 0.0001947815443842424, "loss": 0.3002, "step": 138 }, { "epoch": 0.31, "grad_norm": 0.24382065213677345, "learning_rate": 0.00019466334150351476, "loss": 0.3061, "step": 139 }, { "epoch": 0.31, "grad_norm": 0.22222229848717998, "learning_rate": 0.00019454385155359702, "loss": 0.3189, "step": 140 }, { "epoch": 0.31, "grad_norm": 0.21144991906498956, "learning_rate": 0.00019442307615910793, "loss": 0.3093, "step": 141 }, { "epoch": 0.31, "grad_norm": 0.20966684015092005, "learning_rate": 0.00019430101696214336, "loss": 0.2975, "step": 142 }, { "epoch": 0.32, "grad_norm": 0.2285163015371888, "learning_rate": 0.0001941776756222542, "loss": 0.3291, "step": 143 }, { "epoch": 0.32, "grad_norm": 0.22173438888731478, "learning_rate": 0.00019405305381642375, "loss": 0.3052, "step": 144 }, { "epoch": 0.32, "grad_norm": 0.23472615797287633, "learning_rate": 0.00019392715323904481, "loss": 0.3158, "step": 145 }, { "epoch": 0.32, "grad_norm": 0.19206597044565454, "learning_rate": 
0.00019379997560189675, "loss": 0.3047, "step": 146 }, { "epoch": 0.33, "grad_norm": 0.21928888599234125, "learning_rate": 0.00019367152263412217, "loss": 0.3196, "step": 147 }, { "epoch": 0.33, "grad_norm": 0.21469742386937682, "learning_rate": 0.00019354179608220348, "loss": 0.2981, "step": 148 }, { "epoch": 0.33, "grad_norm": 0.39037984784670976, "learning_rate": 0.000193410797709939, "loss": 0.2962, "step": 149 }, { "epoch": 0.33, "grad_norm": 0.3224683875107671, "learning_rate": 0.00019327852929841916, "loss": 0.3268, "step": 150 }, { "epoch": 0.33, "grad_norm": 0.22377973835731155, "learning_rate": 0.0001931449926460022, "loss": 0.3093, "step": 151 }, { "epoch": 0.34, "grad_norm": 0.21525051451055552, "learning_rate": 0.00019301018956828964, "loss": 0.3173, "step": 152 }, { "epoch": 0.34, "grad_norm": 0.19693490433987484, "learning_rate": 0.00019287412189810172, "loss": 0.3069, "step": 153 }, { "epoch": 0.34, "grad_norm": 0.1985719221741534, "learning_rate": 0.00019273679148545245, "loss": 0.3239, "step": 154 }, { "epoch": 0.34, "grad_norm": 0.19043619193681682, "learning_rate": 0.00019259820019752443, "loss": 0.3041, "step": 155 }, { "epoch": 0.35, "grad_norm": 0.2029259146228578, "learning_rate": 0.0001924583499186434, "loss": 0.3049, "step": 156 }, { "epoch": 0.35, "grad_norm": 0.1913024173068621, "learning_rate": 0.00019231724255025284, "loss": 0.3263, "step": 157 }, { "epoch": 0.35, "grad_norm": 0.20418535912509125, "learning_rate": 0.00019217488001088784, "loss": 0.3093, "step": 158 }, { "epoch": 0.35, "grad_norm": 0.21506493193668452, "learning_rate": 0.00019203126423614916, "loss": 0.3386, "step": 159 }, { "epoch": 0.35, "grad_norm": 0.2078781708947467, "learning_rate": 0.00019188639717867696, "loss": 0.3098, "step": 160 }, { "epoch": 0.36, "grad_norm": 0.21203565844547, "learning_rate": 0.00019174028080812415, "loss": 0.3245, "step": 161 }, { "epoch": 0.36, "grad_norm": 0.22944231505967116, "learning_rate": 0.0001915929171111296, "loss": 0.3174, "step": 162 }, { "epoch": 0.36, "grad_norm": 0.19389795184150505, "learning_rate": 0.00019144430809129128, "loss": 0.2985, "step": 163 }, { "epoch": 0.36, "grad_norm": 0.2294568217196969, "learning_rate": 0.00019129445576913888, "loss": 0.2916, "step": 164 }, { "epoch": 0.37, "grad_norm": 0.21471312626755512, "learning_rate": 0.00019114336218210634, "loss": 0.3203, "step": 165 }, { "epoch": 0.37, "grad_norm": 0.200256096520887, "learning_rate": 0.00019099102938450416, "loss": 0.314, "step": 166 }, { "epoch": 0.37, "grad_norm": 0.20286230376549838, "learning_rate": 0.00019083745944749162, "loss": 0.2953, "step": 167 }, { "epoch": 0.37, "grad_norm": 0.22752630009694177, "learning_rate": 0.00019068265445904836, "loss": 0.3098, "step": 168 }, { "epoch": 0.37, "grad_norm": 0.1969764402722988, "learning_rate": 0.00019052661652394618, "loss": 0.2798, "step": 169 }, { "epoch": 0.38, "grad_norm": 0.21419927863312113, "learning_rate": 0.0001903693477637204, "loss": 0.315, "step": 170 }, { "epoch": 0.38, "grad_norm": 0.1978129640154948, "learning_rate": 0.00019021085031664087, "loss": 0.2769, "step": 171 }, { "epoch": 0.38, "grad_norm": 0.1832605646042805, "learning_rate": 0.00019005112633768313, "loss": 0.2787, "step": 172 }, { "epoch": 0.38, "grad_norm": 0.25130865190581947, "learning_rate": 0.00018989017799849896, "loss": 0.3042, "step": 173 }, { "epoch": 0.39, "grad_norm": 0.22333188194958617, "learning_rate": 0.0001897280074873868, "loss": 0.3366, "step": 174 }, { "epoch": 0.39, "grad_norm": 0.23303955610127655, "learning_rate": 
0.00018956461700926215, "loss": 0.3069, "step": 175 }, { "epoch": 0.39, "grad_norm": 0.21946538873928723, "learning_rate": 0.00018940000878562758, "loss": 0.3026, "step": 176 }, { "epoch": 0.39, "grad_norm": 0.22034016868322132, "learning_rate": 0.00018923418505454237, "loss": 0.3031, "step": 177 }, { "epoch": 0.39, "grad_norm": 0.1918273691770089, "learning_rate": 0.00018906714807059218, "loss": 0.287, "step": 178 }, { "epoch": 0.4, "grad_norm": 0.1984010069624954, "learning_rate": 0.00018889890010485847, "loss": 0.3039, "step": 179 }, { "epoch": 0.4, "grad_norm": 0.19139523377149348, "learning_rate": 0.00018872944344488747, "loss": 0.3152, "step": 180 }, { "epoch": 0.4, "grad_norm": 0.18841860181390324, "learning_rate": 0.0001885587803946592, "loss": 0.3171, "step": 181 }, { "epoch": 0.4, "grad_norm": 0.19736334769102976, "learning_rate": 0.0001883869132745561, "loss": 0.2851, "step": 182 }, { "epoch": 0.41, "grad_norm": 0.19610383893082234, "learning_rate": 0.00018821384442133145, "loss": 0.307, "step": 183 }, { "epoch": 0.41, "grad_norm": 0.19693236953217128, "learning_rate": 0.00018803957618807764, "loss": 0.3219, "step": 184 }, { "epoch": 0.41, "grad_norm": 0.1978461815572098, "learning_rate": 0.0001878641109441942, "loss": 0.2936, "step": 185 }, { "epoch": 0.41, "grad_norm": 0.22453262012783057, "learning_rate": 0.00018768745107535542, "loss": 0.3225, "step": 186 }, { "epoch": 0.41, "grad_norm": 0.1951859636493649, "learning_rate": 0.00018750959898347825, "loss": 0.2892, "step": 187 }, { "epoch": 0.42, "grad_norm": 0.19051492201816908, "learning_rate": 0.00018733055708668926, "loss": 0.2922, "step": 188 }, { "epoch": 0.42, "grad_norm": 0.19631810708829814, "learning_rate": 0.00018715032781929208, "loss": 0.2928, "step": 189 }, { "epoch": 0.42, "grad_norm": 0.20533375929006709, "learning_rate": 0.00018696891363173405, "loss": 0.3212, "step": 190 }, { "epoch": 0.42, "grad_norm": 0.21698315667726276, "learning_rate": 0.00018678631699057302, "loss": 0.3419, "step": 191 }, { "epoch": 0.43, "grad_norm": 0.23742145444149265, "learning_rate": 0.00018660254037844388, "loss": 0.3081, "step": 192 }, { "epoch": 0.43, "grad_norm": 0.22429285552735956, "learning_rate": 0.00018641758629402467, "loss": 0.3132, "step": 193 }, { "epoch": 0.43, "grad_norm": 0.20957107721564286, "learning_rate": 0.00018623145725200278, "loss": 0.3176, "step": 194 }, { "epoch": 0.43, "grad_norm": 0.23119517784848204, "learning_rate": 0.0001860441557830405, "loss": 0.3174, "step": 195 }, { "epoch": 0.43, "grad_norm": 0.22081382473360098, "learning_rate": 0.00018585568443374087, "loss": 0.3029, "step": 196 }, { "epoch": 0.44, "grad_norm": 0.20857278454177905, "learning_rate": 0.00018566604576661288, "loss": 0.2803, "step": 197 }, { "epoch": 0.44, "grad_norm": 0.20277792365487685, "learning_rate": 0.00018547524236003674, "loss": 0.3032, "step": 198 }, { "epoch": 0.44, "grad_norm": 0.22068595819466755, "learning_rate": 0.0001852832768082288, "loss": 0.3196, "step": 199 }, { "epoch": 0.44, "grad_norm": 0.22826508701764245, "learning_rate": 0.00018509015172120621, "loss": 0.307, "step": 200 }, { "epoch": 0.45, "grad_norm": 0.23318100222090005, "learning_rate": 0.00018489586972475155, "loss": 0.3243, "step": 201 }, { "epoch": 0.45, "grad_norm": 0.20790305208366794, "learning_rate": 0.00018470043346037698, "loss": 0.3026, "step": 202 }, { "epoch": 0.45, "grad_norm": 0.23045901908374763, "learning_rate": 0.00018450384558528845, "loss": 0.3215, "step": 203 }, { "epoch": 0.45, "grad_norm": 0.1895969081217627, "learning_rate": 
0.0001843061087723496, "loss": 0.2827, "step": 204 }, { "epoch": 0.45, "grad_norm": 0.21471234551245583, "learning_rate": 0.00018410722571004522, "loss": 0.2758, "step": 205 }, { "epoch": 0.46, "grad_norm": 0.19174460263968052, "learning_rate": 0.00018390719910244487, "loss": 0.2935, "step": 206 }, { "epoch": 0.46, "grad_norm": 0.2042082548764658, "learning_rate": 0.00018370603166916616, "loss": 0.3219, "step": 207 }, { "epoch": 0.46, "grad_norm": 0.20862412820843024, "learning_rate": 0.00018350372614533753, "loss": 0.3079, "step": 208 }, { "epoch": 0.46, "grad_norm": 0.19365588486473745, "learning_rate": 0.00018330028528156138, "loss": 0.2878, "step": 209 }, { "epoch": 0.47, "grad_norm": 0.20923759592087007, "learning_rate": 0.0001830957118438764, "loss": 0.3052, "step": 210 }, { "epoch": 0.47, "grad_norm": 0.19346679246935292, "learning_rate": 0.00018289000861372007, "loss": 0.2873, "step": 211 }, { "epoch": 0.47, "grad_norm": 0.20707125547144337, "learning_rate": 0.00018268317838789088, "loss": 0.2888, "step": 212 }, { "epoch": 0.47, "grad_norm": 0.210272509030541, "learning_rate": 0.00018247522397851028, "loss": 0.3063, "step": 213 }, { "epoch": 0.47, "grad_norm": 0.22635533463893145, "learning_rate": 0.0001822661482129844, "loss": 0.2844, "step": 214 }, { "epoch": 0.48, "grad_norm": 0.20691107915045812, "learning_rate": 0.00018205595393396568, "loss": 0.2986, "step": 215 }, { "epoch": 0.48, "grad_norm": 0.21549907787627992, "learning_rate": 0.00018184464399931412, "loss": 0.3098, "step": 216 }, { "epoch": 0.48, "grad_norm": 0.20061134281496176, "learning_rate": 0.00018163222128205853, "loss": 0.2871, "step": 217 }, { "epoch": 0.48, "grad_norm": 0.20502352214692726, "learning_rate": 0.00018141868867035745, "loss": 0.294, "step": 218 }, { "epoch": 0.49, "grad_norm": 0.2291358128642933, "learning_rate": 0.00018120404906745973, "loss": 0.2757, "step": 219 }, { "epoch": 0.49, "grad_norm": 0.1972835362678214, "learning_rate": 0.00018098830539166536, "loss": 0.3084, "step": 220 }, { "epoch": 0.49, "grad_norm": 0.21611715417590088, "learning_rate": 0.00018077146057628545, "loss": 0.2816, "step": 221 }, { "epoch": 0.49, "grad_norm": 0.21599858891116713, "learning_rate": 0.00018055351756960262, "loss": 0.3085, "step": 222 }, { "epoch": 0.49, "grad_norm": 0.19318954639097616, "learning_rate": 0.00018033447933483076, "loss": 0.2557, "step": 223 }, { "epoch": 0.5, "grad_norm": 0.18823294633329682, "learning_rate": 0.00018011434885007482, "loss": 0.2902, "step": 224 }, { "epoch": 0.5, "grad_norm": 0.21452443929587184, "learning_rate": 0.00017989312910829023, "loss": 0.311, "step": 225 }, { "epoch": 0.5, "grad_norm": 0.20666447125090429, "learning_rate": 0.00017967082311724227, "loss": 0.2883, "step": 226 }, { "epoch": 0.5, "eval_loss": 0.297645628452301, "eval_runtime": 173.6984, "eval_samples_per_second": 13.374, "eval_steps_per_second": 0.42, "step": 226 }, { "epoch": 0.5, "grad_norm": 0.21336924271339083, "learning_rate": 0.00017944743389946524, "loss": 0.3026, "step": 227 }, { "epoch": 0.5, "grad_norm": 0.20792583408257218, "learning_rate": 0.0001792229644922212, "loss": 0.2843, "step": 228 }, { "epoch": 0.51, "grad_norm": 0.2129371399765845, "learning_rate": 0.0001789974179474588, "loss": 0.3091, "step": 229 }, { "epoch": 0.51, "grad_norm": 0.19527091537284477, "learning_rate": 0.00017877079733177184, "loss": 0.297, "step": 230 }, { "epoch": 0.51, "grad_norm": 0.20516345655708346, "learning_rate": 0.00017854310572635733, "loss": 0.2935, "step": 231 }, { "epoch": 0.51, "grad_norm": 
0.1969309791573733, "learning_rate": 0.00017831434622697385, "loss": 0.2898, "step": 232 }, { "epoch": 0.52, "grad_norm": 0.2696175229862278, "learning_rate": 0.0001780845219438994, "loss": 0.2924, "step": 233 }, { "epoch": 0.52, "grad_norm": 0.234973041162424, "learning_rate": 0.00017785363600188894, "loss": 0.3179, "step": 234 }, { "epoch": 0.52, "grad_norm": 0.23939104068785497, "learning_rate": 0.00017762169154013216, "loss": 0.2796, "step": 235 }, { "epoch": 0.52, "grad_norm": 0.20378572036784126, "learning_rate": 0.00017738869171221068, "loss": 0.2784, "step": 236 }, { "epoch": 0.52, "grad_norm": 0.19159802735904477, "learning_rate": 0.0001771546396860551, "loss": 0.2834, "step": 237 }, { "epoch": 0.53, "grad_norm": 0.23302725481391803, "learning_rate": 0.00017691953864390207, "loss": 0.2997, "step": 238 }, { "epoch": 0.53, "grad_norm": 0.23471887316578038, "learning_rate": 0.0001766833917822509, "loss": 0.3, "step": 239 }, { "epoch": 0.53, "grad_norm": 0.1796225243725005, "learning_rate": 0.00017644620231182015, "loss": 0.2901, "step": 240 }, { "epoch": 0.53, "grad_norm": 0.20037902218428075, "learning_rate": 0.00017620797345750403, "loss": 0.294, "step": 241 }, { "epoch": 0.54, "grad_norm": 0.20415485179217374, "learning_rate": 0.0001759687084583285, "loss": 0.3162, "step": 242 }, { "epoch": 0.54, "grad_norm": 0.18182779738483645, "learning_rate": 0.00017572841056740722, "loss": 0.275, "step": 243 }, { "epoch": 0.54, "grad_norm": 0.17275776985467062, "learning_rate": 0.00017548708305189722, "loss": 0.2592, "step": 244 }, { "epoch": 0.54, "grad_norm": 0.18388792575433974, "learning_rate": 0.00017524472919295487, "loss": 0.2998, "step": 245 }, { "epoch": 0.54, "grad_norm": 0.1821375376929017, "learning_rate": 0.00017500135228569068, "loss": 0.2586, "step": 246 }, { "epoch": 0.55, "grad_norm": 0.18937126829543396, "learning_rate": 0.00017475695563912505, "loss": 0.2858, "step": 247 }, { "epoch": 0.55, "grad_norm": 0.1833938565560613, "learning_rate": 0.00017451154257614287, "loss": 0.271, "step": 248 }, { "epoch": 0.55, "grad_norm": 0.20382927797400074, "learning_rate": 0.0001742651164334486, "loss": 0.2931, "step": 249 }, { "epoch": 0.55, "grad_norm": 0.23510651870506527, "learning_rate": 0.00017401768056152085, "loss": 0.3078, "step": 250 }, { "epoch": 0.56, "grad_norm": 0.2191619740250893, "learning_rate": 0.00017376923832456665, "loss": 0.3111, "step": 251 }, { "epoch": 0.56, "grad_norm": 0.20648235757385622, "learning_rate": 0.00017351979310047602, "loss": 0.2816, "step": 252 }, { "epoch": 0.56, "grad_norm": 0.19445561313531604, "learning_rate": 0.00017326934828077573, "loss": 0.2894, "step": 253 }, { "epoch": 0.56, "grad_norm": 0.19169798824869538, "learning_rate": 0.00017301790727058345, "loss": 0.2802, "step": 254 }, { "epoch": 0.56, "grad_norm": 0.2006703950203184, "learning_rate": 0.0001727654734885612, "loss": 0.2749, "step": 255 }, { "epoch": 0.57, "grad_norm": 0.21061482297381626, "learning_rate": 0.0001725120503668691, "loss": 0.3042, "step": 256 }, { "epoch": 0.57, "grad_norm": 0.20740853426573325, "learning_rate": 0.00017225764135111868, "loss": 0.3025, "step": 257 }, { "epoch": 0.57, "grad_norm": 0.1945016623980029, "learning_rate": 0.00017200224990032576, "loss": 0.2964, "step": 258 }, { "epoch": 0.57, "grad_norm": 0.21095668740779128, "learning_rate": 0.00017174587948686374, "loss": 0.3047, "step": 259 }, { "epoch": 0.58, "grad_norm": 0.18724414137242903, "learning_rate": 0.00017148853359641626, "loss": 0.2678, "step": 260 }, { "epoch": 0.58, "grad_norm": 
0.18247589724576602, "learning_rate": 0.00017123021572792982, "loss": 0.2796, "step": 261 }, { "epoch": 0.58, "grad_norm": 0.20187877784359254, "learning_rate": 0.00017097092939356623, "loss": 0.2819, "step": 262 }, { "epoch": 0.58, "grad_norm": 0.21960455156021205, "learning_rate": 0.00017071067811865476, "loss": 0.2849, "step": 263 }, { "epoch": 0.58, "grad_norm": 0.21920804195394356, "learning_rate": 0.00017044946544164433, "loss": 0.286, "step": 264 }, { "epoch": 0.59, "grad_norm": 0.2153094965416609, "learning_rate": 0.00017018729491405536, "loss": 0.2728, "step": 265 }, { "epoch": 0.59, "grad_norm": 0.18899974451413623, "learning_rate": 0.00016992417010043142, "loss": 0.2643, "step": 266 }, { "epoch": 0.59, "grad_norm": 0.20452371771386804, "learning_rate": 0.00016966009457829086, "loss": 0.2805, "step": 267 }, { "epoch": 0.59, "grad_norm": 0.19546419928152936, "learning_rate": 0.0001693950719380782, "loss": 0.2749, "step": 268 }, { "epoch": 0.6, "grad_norm": 0.19627215184487343, "learning_rate": 0.00016912910578311503, "loss": 0.273, "step": 269 }, { "epoch": 0.6, "grad_norm": 0.18828419766156645, "learning_rate": 0.00016886219972955146, "loss": 0.273, "step": 270 }, { "epoch": 0.6, "grad_norm": 0.19027059707604657, "learning_rate": 0.00016859435740631658, "loss": 0.3046, "step": 271 }, { "epoch": 0.6, "grad_norm": 0.18426233664144975, "learning_rate": 0.00016832558245506935, "loss": 0.2643, "step": 272 }, { "epoch": 0.6, "grad_norm": 0.18658729778193736, "learning_rate": 0.00016805587853014895, "loss": 0.285, "step": 273 }, { "epoch": 0.61, "grad_norm": 0.1796032355554721, "learning_rate": 0.00016778524929852512, "loss": 0.261, "step": 274 }, { "epoch": 0.61, "grad_norm": 0.19790847765561645, "learning_rate": 0.0001675136984397484, "loss": 0.3036, "step": 275 }, { "epoch": 0.61, "grad_norm": 0.2013507251688789, "learning_rate": 0.0001672412296459, "loss": 0.2929, "step": 276 }, { "epoch": 0.61, "grad_norm": 0.20727255991158675, "learning_rate": 0.00016696784662154163, "loss": 0.28, "step": 277 }, { "epoch": 0.62, "grad_norm": 0.22175173491924988, "learning_rate": 0.0001666935530836651, "loss": 0.2953, "step": 278 }, { "epoch": 0.62, "grad_norm": 0.2110368794764076, "learning_rate": 0.00016641835276164183, "loss": 0.3012, "step": 279 }, { "epoch": 0.62, "grad_norm": 0.1839097327175649, "learning_rate": 0.00016614224939717217, "loss": 0.2985, "step": 280 }, { "epoch": 0.62, "grad_norm": 0.18075701410157072, "learning_rate": 0.00016586524674423446, "loss": 0.2614, "step": 281 }, { "epoch": 0.62, "grad_norm": 0.19006328055997038, "learning_rate": 0.00016558734856903404, "loss": 0.2741, "step": 282 }, { "epoch": 0.63, "grad_norm": 0.1936501426398552, "learning_rate": 0.00016530855864995195, "loss": 0.2486, "step": 283 }, { "epoch": 0.63, "grad_norm": 0.2146360520459362, "learning_rate": 0.0001650288807774937, "loss": 0.2949, "step": 284 }, { "epoch": 0.63, "grad_norm": 0.1934454081094374, "learning_rate": 0.00016474831875423767, "loss": 0.25, "step": 285 }, { "epoch": 0.63, "grad_norm": 0.20138529225313043, "learning_rate": 0.0001644668763947833, "loss": 0.2826, "step": 286 }, { "epoch": 0.64, "grad_norm": 0.20336180418412392, "learning_rate": 0.00016418455752569943, "loss": 0.281, "step": 287 }, { "epoch": 0.64, "grad_norm": 0.2366660122305481, "learning_rate": 0.00016390136598547217, "loss": 0.2665, "step": 288 }, { "epoch": 0.64, "grad_norm": 0.18478597928089585, "learning_rate": 0.00016361730562445263, "loss": 0.3022, "step": 289 }, { "epoch": 0.64, "grad_norm": 0.18679948920602824, 
"learning_rate": 0.0001633323803048047, "loss": 0.2844, "step": 290 }, { "epoch": 0.64, "grad_norm": 0.19949957744833288, "learning_rate": 0.00016304659390045252, "loss": 0.2912, "step": 291 }, { "epoch": 0.65, "grad_norm": 0.20044341182168152, "learning_rate": 0.0001627599502970277, "loss": 0.2729, "step": 292 }, { "epoch": 0.65, "grad_norm": 0.18558206726419454, "learning_rate": 0.00016247245339181662, "loss": 0.2693, "step": 293 }, { "epoch": 0.65, "grad_norm": 0.2083655759577377, "learning_rate": 0.00016218410709370736, "loss": 0.3022, "step": 294 }, { "epoch": 0.65, "grad_norm": 0.21456219690969847, "learning_rate": 0.00016189491532313664, "loss": 0.2933, "step": 295 }, { "epoch": 0.66, "grad_norm": 0.2316084982608295, "learning_rate": 0.00016160488201203644, "loss": 0.2631, "step": 296 }, { "epoch": 0.66, "grad_norm": 0.20943766303868608, "learning_rate": 0.00016131401110378043, "loss": 0.2847, "step": 297 }, { "epoch": 0.66, "grad_norm": 0.2346281512353159, "learning_rate": 0.00016102230655313076, "loss": 0.2898, "step": 298 }, { "epoch": 0.66, "grad_norm": 0.18690479698056445, "learning_rate": 0.0001607297723261837, "loss": 0.2761, "step": 299 }, { "epoch": 0.66, "grad_norm": 0.22244239877246086, "learning_rate": 0.00016043641240031623, "loss": 0.2794, "step": 300 }, { "epoch": 0.67, "grad_norm": 0.19513520777123738, "learning_rate": 0.00016014223076413173, "loss": 0.2757, "step": 301 }, { "epoch": 0.67, "grad_norm": 0.1954004471181246, "learning_rate": 0.00015984723141740576, "loss": 0.2713, "step": 302 }, { "epoch": 0.67, "grad_norm": 0.21399036326669596, "learning_rate": 0.00015955141837103168, "loss": 0.2767, "step": 303 }, { "epoch": 0.67, "grad_norm": 0.2045546981741005, "learning_rate": 0.0001592547956469662, "loss": 0.2807, "step": 304 }, { "epoch": 0.68, "grad_norm": 0.21321779886240727, "learning_rate": 0.00015895736727817455, "loss": 0.2899, "step": 305 }, { "epoch": 0.68, "grad_norm": 0.20121770170139855, "learning_rate": 0.00015865913730857582, "loss": 0.2706, "step": 306 }, { "epoch": 0.68, "grad_norm": 0.21901656836549904, "learning_rate": 0.00015836010979298782, "loss": 0.3043, "step": 307 }, { "epoch": 0.68, "grad_norm": 0.19476889300368513, "learning_rate": 0.0001580602887970721, "loss": 0.294, "step": 308 }, { "epoch": 0.68, "grad_norm": 0.1864916489760236, "learning_rate": 0.00015775967839727842, "loss": 0.2789, "step": 309 }, { "epoch": 0.69, "grad_norm": 0.2005263777787887, "learning_rate": 0.0001574582826807897, "loss": 0.2786, "step": 310 }, { "epoch": 0.69, "grad_norm": 0.20330910409013883, "learning_rate": 0.0001571561057454661, "loss": 0.2856, "step": 311 }, { "epoch": 0.69, "grad_norm": 0.2075041911121307, "learning_rate": 0.00015685315169978954, "loss": 0.301, "step": 312 }, { "epoch": 0.69, "grad_norm": 0.198545224318664, "learning_rate": 0.0001565494246628077, "loss": 0.2667, "step": 313 }, { "epoch": 0.7, "grad_norm": 0.18262873322406858, "learning_rate": 0.0001562449287640781, "loss": 0.2761, "step": 314 }, { "epoch": 0.7, "grad_norm": 0.20682904427400386, "learning_rate": 0.0001559396681436118, "loss": 0.2798, "step": 315 }, { "epoch": 0.7, "grad_norm": 0.17100330949740156, "learning_rate": 0.00015563364695181741, "loss": 0.2638, "step": 316 }, { "epoch": 0.7, "grad_norm": 0.19147233406100644, "learning_rate": 0.00015532686934944438, "loss": 0.2772, "step": 317 }, { "epoch": 0.7, "grad_norm": 0.20859821686266017, "learning_rate": 0.00015501933950752656, "loss": 0.2899, "step": 318 }, { "epoch": 0.71, "grad_norm": 0.22660943071408993, 
"learning_rate": 0.00015471106160732542, "loss": 0.2809, "step": 319 }, { "epoch": 0.71, "grad_norm": 0.1970595992988207, "learning_rate": 0.00015440203984027324, "loss": 0.2664, "step": 320 }, { "epoch": 0.71, "grad_norm": 0.21441465355834557, "learning_rate": 0.00015409227840791617, "loss": 0.2872, "step": 321 }, { "epoch": 0.71, "grad_norm": 0.1928160770028818, "learning_rate": 0.000153781781521857, "loss": 0.2932, "step": 322 }, { "epoch": 0.72, "grad_norm": 0.18232874606722474, "learning_rate": 0.00015347055340369804, "loss": 0.2865, "step": 323 }, { "epoch": 0.72, "grad_norm": 0.18944518082470313, "learning_rate": 0.00015315859828498354, "loss": 0.2895, "step": 324 }, { "epoch": 0.72, "grad_norm": 0.18688411692002266, "learning_rate": 0.00015284592040714227, "loss": 0.3068, "step": 325 }, { "epoch": 0.72, "grad_norm": 0.15360887670295487, "learning_rate": 0.00015253252402142988, "loss": 0.2541, "step": 326 }, { "epoch": 0.72, "grad_norm": 0.17886321037354558, "learning_rate": 0.00015221841338887104, "loss": 0.2735, "step": 327 }, { "epoch": 0.73, "grad_norm": 0.18210509607368616, "learning_rate": 0.0001519035927802015, "loss": 0.2674, "step": 328 }, { "epoch": 0.73, "grad_norm": 0.2113800439853146, "learning_rate": 0.00015158806647581002, "loss": 0.2611, "step": 329 }, { "epoch": 0.73, "grad_norm": 0.21664373470568593, "learning_rate": 0.00015127183876568022, "loss": 0.2734, "step": 330 }, { "epoch": 0.73, "grad_norm": 0.21152226314764647, "learning_rate": 0.0001509549139493323, "loss": 0.2605, "step": 331 }, { "epoch": 0.74, "grad_norm": 0.20787608314066813, "learning_rate": 0.0001506372963357644, "loss": 0.2829, "step": 332 }, { "epoch": 0.74, "grad_norm": 0.20890612340451087, "learning_rate": 0.00015031899024339415, "loss": 0.2761, "step": 333 }, { "epoch": 0.74, "grad_norm": 0.18123961862890825, "learning_rate": 0.00015000000000000001, "loss": 0.2625, "step": 334 }, { "epoch": 0.74, "grad_norm": 0.20005061805451332, "learning_rate": 0.00014968032994266224, "loss": 0.2739, "step": 335 }, { "epoch": 0.74, "grad_norm": 0.22011995491515385, "learning_rate": 0.00014935998441770407, "loss": 0.2769, "step": 336 }, { "epoch": 0.75, "grad_norm": 0.1759458248293069, "learning_rate": 0.00014903896778063267, "loss": 0.2751, "step": 337 }, { "epoch": 0.75, "grad_norm": 0.17963701036394614, "learning_rate": 0.00014871728439607966, "loss": 0.2861, "step": 338 }, { "epoch": 0.75, "grad_norm": 0.22395035633400334, "learning_rate": 0.00014839493863774212, "loss": 0.2748, "step": 339 }, { "epoch": 0.75, "eval_loss": 0.27847620844841003, "eval_runtime": 174.097, "eval_samples_per_second": 13.343, "eval_steps_per_second": 0.419, "step": 339 }, { "epoch": 0.75, "grad_norm": 0.19931473943900235, "learning_rate": 0.00014807193488832282, "loss": 0.261, "step": 340 }, { "epoch": 0.76, "grad_norm": 0.1895068181039295, "learning_rate": 0.00014774827753947088, "loss": 0.2666, "step": 341 }, { "epoch": 0.76, "grad_norm": 0.19682173311096884, "learning_rate": 0.00014742397099172183, "loss": 0.2564, "step": 342 }, { "epoch": 0.76, "grad_norm": 0.2183356533233642, "learning_rate": 0.00014709901965443794, "loss": 0.2904, "step": 343 }, { "epoch": 0.76, "grad_norm": 0.2113562507034691, "learning_rate": 0.00014677342794574817, "loss": 0.2915, "step": 344 }, { "epoch": 0.76, "grad_norm": 0.2034784414842117, "learning_rate": 0.00014644720029248829, "loss": 0.2717, "step": 345 }, { "epoch": 0.77, "grad_norm": 0.1967572374697097, "learning_rate": 0.00014612034113014035, "loss": 0.2887, "step": 346 }, { "epoch": 0.77, 
"grad_norm": 0.1899953535101574, "learning_rate": 0.00014579285490277274, "loss": 0.2922, "step": 347 }, { "epoch": 0.77, "grad_norm": 0.19441384522232566, "learning_rate": 0.0001454647460629795, "loss": 0.2785, "step": 348 }, { "epoch": 0.77, "grad_norm": 0.20710674744690838, "learning_rate": 0.00014513601907181992, "loss": 0.2929, "step": 349 }, { "epoch": 0.78, "grad_norm": 0.18640646337351455, "learning_rate": 0.00014480667839875786, "loss": 0.261, "step": 350 }, { "epoch": 0.78, "grad_norm": 0.1754057901024035, "learning_rate": 0.00014447672852160095, "loss": 0.267, "step": 351 }, { "epoch": 0.78, "grad_norm": 0.1845116395890587, "learning_rate": 0.0001441461739264397, "loss": 0.2608, "step": 352 }, { "epoch": 0.78, "grad_norm": 0.1832842348380918, "learning_rate": 0.00014381501910758662, "loss": 0.264, "step": 353 }, { "epoch": 0.78, "grad_norm": 0.21078762275275081, "learning_rate": 0.00014348326856751496, "loss": 0.2903, "step": 354 }, { "epoch": 0.79, "grad_norm": 0.2040423439668467, "learning_rate": 0.00014315092681679755, "loss": 0.2866, "step": 355 }, { "epoch": 0.79, "grad_norm": 0.20916646529152538, "learning_rate": 0.00014281799837404552, "loss": 0.2669, "step": 356 }, { "epoch": 0.79, "grad_norm": 0.20829199241699958, "learning_rate": 0.00014248448776584688, "loss": 0.2773, "step": 357 }, { "epoch": 0.79, "grad_norm": 0.22863270132432637, "learning_rate": 0.0001421503995267048, "loss": 0.2691, "step": 358 }, { "epoch": 0.8, "grad_norm": 0.19107899619014693, "learning_rate": 0.00014181573819897617, "loss": 0.2854, "step": 359 }, { "epoch": 0.8, "grad_norm": 0.18310920202899897, "learning_rate": 0.00014148050833280977, "loss": 0.2523, "step": 360 }, { "epoch": 0.8, "grad_norm": 0.19142180009852441, "learning_rate": 0.00014114471448608426, "loss": 0.2668, "step": 361 }, { "epoch": 0.8, "grad_norm": 0.17916118207750328, "learning_rate": 0.0001408083612243465, "loss": 0.2826, "step": 362 }, { "epoch": 0.8, "grad_norm": 0.1795532305403206, "learning_rate": 0.0001404714531207492, "loss": 0.2876, "step": 363 }, { "epoch": 0.81, "grad_norm": 0.19812580507096825, "learning_rate": 0.0001401339947559889, "loss": 0.2813, "step": 364 }, { "epoch": 0.81, "grad_norm": 0.16985158453017796, "learning_rate": 0.00013979599071824362, "loss": 0.2824, "step": 365 }, { "epoch": 0.81, "grad_norm": 0.20447909651191687, "learning_rate": 0.00013945744560311057, "loss": 0.2567, "step": 366 }, { "epoch": 0.81, "grad_norm": 0.20178806044954373, "learning_rate": 0.0001391183640135435, "loss": 0.2755, "step": 367 }, { "epoch": 0.82, "grad_norm": 0.2265042443851448, "learning_rate": 0.00013877875055979023, "loss": 0.2905, "step": 368 }, { "epoch": 0.82, "grad_norm": 0.22127748544952333, "learning_rate": 0.00013843860985933003, "loss": 0.2624, "step": 369 }, { "epoch": 0.82, "grad_norm": 0.20805077265871227, "learning_rate": 0.00013809794653681074, "loss": 0.2396, "step": 370 }, { "epoch": 0.82, "grad_norm": 0.21236429860308073, "learning_rate": 0.00013775676522398588, "loss": 0.2916, "step": 371 }, { "epoch": 0.82, "grad_norm": 0.19893678645018886, "learning_rate": 0.00013741507055965168, "loss": 0.2551, "step": 372 }, { "epoch": 0.83, "grad_norm": 0.18256634499151977, "learning_rate": 0.00013707286718958413, "loss": 0.2633, "step": 373 }, { "epoch": 0.83, "grad_norm": 0.18221655160895703, "learning_rate": 0.00013673015976647568, "loss": 0.2672, "step": 374 }, { "epoch": 0.83, "grad_norm": 0.19429764298431815, "learning_rate": 0.00013638695294987204, "loss": 0.2417, "step": 375 }, { "epoch": 0.83, 
"grad_norm": 0.18793583918739298, "learning_rate": 0.0001360432514061087, "loss": 0.2613, "step": 376 }, { "epoch": 0.83, "grad_norm": 0.2011435035031983, "learning_rate": 0.00013569905980824788, "loss": 0.2685, "step": 377 }, { "epoch": 0.84, "grad_norm": 0.2052637622415086, "learning_rate": 0.00013535438283601435, "loss": 0.2959, "step": 378 }, { "epoch": 0.84, "grad_norm": 0.1945089020971851, "learning_rate": 0.00013500922517573245, "loss": 0.2482, "step": 379 }, { "epoch": 0.84, "grad_norm": 0.20812822716018067, "learning_rate": 0.00013466359152026195, "loss": 0.2741, "step": 380 }, { "epoch": 0.84, "grad_norm": 0.22384533508071883, "learning_rate": 0.0001343174865689344, "loss": 0.2914, "step": 381 }, { "epoch": 0.85, "grad_norm": 0.19177959970339475, "learning_rate": 0.0001339709150274893, "loss": 0.2694, "step": 382 }, { "epoch": 0.85, "grad_norm": 0.18196106404878304, "learning_rate": 0.0001336238816080099, "loss": 0.2649, "step": 383 }, { "epoch": 0.85, "grad_norm": 0.2526481416447456, "learning_rate": 0.00013327639102885937, "loss": 0.2813, "step": 384 }, { "epoch": 0.85, "grad_norm": 0.26250954621985834, "learning_rate": 0.0001329284480146166, "loss": 0.2793, "step": 385 }, { "epoch": 0.85, "grad_norm": 0.19318862080071506, "learning_rate": 0.00013258005729601177, "loss": 0.2441, "step": 386 }, { "epoch": 0.86, "grad_norm": 0.21447364833995924, "learning_rate": 0.00013223122360986225, "loss": 0.2887, "step": 387 }, { "epoch": 0.86, "grad_norm": 0.18021814111984105, "learning_rate": 0.00013188195169900813, "loss": 0.2722, "step": 388 }, { "epoch": 0.86, "grad_norm": 0.1986609859985159, "learning_rate": 0.0001315322463122477, "loss": 0.2638, "step": 389 }, { "epoch": 0.86, "grad_norm": 0.24674716367898283, "learning_rate": 0.00013118211220427298, "loss": 0.2753, "step": 390 }, { "epoch": 0.87, "grad_norm": 0.22439353357611744, "learning_rate": 0.0001308315541356049, "loss": 0.2853, "step": 391 }, { "epoch": 0.87, "grad_norm": 0.17776179888083857, "learning_rate": 0.00013048057687252865, "loss": 0.2441, "step": 392 }, { "epoch": 0.87, "grad_norm": 0.21002490821212472, "learning_rate": 0.00013012918518702914, "loss": 0.2882, "step": 393 }, { "epoch": 0.87, "grad_norm": 0.18514107951857695, "learning_rate": 0.00012977738385672557, "loss": 0.2643, "step": 394 }, { "epoch": 0.87, "grad_norm": 0.18577497415553973, "learning_rate": 0.000129425177664807, "loss": 0.2588, "step": 395 }, { "epoch": 0.88, "grad_norm": 0.21465706083350347, "learning_rate": 0.00012907257139996704, "loss": 0.287, "step": 396 }, { "epoch": 0.88, "grad_norm": 0.1742663966229961, "learning_rate": 0.0001287195698563388, "loss": 0.2668, "step": 397 }, { "epoch": 0.88, "grad_norm": 0.20553702826350395, "learning_rate": 0.0001283661778334297, "loss": 0.2739, "step": 398 }, { "epoch": 0.88, "grad_norm": 0.16765771945192084, "learning_rate": 0.0001280124001360562, "loss": 0.2594, "step": 399 }, { "epoch": 0.89, "grad_norm": 0.1943563901020515, "learning_rate": 0.0001276582415742786, "loss": 0.2708, "step": 400 }, { "epoch": 0.89, "grad_norm": 0.1896178424172107, "learning_rate": 0.0001273037069633354, "loss": 0.2639, "step": 401 }, { "epoch": 0.89, "grad_norm": 1.0712870595212587, "learning_rate": 0.00012694880112357808, "loss": 0.2765, "step": 402 }, { "epoch": 0.89, "grad_norm": 0.2092081458414992, "learning_rate": 0.00012659352888040547, "loss": 0.2589, "step": 403 }, { "epoch": 0.89, "grad_norm": 0.21211010351459872, "learning_rate": 0.0001262378950641979, "loss": 0.285, "step": 404 }, { "epoch": 0.9, "grad_norm": 
0.23377789093835202, "learning_rate": 0.00012588190451025207, "loss": 0.3038, "step": 405 }, { "epoch": 0.9, "grad_norm": 0.24727367430206143, "learning_rate": 0.00012552556205871478, "loss": 0.2577, "step": 406 }, { "epoch": 0.9, "grad_norm": 0.1800512798597796, "learning_rate": 0.00012516887255451735, "loss": 0.2392, "step": 407 }, { "epoch": 0.9, "grad_norm": 0.2114523591213948, "learning_rate": 0.00012481184084730976, "loss": 0.27, "step": 408 }, { "epoch": 0.91, "grad_norm": 0.22674225384993477, "learning_rate": 0.0001244544717913947, "loss": 0.2372, "step": 409 }, { "epoch": 0.91, "grad_norm": 0.17864125878915538, "learning_rate": 0.00012409677024566144, "loss": 0.2601, "step": 410 }, { "epoch": 0.91, "grad_norm": 0.1904684467485375, "learning_rate": 0.00012373874107352004, "loss": 0.2647, "step": 411 }, { "epoch": 0.91, "grad_norm": 0.17298512767214175, "learning_rate": 0.0001233803891428349, "loss": 0.2438, "step": 412 }, { "epoch": 0.91, "grad_norm": 0.17544690229663243, "learning_rate": 0.00012302171932585885, "loss": 0.2585, "step": 413 }, { "epoch": 0.92, "grad_norm": 0.20074139062085292, "learning_rate": 0.0001226627364991667, "loss": 0.2575, "step": 414 }, { "epoch": 0.92, "grad_norm": 0.18591883669880138, "learning_rate": 0.0001223034455435891, "loss": 0.2593, "step": 415 }, { "epoch": 0.92, "grad_norm": 0.20298533039339473, "learning_rate": 0.00012194385134414608, "loss": 0.2779, "step": 416 }, { "epoch": 0.92, "grad_norm": 0.19947012767946487, "learning_rate": 0.00012158395878998063, "loss": 0.2776, "step": 417 }, { "epoch": 0.93, "grad_norm": 0.18367461189103365, "learning_rate": 0.00012122377277429231, "loss": 0.2934, "step": 418 }, { "epoch": 0.93, "grad_norm": 0.18639410238342366, "learning_rate": 0.00012086329819427065, "loss": 0.2848, "step": 419 }, { "epoch": 0.93, "grad_norm": 0.18355116451736245, "learning_rate": 0.00012050253995102854, "loss": 0.2864, "step": 420 }, { "epoch": 0.93, "grad_norm": 0.16618761241876043, "learning_rate": 0.00012014150294953563, "loss": 0.2722, "step": 421 }, { "epoch": 0.93, "grad_norm": 0.1679981782550472, "learning_rate": 0.00011978019209855174, "loss": 0.2417, "step": 422 }, { "epoch": 0.94, "grad_norm": 0.17373986628880816, "learning_rate": 0.00011941861231055994, "loss": 0.2464, "step": 423 }, { "epoch": 0.94, "grad_norm": 0.16872090115264807, "learning_rate": 0.0001190567685016998, "loss": 0.2541, "step": 424 }, { "epoch": 0.94, "grad_norm": 0.17551340253440462, "learning_rate": 0.00011869466559170073, "loss": 0.2521, "step": 425 }, { "epoch": 0.94, "grad_norm": 0.18233468138593362, "learning_rate": 0.00011833230850381487, "loss": 0.2712, "step": 426 }, { "epoch": 0.95, "grad_norm": 0.18504780357931186, "learning_rate": 0.00011796970216475018, "loss": 0.2754, "step": 427 }, { "epoch": 0.95, "grad_norm": 0.18478336853897961, "learning_rate": 0.00011760685150460362, "loss": 0.2592, "step": 428 }, { "epoch": 0.95, "grad_norm": 0.19917097640828008, "learning_rate": 0.00011724376145679394, "loss": 0.2855, "step": 429 }, { "epoch": 0.95, "grad_norm": 0.19144241808727208, "learning_rate": 0.00011688043695799468, "loss": 0.2502, "step": 430 }, { "epoch": 0.95, "grad_norm": 0.18405239160044018, "learning_rate": 0.00011651688294806706, "loss": 0.2477, "step": 431 }, { "epoch": 0.96, "grad_norm": 0.1796721265270086, "learning_rate": 0.00011615310436999279, "loss": 0.249, "step": 432 }, { "epoch": 0.96, "grad_norm": 0.198923200273133, "learning_rate": 0.00011578910616980683, "loss": 0.2559, "step": 433 }, { "epoch": 0.96, "grad_norm": 
0.2056661440897021, "learning_rate": 0.00011542489329653024, "loss": 0.2645, "step": 434 }, { "epoch": 0.96, "grad_norm": 0.21059350822537876, "learning_rate": 0.00011506047070210282, "loss": 0.2747, "step": 435 }, { "epoch": 0.97, "grad_norm": 0.19445339084850757, "learning_rate": 0.00011469584334131578, "loss": 0.2732, "step": 436 }, { "epoch": 0.97, "grad_norm": 0.1978300160069901, "learning_rate": 0.0001143310161717444, "loss": 0.2653, "step": 437 }, { "epoch": 0.97, "grad_norm": 0.1992099703929807, "learning_rate": 0.00011396599415368061, "loss": 0.2775, "step": 438 }, { "epoch": 0.97, "grad_norm": 0.20064532246856512, "learning_rate": 0.00011360078225006562, "loss": 0.2642, "step": 439 }, { "epoch": 0.97, "grad_norm": 0.19513907362313387, "learning_rate": 0.00011323538542642227, "loss": 0.2554, "step": 440 }, { "epoch": 0.98, "grad_norm": 0.191943659814926, "learning_rate": 0.00011286980865078763, "loss": 0.2778, "step": 441 }, { "epoch": 0.98, "grad_norm": 0.1921885618320068, "learning_rate": 0.0001125040568936456, "loss": 0.2734, "step": 442 }, { "epoch": 0.98, "grad_norm": 0.19140570504247198, "learning_rate": 0.00011213813512785898, "loss": 0.2809, "step": 443 }, { "epoch": 0.98, "grad_norm": 0.1772706721475436, "learning_rate": 0.00011177204832860213, "loss": 0.2558, "step": 444 }, { "epoch": 0.99, "grad_norm": 0.17875694766580855, "learning_rate": 0.00011140580147329338, "loss": 0.2569, "step": 445 }, { "epoch": 0.99, "grad_norm": 0.1891065652583873, "learning_rate": 0.000111039399541527, "loss": 0.271, "step": 446 }, { "epoch": 0.99, "grad_norm": 0.17377167224369017, "learning_rate": 0.00011067284751500583, "loss": 0.2711, "step": 447 }, { "epoch": 0.99, "grad_norm": 0.16281001225472738, "learning_rate": 0.00011030615037747353, "loss": 0.2464, "step": 448 }, { "epoch": 0.99, "grad_norm": 0.17949964198592142, "learning_rate": 0.0001099393131146466, "loss": 0.2514, "step": 449 }, { "epoch": 1.0, "grad_norm": 0.18463419257880492, "learning_rate": 0.00010957234071414674, "loss": 0.2799, "step": 450 }, { "epoch": 1.0, "grad_norm": 0.18137088353673406, "learning_rate": 0.00010920523816543309, "loss": 0.2615, "step": 451 } ], "logging_steps": 1, "max_steps": 902, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 451, "total_flos": 1.317682551726696e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }
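The object above follows the Hugging Face Transformers `trainer_state.json` layout: an interleaved `log_history` of training records (`loss`, `grad_norm`, `learning_rate`, `step`) and evaluation records (`eval_loss`, `eval_runtime`, ...), here showing warmup to 2e-4 over the first 50 steps and eval loss falling from about 2.18 at step 1 to about 0.278 at step 339. As a minimal sketch for inspecting such a run, the snippet below parses `log_history` and plots the train and eval loss curves; the file path `checkpoint-451/trainer_state.json` and the output file name are assumptions for illustration, and `matplotlib` is assumed to be installed.

```python
# Minimal sketch: plot train/eval loss from a Hugging Face trainer_state.json.
# The checkpoint path and output file name below are hypothetical.
import json

import matplotlib.pyplot as plt

with open("checkpoint-451/trainer_state.json") as f:  # hypothetical path
    state = json.load(f)

# log_history interleaves training records (with "loss") and eval records
# (with "eval_loss"); split them before plotting.
history = state["log_history"]
train = [e for e in history if "loss" in e and "eval_loss" not in e]
evals = [e for e in history if "eval_loss" in e]

plt.plot([e["step"] for e in train], [e["loss"] for e in train], label="train loss")
plt.plot([e["step"] for e in evals], [e["eval_loss"] for e in evals],
         marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")  # hypothetical output name
```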