{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982466393921683, "eval_steps": 500, "global_step": 427, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023378141437755697, "grad_norm": 27.288526825189052, "learning_rate": 0.0, "loss": 2.2488, "step": 1 }, { "epoch": 0.004675628287551139, "grad_norm": 18.402597061645437, "learning_rate": 1.3511907721365987e-06, "loss": 1.7216, "step": 2 }, { "epoch": 0.0070134424313267095, "grad_norm": 20.827793127279097, "learning_rate": 2.1415867051569737e-06, "loss": 1.8096, "step": 3 }, { "epoch": 0.009351256575102279, "grad_norm": 17.887138033549412, "learning_rate": 2.7023815442731975e-06, "loss": 1.7273, "step": 4 }, { "epoch": 0.011689070718877849, "grad_norm": 13.059331628307975, "learning_rate": 3.137367815376517e-06, "loss": 1.3944, "step": 5 }, { "epoch": 0.014026884862653419, "grad_norm": 12.915220380638836, "learning_rate": 3.4927774772935725e-06, "loss": 1.3309, "step": 6 }, { "epoch": 0.01636469900642899, "grad_norm": 9.438170554483921, "learning_rate": 3.7932720647964956e-06, "loss": 1.1377, "step": 7 }, { "epoch": 0.018702513150204558, "grad_norm": 22.34498398581149, "learning_rate": 4.053572316409796e-06, "loss": 1.276, "step": 8 }, { "epoch": 0.02104032729398013, "grad_norm": 11.688684740044469, "learning_rate": 4.2831734103139475e-06, "loss": 1.1626, "step": 9 }, { "epoch": 0.023378141437755698, "grad_norm": 7.399035066119634, "learning_rate": 4.488558587513117e-06, "loss": 1.0172, "step": 10 }, { "epoch": 0.02571595558153127, "grad_norm": 6.844826809549996, "learning_rate": 4.674352079940294e-06, "loss": 1.0159, "step": 11 }, { "epoch": 0.028053769725306838, "grad_norm": 7.435042287761434, "learning_rate": 4.843968249430172e-06, "loss": 1.0191, "step": 12 }, { "epoch": 0.030391583869082407, "grad_norm": 8.142142185574825, "learning_rate": 5e-06, "loss": 1.0605, "step": 13 }, { "epoch": 0.03272939801285798, "grad_norm": 6.515223941306322, "learning_rate": 5e-06, "loss": 0.9758, "step": 14 }, { "epoch": 0.03506721215663355, "grad_norm": 6.0290403123052965, "learning_rate": 4.98792270531401e-06, "loss": 0.995, "step": 15 }, { "epoch": 0.037405026300409115, "grad_norm": 5.319952909218166, "learning_rate": 4.97584541062802e-06, "loss": 0.9515, "step": 16 }, { "epoch": 0.03974284044418469, "grad_norm": 5.748688589731486, "learning_rate": 4.963768115942029e-06, "loss": 1.0102, "step": 17 }, { "epoch": 0.04208065458796026, "grad_norm": 5.855208805312462, "learning_rate": 4.951690821256039e-06, "loss": 0.9919, "step": 18 }, { "epoch": 0.04441846873173583, "grad_norm": 5.223174635069425, "learning_rate": 4.939613526570048e-06, "loss": 0.9656, "step": 19 }, { "epoch": 0.046756282875511396, "grad_norm": 6.065299105647371, "learning_rate": 4.927536231884059e-06, "loss": 0.9285, "step": 20 }, { "epoch": 0.049094097019286964, "grad_norm": 6.07241545277926, "learning_rate": 4.915458937198068e-06, "loss": 1.0017, "step": 21 }, { "epoch": 0.05143191116306254, "grad_norm": 5.246067572533348, "learning_rate": 4.903381642512078e-06, "loss": 1.0095, "step": 22 }, { "epoch": 0.05376972530683811, "grad_norm": 5.990327031501364, "learning_rate": 4.891304347826087e-06, "loss": 0.9395, "step": 23 }, { "epoch": 0.056107539450613676, "grad_norm": 5.165210682799403, "learning_rate": 4.879227053140097e-06, "loss": 0.9407, "step": 24 }, { "epoch": 0.058445353594389245, "grad_norm": 5.126974516324422, "learning_rate": 4.867149758454107e-06, "loss": 0.949, "step": 25 }, { "epoch": 0.06078316773816481, "grad_norm": 5.363449994215859, "learning_rate": 4.855072463768117e-06, "loss": 0.9574, "step": 26 }, { "epoch": 0.06312098188194039, "grad_norm": 5.071572733466881, "learning_rate": 4.8429951690821256e-06, "loss": 0.9008, "step": 27 }, { "epoch": 0.06545879602571596, "grad_norm": 5.410969697138812, "learning_rate": 4.830917874396135e-06, "loss": 0.9422, "step": 28 }, { "epoch": 0.06779661016949153, "grad_norm": 4.602908185422313, "learning_rate": 4.818840579710145e-06, "loss": 0.8725, "step": 29 }, { "epoch": 0.0701344243132671, "grad_norm": 4.8619656541333836, "learning_rate": 4.806763285024155e-06, "loss": 0.9414, "step": 30 }, { "epoch": 0.07247223845704266, "grad_norm": 5.730566605120532, "learning_rate": 4.794685990338165e-06, "loss": 0.9039, "step": 31 }, { "epoch": 0.07481005260081823, "grad_norm": 5.038586687201418, "learning_rate": 4.782608695652174e-06, "loss": 0.9208, "step": 32 }, { "epoch": 0.0771478667445938, "grad_norm": 5.0552868730926335, "learning_rate": 4.770531400966184e-06, "loss": 0.8567, "step": 33 }, { "epoch": 0.07948568088836938, "grad_norm": 4.718130108871858, "learning_rate": 4.758454106280194e-06, "loss": 0.9145, "step": 34 }, { "epoch": 0.08182349503214495, "grad_norm": 5.492137838168964, "learning_rate": 4.746376811594204e-06, "loss": 0.8846, "step": 35 }, { "epoch": 0.08416130917592052, "grad_norm": 4.796280317690393, "learning_rate": 4.7342995169082125e-06, "loss": 0.8973, "step": 36 }, { "epoch": 0.08649912331969609, "grad_norm": 5.097877561946411, "learning_rate": 4.722222222222222e-06, "loss": 0.9225, "step": 37 }, { "epoch": 0.08883693746347165, "grad_norm": 5.149693059570453, "learning_rate": 4.710144927536232e-06, "loss": 0.9087, "step": 38 }, { "epoch": 0.09117475160724722, "grad_norm": 4.769756789814799, "learning_rate": 4.698067632850242e-06, "loss": 0.8372, "step": 39 }, { "epoch": 0.09351256575102279, "grad_norm": 4.303036243240873, "learning_rate": 4.6859903381642516e-06, "loss": 0.899, "step": 40 }, { "epoch": 0.09585037989479836, "grad_norm": 5.053977102743315, "learning_rate": 4.673913043478261e-06, "loss": 0.8242, "step": 41 }, { "epoch": 0.09818819403857393, "grad_norm": 4.448509206619331, "learning_rate": 4.661835748792271e-06, "loss": 0.8513, "step": 42 }, { "epoch": 0.1005260081823495, "grad_norm": 4.8321447335981595, "learning_rate": 4.649758454106281e-06, "loss": 0.8224, "step": 43 }, { "epoch": 0.10286382232612508, "grad_norm": 5.416510574830531, "learning_rate": 4.637681159420291e-06, "loss": 0.9078, "step": 44 }, { "epoch": 0.10520163646990065, "grad_norm": 5.548877279459332, "learning_rate": 4.6256038647342995e-06, "loss": 0.9292, "step": 45 }, { "epoch": 0.10753945061367622, "grad_norm": 5.023304416916682, "learning_rate": 4.613526570048309e-06, "loss": 0.8678, "step": 46 }, { "epoch": 0.10987726475745178, "grad_norm": 5.3492127097713995, "learning_rate": 4.601449275362319e-06, "loss": 0.8999, "step": 47 }, { "epoch": 0.11221507890122735, "grad_norm": 4.59060914495858, "learning_rate": 4.589371980676329e-06, "loss": 0.8611, "step": 48 }, { "epoch": 0.11455289304500292, "grad_norm": 4.659978410728117, "learning_rate": 4.5772946859903385e-06, "loss": 0.8553, "step": 49 }, { "epoch": 0.11689070718877849, "grad_norm": 4.869606947497931, "learning_rate": 4.565217391304348e-06, "loss": 0.8353, "step": 50 }, { "epoch": 0.11922852133255406, "grad_norm": 4.595092369703616, "learning_rate": 4.553140096618358e-06, "loss": 0.8852, "step": 51 }, { "epoch": 0.12156633547632963, "grad_norm": 4.846379704504629, "learning_rate": 4.541062801932368e-06, "loss": 0.8516, "step": 52 }, { "epoch": 0.12390414962010521, "grad_norm": 4.762019560202168, "learning_rate": 4.5289855072463775e-06, "loss": 0.8621, "step": 53 }, { "epoch": 0.12624196376388078, "grad_norm": 4.823419642191392, "learning_rate": 4.516908212560387e-06, "loss": 0.849, "step": 54 }, { "epoch": 0.12857977790765635, "grad_norm": 4.678982332878243, "learning_rate": 4.504830917874396e-06, "loss": 0.8595, "step": 55 }, { "epoch": 0.1309175920514319, "grad_norm": 4.695208099270892, "learning_rate": 4.492753623188406e-06, "loss": 0.918, "step": 56 }, { "epoch": 0.13325540619520748, "grad_norm": 4.692012801651267, "learning_rate": 4.480676328502416e-06, "loss": 0.8879, "step": 57 }, { "epoch": 0.13559322033898305, "grad_norm": 4.459981999724462, "learning_rate": 4.4685990338164255e-06, "loss": 0.8878, "step": 58 }, { "epoch": 0.13793103448275862, "grad_norm": 4.2801494436823, "learning_rate": 4.456521739130435e-06, "loss": 0.8695, "step": 59 }, { "epoch": 0.1402688486265342, "grad_norm": 4.86123317504702, "learning_rate": 4.444444444444444e-06, "loss": 0.839, "step": 60 }, { "epoch": 0.14260666277030976, "grad_norm": 4.621713656381368, "learning_rate": 4.432367149758455e-06, "loss": 0.8264, "step": 61 }, { "epoch": 0.14494447691408532, "grad_norm": 4.437318825045428, "learning_rate": 4.4202898550724645e-06, "loss": 0.8575, "step": 62 }, { "epoch": 0.1472822910578609, "grad_norm": 4.191896550350781, "learning_rate": 4.408212560386474e-06, "loss": 0.8231, "step": 63 }, { "epoch": 0.14962010520163646, "grad_norm": 4.934485372283743, "learning_rate": 4.396135265700483e-06, "loss": 0.908, "step": 64 }, { "epoch": 0.15195791934541203, "grad_norm": 5.164473939972992, "learning_rate": 4.384057971014493e-06, "loss": 0.8157, "step": 65 }, { "epoch": 0.1542957334891876, "grad_norm": 4.6359554134854655, "learning_rate": 4.371980676328503e-06, "loss": 0.8553, "step": 66 }, { "epoch": 0.15663354763296317, "grad_norm": 4.586287061115779, "learning_rate": 4.3599033816425124e-06, "loss": 0.8434, "step": 67 }, { "epoch": 0.15897136177673876, "grad_norm": 4.8424129486531, "learning_rate": 4.347826086956522e-06, "loss": 0.8788, "step": 68 }, { "epoch": 0.16130917592051433, "grad_norm": 6.155570830239365, "learning_rate": 4.335748792270532e-06, "loss": 0.9687, "step": 69 }, { "epoch": 0.1636469900642899, "grad_norm": 4.812494847678857, "learning_rate": 4.323671497584541e-06, "loss": 0.9001, "step": 70 }, { "epoch": 0.16598480420806547, "grad_norm": 4.5207315366098255, "learning_rate": 4.3115942028985515e-06, "loss": 0.8164, "step": 71 }, { "epoch": 0.16832261835184104, "grad_norm": 4.468118689699742, "learning_rate": 4.299516908212561e-06, "loss": 0.828, "step": 72 }, { "epoch": 0.1706604324956166, "grad_norm": 4.957803820804726, "learning_rate": 4.28743961352657e-06, "loss": 0.8509, "step": 73 }, { "epoch": 0.17299824663939217, "grad_norm": 4.994668979616406, "learning_rate": 4.27536231884058e-06, "loss": 0.8264, "step": 74 }, { "epoch": 0.17533606078316774, "grad_norm": 5.051317651149785, "learning_rate": 4.26328502415459e-06, "loss": 0.8575, "step": 75 }, { "epoch": 0.1776738749269433, "grad_norm": 4.982871471593161, "learning_rate": 4.251207729468599e-06, "loss": 0.7766, "step": 76 }, { "epoch": 0.18001168907071888, "grad_norm": 4.812654963388801, "learning_rate": 4.239130434782609e-06, "loss": 0.842, "step": 77 }, { "epoch": 0.18234950321449445, "grad_norm": 4.849638872368005, "learning_rate": 4.227053140096619e-06, "loss": 0.8493, "step": 78 }, { "epoch": 0.18468731735827001, "grad_norm": 4.941754403496056, "learning_rate": 4.214975845410628e-06, "loss": 0.8705, "step": 79 }, { "epoch": 0.18702513150204558, "grad_norm": 4.118521255369774, "learning_rate": 4.202898550724638e-06, "loss": 0.9022, "step": 80 }, { "epoch": 0.18936294564582115, "grad_norm": 5.048580106033392, "learning_rate": 4.190821256038647e-06, "loss": 0.8431, "step": 81 }, { "epoch": 0.19170075978959672, "grad_norm": 5.383766123063546, "learning_rate": 4.178743961352658e-06, "loss": 0.8892, "step": 82 }, { "epoch": 0.1940385739333723, "grad_norm": 4.850111002487489, "learning_rate": 4.166666666666667e-06, "loss": 0.9147, "step": 83 }, { "epoch": 0.19637638807714786, "grad_norm": 4.703827358788699, "learning_rate": 4.154589371980677e-06, "loss": 0.8407, "step": 84 }, { "epoch": 0.19871420222092342, "grad_norm": 4.5132494951253275, "learning_rate": 4.142512077294686e-06, "loss": 0.859, "step": 85 }, { "epoch": 0.201052016364699, "grad_norm": 4.425801289741148, "learning_rate": 4.130434782608696e-06, "loss": 0.8643, "step": 86 }, { "epoch": 0.2033898305084746, "grad_norm": 4.6519866473202285, "learning_rate": 4.118357487922706e-06, "loss": 0.8559, "step": 87 }, { "epoch": 0.20572764465225016, "grad_norm": 4.271767242791549, "learning_rate": 4.106280193236716e-06, "loss": 0.8115, "step": 88 }, { "epoch": 0.20806545879602573, "grad_norm": 5.056579518750136, "learning_rate": 4.0942028985507246e-06, "loss": 0.8447, "step": 89 }, { "epoch": 0.2104032729398013, "grad_norm": 4.075100416572746, "learning_rate": 4.082125603864734e-06, "loss": 0.7837, "step": 90 }, { "epoch": 0.21274108708357686, "grad_norm": 4.393779666632264, "learning_rate": 4.070048309178744e-06, "loss": 0.8368, "step": 91 }, { "epoch": 0.21507890122735243, "grad_norm": 4.322824034406939, "learning_rate": 4.057971014492754e-06, "loss": 0.7942, "step": 92 }, { "epoch": 0.217416715371128, "grad_norm": 4.691982719838354, "learning_rate": 4.045893719806764e-06, "loss": 0.8384, "step": 93 }, { "epoch": 0.21975452951490357, "grad_norm": 4.749714290659545, "learning_rate": 4.033816425120773e-06, "loss": 0.86, "step": 94 }, { "epoch": 0.22209234365867914, "grad_norm": 4.49073749526097, "learning_rate": 4.021739130434783e-06, "loss": 0.8796, "step": 95 }, { "epoch": 0.2244301578024547, "grad_norm": 4.612026680332374, "learning_rate": 4.009661835748793e-06, "loss": 0.7836, "step": 96 }, { "epoch": 0.22676797194623027, "grad_norm": 4.5466671401291165, "learning_rate": 3.997584541062803e-06, "loss": 0.8213, "step": 97 }, { "epoch": 0.22910578609000584, "grad_norm": 4.578959418279228, "learning_rate": 3.9855072463768115e-06, "loss": 0.8302, "step": 98 }, { "epoch": 0.2314436002337814, "grad_norm": 4.471310182272502, "learning_rate": 3.973429951690821e-06, "loss": 0.8386, "step": 99 }, { "epoch": 0.23378141437755698, "grad_norm": 4.444066950873127, "learning_rate": 3.961352657004831e-06, "loss": 0.8672, "step": 100 }, { "epoch": 0.23611922852133255, "grad_norm": 4.08994098536812, "learning_rate": 3.949275362318841e-06, "loss": 0.7914, "step": 101 }, { "epoch": 0.23845704266510812, "grad_norm": 5.867972858556011, "learning_rate": 3.9371980676328506e-06, "loss": 0.834, "step": 102 }, { "epoch": 0.24079485680888368, "grad_norm": 4.33178424044995, "learning_rate": 3.92512077294686e-06, "loss": 0.8312, "step": 103 }, { "epoch": 0.24313267095265925, "grad_norm": 4.422360019571021, "learning_rate": 3.91304347826087e-06, "loss": 0.8054, "step": 104 }, { "epoch": 0.24547048509643482, "grad_norm": 4.540760031449362, "learning_rate": 3.90096618357488e-06, "loss": 0.8011, "step": 105 }, { "epoch": 0.24780829924021042, "grad_norm": 4.577644817701169, "learning_rate": 3.88888888888889e-06, "loss": 0.7851, "step": 106 }, { "epoch": 0.25014611338398596, "grad_norm": 4.750903595759052, "learning_rate": 3.8768115942028985e-06, "loss": 0.8496, "step": 107 }, { "epoch": 0.25248392752776155, "grad_norm": 4.744977001781623, "learning_rate": 3.864734299516908e-06, "loss": 0.8218, "step": 108 }, { "epoch": 0.2548217416715371, "grad_norm": 4.548950141262851, "learning_rate": 3.852657004830918e-06, "loss": 0.8053, "step": 109 }, { "epoch": 0.2571595558153127, "grad_norm": 4.44828603075951, "learning_rate": 3.840579710144928e-06, "loss": 0.8231, "step": 110 }, { "epoch": 0.25949736995908823, "grad_norm": 4.672161591073822, "learning_rate": 3.8285024154589375e-06, "loss": 0.8389, "step": 111 }, { "epoch": 0.2618351841028638, "grad_norm": 4.526274586937092, "learning_rate": 3.816425120772947e-06, "loss": 0.8683, "step": 112 }, { "epoch": 0.26417299824663937, "grad_norm": 4.603415978914653, "learning_rate": 3.804347826086957e-06, "loss": 0.8206, "step": 113 }, { "epoch": 0.26651081239041496, "grad_norm": 4.343843088593362, "learning_rate": 3.792270531400967e-06, "loss": 0.823, "step": 114 }, { "epoch": 0.2688486265341905, "grad_norm": 4.131180727748698, "learning_rate": 3.780193236714976e-06, "loss": 0.7964, "step": 115 }, { "epoch": 0.2711864406779661, "grad_norm": 5.611563677944062, "learning_rate": 3.768115942028986e-06, "loss": 0.8529, "step": 116 }, { "epoch": 0.2735242548217417, "grad_norm": 4.315382063517201, "learning_rate": 3.7560386473429956e-06, "loss": 0.7849, "step": 117 }, { "epoch": 0.27586206896551724, "grad_norm": 4.3301657812789776, "learning_rate": 3.743961352657005e-06, "loss": 0.8392, "step": 118 }, { "epoch": 0.27819988310929283, "grad_norm": 4.763659062354643, "learning_rate": 3.7318840579710147e-06, "loss": 0.7846, "step": 119 }, { "epoch": 0.2805376972530684, "grad_norm": 4.531318611414816, "learning_rate": 3.7198067632850245e-06, "loss": 0.8335, "step": 120 }, { "epoch": 0.28287551139684397, "grad_norm": 4.4418077648050485, "learning_rate": 3.707729468599034e-06, "loss": 0.7858, "step": 121 }, { "epoch": 0.2852133255406195, "grad_norm": 4.39068842397474, "learning_rate": 3.6956521739130436e-06, "loss": 0.8408, "step": 122 }, { "epoch": 0.2875511396843951, "grad_norm": 4.585137838540199, "learning_rate": 3.6835748792270538e-06, "loss": 0.8316, "step": 123 }, { "epoch": 0.28988895382817065, "grad_norm": 4.319672080062613, "learning_rate": 3.6714975845410635e-06, "loss": 0.8241, "step": 124 }, { "epoch": 0.29222676797194624, "grad_norm": 4.131090234388279, "learning_rate": 3.659420289855073e-06, "loss": 0.7416, "step": 125 }, { "epoch": 0.2945645821157218, "grad_norm": 4.081456252490184, "learning_rate": 3.6473429951690826e-06, "loss": 0.7958, "step": 126 }, { "epoch": 0.2969023962594974, "grad_norm": 4.090503599319394, "learning_rate": 3.635265700483092e-06, "loss": 0.8096, "step": 127 }, { "epoch": 0.2992402104032729, "grad_norm": 4.129285724564573, "learning_rate": 3.6231884057971017e-06, "loss": 0.7918, "step": 128 }, { "epoch": 0.3015780245470485, "grad_norm": 4.506022555765926, "learning_rate": 3.6111111111111115e-06, "loss": 0.8333, "step": 129 }, { "epoch": 0.30391583869082406, "grad_norm": 4.151575198600969, "learning_rate": 3.5990338164251208e-06, "loss": 0.7713, "step": 130 }, { "epoch": 0.30625365283459965, "grad_norm": 4.614683656771631, "learning_rate": 3.5869565217391305e-06, "loss": 0.8298, "step": 131 }, { "epoch": 0.3085914669783752, "grad_norm": 4.6094981031628075, "learning_rate": 3.5748792270531403e-06, "loss": 0.8217, "step": 132 }, { "epoch": 0.3109292811221508, "grad_norm": 4.2999582776551675, "learning_rate": 3.5628019323671496e-06, "loss": 0.7968, "step": 133 }, { "epoch": 0.31326709526592633, "grad_norm": 4.864198700798981, "learning_rate": 3.55072463768116e-06, "loss": 0.8141, "step": 134 }, { "epoch": 0.31560490940970193, "grad_norm": 4.601546334463328, "learning_rate": 3.5386473429951696e-06, "loss": 0.7925, "step": 135 }, { "epoch": 0.3179427235534775, "grad_norm": 4.089485101723296, "learning_rate": 3.5265700483091793e-06, "loss": 0.7873, "step": 136 }, { "epoch": 0.32028053769725306, "grad_norm": 4.0777367885745806, "learning_rate": 3.5144927536231887e-06, "loss": 0.7985, "step": 137 }, { "epoch": 0.32261835184102866, "grad_norm": 4.832689220436005, "learning_rate": 3.5024154589371984e-06, "loss": 0.8306, "step": 138 }, { "epoch": 0.3249561659848042, "grad_norm": 4.888417681228503, "learning_rate": 3.490338164251208e-06, "loss": 0.8353, "step": 139 }, { "epoch": 0.3272939801285798, "grad_norm": 4.28948650105686, "learning_rate": 3.4782608695652175e-06, "loss": 0.8057, "step": 140 }, { "epoch": 0.32963179427235534, "grad_norm": 4.203178774124529, "learning_rate": 3.4661835748792273e-06, "loss": 0.7788, "step": 141 }, { "epoch": 0.33196960841613093, "grad_norm": 4.637106026831514, "learning_rate": 3.4541062801932366e-06, "loss": 0.8521, "step": 142 }, { "epoch": 0.3343074225599065, "grad_norm": 4.350395114537057, "learning_rate": 3.4420289855072464e-06, "loss": 0.7968, "step": 143 }, { "epoch": 0.33664523670368207, "grad_norm": 4.474607456827939, "learning_rate": 3.4299516908212565e-06, "loss": 0.8208, "step": 144 }, { "epoch": 0.3389830508474576, "grad_norm": 4.100288353060924, "learning_rate": 3.4178743961352663e-06, "loss": 0.8165, "step": 145 }, { "epoch": 0.3413208649912332, "grad_norm": 4.6247997756094845, "learning_rate": 3.4057971014492756e-06, "loss": 0.8294, "step": 146 }, { "epoch": 0.34365867913500875, "grad_norm": 4.525169765596723, "learning_rate": 3.3937198067632854e-06, "loss": 0.7713, "step": 147 }, { "epoch": 0.34599649327878435, "grad_norm": 4.442206881786442, "learning_rate": 3.381642512077295e-06, "loss": 0.82, "step": 148 }, { "epoch": 0.3483343074225599, "grad_norm": 4.225556484795958, "learning_rate": 3.3695652173913045e-06, "loss": 0.7886, "step": 149 }, { "epoch": 0.3506721215663355, "grad_norm": 4.268743583707888, "learning_rate": 3.3574879227053142e-06, "loss": 0.7762, "step": 150 }, { "epoch": 0.353009935710111, "grad_norm": 4.338428118785664, "learning_rate": 3.345410628019324e-06, "loss": 0.7719, "step": 151 }, { "epoch": 0.3553477498538866, "grad_norm": 4.188696391446484, "learning_rate": 3.3333333333333333e-06, "loss": 0.7745, "step": 152 }, { "epoch": 0.35768556399766216, "grad_norm": 4.310914121193176, "learning_rate": 3.321256038647343e-06, "loss": 0.8188, "step": 153 }, { "epoch": 0.36002337814143776, "grad_norm": 4.391656829031555, "learning_rate": 3.3091787439613533e-06, "loss": 0.8148, "step": 154 }, { "epoch": 0.36236119228521335, "grad_norm": 4.104259018738402, "learning_rate": 3.2971014492753626e-06, "loss": 0.8519, "step": 155 }, { "epoch": 0.3646990064289889, "grad_norm": 4.25629990334181, "learning_rate": 3.2850241545893724e-06, "loss": 0.7983, "step": 156 }, { "epoch": 0.3670368205727645, "grad_norm": 4.134990269789036, "learning_rate": 3.272946859903382e-06, "loss": 0.7852, "step": 157 }, { "epoch": 0.36937463471654003, "grad_norm": 4.0420970622040135, "learning_rate": 3.2608695652173914e-06, "loss": 0.7992, "step": 158 }, { "epoch": 0.3717124488603156, "grad_norm": 4.341222672754024, "learning_rate": 3.248792270531401e-06, "loss": 0.7704, "step": 159 }, { "epoch": 0.37405026300409117, "grad_norm": 4.115523347634753, "learning_rate": 3.236714975845411e-06, "loss": 0.791, "step": 160 }, { "epoch": 0.37638807714786676, "grad_norm": 4.136587110231359, "learning_rate": 3.2246376811594203e-06, "loss": 0.7752, "step": 161 }, { "epoch": 0.3787258912916423, "grad_norm": 4.504460772929252, "learning_rate": 3.21256038647343e-06, "loss": 0.8176, "step": 162 }, { "epoch": 0.3810637054354179, "grad_norm": 4.629377407126395, "learning_rate": 3.20048309178744e-06, "loss": 0.8275, "step": 163 }, { "epoch": 0.38340151957919344, "grad_norm": 4.422477761962599, "learning_rate": 3.188405797101449e-06, "loss": 0.7847, "step": 164 }, { "epoch": 0.38573933372296904, "grad_norm": 3.9888038106102153, "learning_rate": 3.1763285024154593e-06, "loss": 0.7939, "step": 165 }, { "epoch": 0.3880771478667446, "grad_norm": 4.125918892903183, "learning_rate": 3.164251207729469e-06, "loss": 0.7717, "step": 166 }, { "epoch": 0.3904149620105202, "grad_norm": 6.885413034719951, "learning_rate": 3.152173913043479e-06, "loss": 0.8514, "step": 167 }, { "epoch": 0.3927527761542957, "grad_norm": 4.446340003037039, "learning_rate": 3.140096618357488e-06, "loss": 0.813, "step": 168 }, { "epoch": 0.3950905902980713, "grad_norm": 3.9959566422822346, "learning_rate": 3.128019323671498e-06, "loss": 0.7776, "step": 169 }, { "epoch": 0.39742840444184685, "grad_norm": 4.627421389612255, "learning_rate": 3.1159420289855073e-06, "loss": 0.8395, "step": 170 }, { "epoch": 0.39976621858562245, "grad_norm": 4.118715295949323, "learning_rate": 3.103864734299517e-06, "loss": 0.7824, "step": 171 }, { "epoch": 0.402104032729398, "grad_norm": 4.109354113391549, "learning_rate": 3.0917874396135268e-06, "loss": 0.7961, "step": 172 }, { "epoch": 0.4044418468731736, "grad_norm": 4.439845150489727, "learning_rate": 3.079710144927536e-06, "loss": 0.8035, "step": 173 }, { "epoch": 0.4067796610169492, "grad_norm": 4.358250626799815, "learning_rate": 3.067632850241546e-06, "loss": 0.7829, "step": 174 }, { "epoch": 0.4091174751607247, "grad_norm": 4.43053050152037, "learning_rate": 3.055555555555556e-06, "loss": 0.7554, "step": 175 }, { "epoch": 0.4114552893045003, "grad_norm": 4.324105830729812, "learning_rate": 3.043478260869566e-06, "loss": 0.7763, "step": 176 }, { "epoch": 0.41379310344827586, "grad_norm": 4.505708676229393, "learning_rate": 3.031400966183575e-06, "loss": 0.8052, "step": 177 }, { "epoch": 0.41613091759205145, "grad_norm": 4.198009455233572, "learning_rate": 3.019323671497585e-06, "loss": 0.8036, "step": 178 }, { "epoch": 0.418468731735827, "grad_norm": 4.255888057785401, "learning_rate": 3.0072463768115946e-06, "loss": 0.8675, "step": 179 }, { "epoch": 0.4208065458796026, "grad_norm": 4.166498155365259, "learning_rate": 2.995169082125604e-06, "loss": 0.8099, "step": 180 }, { "epoch": 0.42314436002337813, "grad_norm": 4.471408293419965, "learning_rate": 2.9830917874396137e-06, "loss": 0.8025, "step": 181 }, { "epoch": 0.4254821741671537, "grad_norm": 4.910816764257679, "learning_rate": 2.9710144927536235e-06, "loss": 0.7702, "step": 182 }, { "epoch": 0.42781998831092927, "grad_norm": 4.071039233797094, "learning_rate": 2.958937198067633e-06, "loss": 0.8143, "step": 183 }, { "epoch": 0.43015780245470486, "grad_norm": 4.738565032335615, "learning_rate": 2.9468599033816426e-06, "loss": 0.8158, "step": 184 }, { "epoch": 0.4324956165984804, "grad_norm": 4.2936029356268195, "learning_rate": 2.9347826086956528e-06, "loss": 0.7874, "step": 185 }, { "epoch": 0.434833430742256, "grad_norm": 4.206590096270031, "learning_rate": 2.922705314009662e-06, "loss": 0.7997, "step": 186 }, { "epoch": 0.43717124488603154, "grad_norm": 4.2051171328892085, "learning_rate": 2.910628019323672e-06, "loss": 0.787, "step": 187 }, { "epoch": 0.43950905902980714, "grad_norm": 4.245918333471198, "learning_rate": 2.8985507246376816e-06, "loss": 0.7997, "step": 188 }, { "epoch": 0.4418468731735827, "grad_norm": 4.179370789694772, "learning_rate": 2.886473429951691e-06, "loss": 0.7759, "step": 189 }, { "epoch": 0.4441846873173583, "grad_norm": 4.41515451612343, "learning_rate": 2.8743961352657007e-06, "loss": 0.7886, "step": 190 }, { "epoch": 0.4465225014611338, "grad_norm": 4.227222440463386, "learning_rate": 2.8623188405797105e-06, "loss": 0.8294, "step": 191 }, { "epoch": 0.4488603156049094, "grad_norm": 4.095256731977333, "learning_rate": 2.85024154589372e-06, "loss": 0.7604, "step": 192 }, { "epoch": 0.451198129748685, "grad_norm": 4.286339845869899, "learning_rate": 2.8381642512077295e-06, "loss": 0.8237, "step": 193 }, { "epoch": 0.45353594389246055, "grad_norm": 4.141328341649525, "learning_rate": 2.8260869565217393e-06, "loss": 0.7961, "step": 194 }, { "epoch": 0.45587375803623614, "grad_norm": 4.522982085235291, "learning_rate": 2.8140096618357486e-06, "loss": 0.7918, "step": 195 }, { "epoch": 0.4582115721800117, "grad_norm": 4.9933547683547745, "learning_rate": 2.801932367149759e-06, "loss": 0.8151, "step": 196 }, { "epoch": 0.4605493863237873, "grad_norm": 3.8642864404581463, "learning_rate": 2.7898550724637686e-06, "loss": 0.7411, "step": 197 }, { "epoch": 0.4628872004675628, "grad_norm": 4.304180579272247, "learning_rate": 2.7777777777777783e-06, "loss": 0.7975, "step": 198 }, { "epoch": 0.4652250146113384, "grad_norm": 4.246581554029021, "learning_rate": 2.7657004830917877e-06, "loss": 0.829, "step": 199 }, { "epoch": 0.46756282875511396, "grad_norm": 4.257923593734172, "learning_rate": 2.7536231884057974e-06, "loss": 0.7475, "step": 200 }, { "epoch": 0.46990064289888955, "grad_norm": 4.001585884428085, "learning_rate": 2.7415458937198068e-06, "loss": 0.7866, "step": 201 }, { "epoch": 0.4722384570426651, "grad_norm": 4.064057741085377, "learning_rate": 2.7294685990338165e-06, "loss": 0.7861, "step": 202 }, { "epoch": 0.4745762711864407, "grad_norm": 4.10748108128691, "learning_rate": 2.7173913043478263e-06, "loss": 0.7735, "step": 203 }, { "epoch": 0.47691408533021623, "grad_norm": 3.9433247912828455, "learning_rate": 2.7053140096618356e-06, "loss": 0.7494, "step": 204 }, { "epoch": 0.4792518994739918, "grad_norm": 4.368990885761068, "learning_rate": 2.6932367149758454e-06, "loss": 0.7961, "step": 205 }, { "epoch": 0.48158971361776737, "grad_norm": 4.323297445539955, "learning_rate": 2.6811594202898555e-06, "loss": 0.7498, "step": 206 }, { "epoch": 0.48392752776154296, "grad_norm": 4.276797241413841, "learning_rate": 2.6690821256038653e-06, "loss": 0.79, "step": 207 }, { "epoch": 0.4862653419053185, "grad_norm": 4.29615738858519, "learning_rate": 2.6570048309178746e-06, "loss": 0.7762, "step": 208 }, { "epoch": 0.4886031560490941, "grad_norm": 4.24658335062537, "learning_rate": 2.6449275362318844e-06, "loss": 0.7547, "step": 209 }, { "epoch": 0.49094097019286964, "grad_norm": 4.140652638469078, "learning_rate": 2.632850241545894e-06, "loss": 0.7568, "step": 210 }, { "epoch": 0.49327878433664524, "grad_norm": 4.355835930781116, "learning_rate": 2.6207729468599035e-06, "loss": 0.8005, "step": 211 }, { "epoch": 0.49561659848042083, "grad_norm": 4.1002906789316045, "learning_rate": 2.6086956521739132e-06, "loss": 0.7791, "step": 212 }, { "epoch": 0.4979544126241964, "grad_norm": 4.210038749172179, "learning_rate": 2.596618357487923e-06, "loss": 0.7777, "step": 213 }, { "epoch": 0.5002922267679719, "grad_norm": 4.1435757469488985, "learning_rate": 2.5845410628019323e-06, "loss": 0.7824, "step": 214 }, { "epoch": 0.5026300409117476, "grad_norm": 4.309944612009968, "learning_rate": 2.572463768115942e-06, "loss": 0.7625, "step": 215 }, { "epoch": 0.5049678550555231, "grad_norm": 4.662526042139382, "learning_rate": 2.5603864734299523e-06, "loss": 0.7873, "step": 216 }, { "epoch": 0.5073056691992986, "grad_norm": 4.473614799031895, "learning_rate": 2.5483091787439616e-06, "loss": 0.7737, "step": 217 }, { "epoch": 0.5096434833430742, "grad_norm": 4.54082051832202, "learning_rate": 2.5362318840579714e-06, "loss": 0.782, "step": 218 }, { "epoch": 0.5119812974868498, "grad_norm": 3.9808775866846817, "learning_rate": 2.524154589371981e-06, "loss": 0.7592, "step": 219 }, { "epoch": 0.5143191116306254, "grad_norm": 4.233088111283031, "learning_rate": 2.5120772946859904e-06, "loss": 0.774, "step": 220 }, { "epoch": 0.5166569257744009, "grad_norm": 4.179314655537464, "learning_rate": 2.5e-06, "loss": 0.7936, "step": 221 }, { "epoch": 0.5189947399181765, "grad_norm": 4.808766886416466, "learning_rate": 2.48792270531401e-06, "loss": 0.7961, "step": 222 }, { "epoch": 0.5213325540619521, "grad_norm": 4.088801764052967, "learning_rate": 2.4758454106280193e-06, "loss": 0.7693, "step": 223 }, { "epoch": 0.5236703682057277, "grad_norm": 4.1844548782576005, "learning_rate": 2.4637681159420295e-06, "loss": 0.7961, "step": 224 }, { "epoch": 0.5260081823495032, "grad_norm": 3.909844659514703, "learning_rate": 2.451690821256039e-06, "loss": 0.7304, "step": 225 }, { "epoch": 0.5283459964932787, "grad_norm": 3.7096435860994346, "learning_rate": 2.4396135265700486e-06, "loss": 0.7712, "step": 226 }, { "epoch": 0.5306838106370544, "grad_norm": 4.0389484559123305, "learning_rate": 2.4275362318840583e-06, "loss": 0.7711, "step": 227 }, { "epoch": 0.5330216247808299, "grad_norm": 4.171802534409844, "learning_rate": 2.4154589371980677e-06, "loss": 0.7768, "step": 228 }, { "epoch": 0.5353594389246055, "grad_norm": 4.636520882862149, "learning_rate": 2.4033816425120774e-06, "loss": 0.7832, "step": 229 }, { "epoch": 0.537697253068381, "grad_norm": 4.2073440647978675, "learning_rate": 2.391304347826087e-06, "loss": 0.7816, "step": 230 }, { "epoch": 0.5400350672121567, "grad_norm": 4.115009346971059, "learning_rate": 2.379227053140097e-06, "loss": 0.7152, "step": 231 }, { "epoch": 0.5423728813559322, "grad_norm": 4.47134068227285, "learning_rate": 2.3671497584541063e-06, "loss": 0.7898, "step": 232 }, { "epoch": 0.5447106954997077, "grad_norm": 4.78251740854767, "learning_rate": 2.355072463768116e-06, "loss": 0.8101, "step": 233 }, { "epoch": 0.5470485096434834, "grad_norm": 4.735288223469208, "learning_rate": 2.3429951690821258e-06, "loss": 0.7864, "step": 234 }, { "epoch": 0.5493863237872589, "grad_norm": 4.445520808429391, "learning_rate": 2.3309178743961355e-06, "loss": 0.7986, "step": 235 }, { "epoch": 0.5517241379310345, "grad_norm": 4.83504723163877, "learning_rate": 2.3188405797101453e-06, "loss": 0.8231, "step": 236 }, { "epoch": 0.55406195207481, "grad_norm": 3.9498177063802897, "learning_rate": 2.3067632850241546e-06, "loss": 0.7834, "step": 237 }, { "epoch": 0.5563997662185857, "grad_norm": 4.190234074575243, "learning_rate": 2.2946859903381644e-06, "loss": 0.7839, "step": 238 }, { "epoch": 0.5587375803623612, "grad_norm": 4.76462271734834, "learning_rate": 2.282608695652174e-06, "loss": 0.8258, "step": 239 }, { "epoch": 0.5610753945061367, "grad_norm": 4.369965626736373, "learning_rate": 2.270531400966184e-06, "loss": 0.7927, "step": 240 }, { "epoch": 0.5634132086499123, "grad_norm": 4.423067504974851, "learning_rate": 2.2584541062801937e-06, "loss": 0.8181, "step": 241 }, { "epoch": 0.5657510227936879, "grad_norm": 4.117514088831818, "learning_rate": 2.246376811594203e-06, "loss": 0.7471, "step": 242 }, { "epoch": 0.5680888369374635, "grad_norm": 4.208191494707427, "learning_rate": 2.2342995169082127e-06, "loss": 0.7936, "step": 243 }, { "epoch": 0.570426651081239, "grad_norm": 4.30348767627021, "learning_rate": 2.222222222222222e-06, "loss": 0.8087, "step": 244 }, { "epoch": 0.5727644652250146, "grad_norm": 4.08781387103947, "learning_rate": 2.2101449275362323e-06, "loss": 0.7712, "step": 245 }, { "epoch": 0.5751022793687902, "grad_norm": 4.255214633571236, "learning_rate": 2.1980676328502416e-06, "loss": 0.7327, "step": 246 }, { "epoch": 0.5774400935125658, "grad_norm": 4.249395888532918, "learning_rate": 2.1859903381642513e-06, "loss": 0.8115, "step": 247 }, { "epoch": 0.5797779076563413, "grad_norm": 4.048350886158577, "learning_rate": 2.173913043478261e-06, "loss": 0.7629, "step": 248 }, { "epoch": 0.5821157218001168, "grad_norm": 4.286991029118236, "learning_rate": 2.1618357487922704e-06, "loss": 0.7748, "step": 249 }, { "epoch": 0.5844535359438925, "grad_norm": 4.473519294462659, "learning_rate": 2.1497584541062806e-06, "loss": 0.7786, "step": 250 }, { "epoch": 0.586791350087668, "grad_norm": 4.511510327301669, "learning_rate": 2.13768115942029e-06, "loss": 0.8125, "step": 251 }, { "epoch": 0.5891291642314436, "grad_norm": 4.198745204040387, "learning_rate": 2.1256038647342997e-06, "loss": 0.7843, "step": 252 }, { "epoch": 0.5914669783752192, "grad_norm": 4.3568648354588655, "learning_rate": 2.1135265700483095e-06, "loss": 0.7346, "step": 253 }, { "epoch": 0.5938047925189948, "grad_norm": 3.8942460823301412, "learning_rate": 2.101449275362319e-06, "loss": 0.7879, "step": 254 }, { "epoch": 0.5961426066627703, "grad_norm": 4.221148903821956, "learning_rate": 2.089371980676329e-06, "loss": 0.799, "step": 255 }, { "epoch": 0.5984804208065458, "grad_norm": 4.041691704636457, "learning_rate": 2.0772946859903383e-06, "loss": 0.767, "step": 256 }, { "epoch": 0.6008182349503215, "grad_norm": 4.03197715174544, "learning_rate": 2.065217391304348e-06, "loss": 0.7487, "step": 257 }, { "epoch": 0.603156049094097, "grad_norm": 4.082902353599498, "learning_rate": 2.053140096618358e-06, "loss": 0.7874, "step": 258 }, { "epoch": 0.6054938632378726, "grad_norm": 3.7781639431570557, "learning_rate": 2.041062801932367e-06, "loss": 0.7721, "step": 259 }, { "epoch": 0.6078316773816481, "grad_norm": 4.280421267303715, "learning_rate": 2.028985507246377e-06, "loss": 0.783, "step": 260 }, { "epoch": 0.6101694915254238, "grad_norm": 4.073869260462684, "learning_rate": 2.0169082125603867e-06, "loss": 0.7759, "step": 261 }, { "epoch": 0.6125073056691993, "grad_norm": 3.935130784068012, "learning_rate": 2.0048309178743964e-06, "loss": 0.7669, "step": 262 }, { "epoch": 0.6148451198129748, "grad_norm": 4.40643829592683, "learning_rate": 1.9927536231884058e-06, "loss": 0.7572, "step": 263 }, { "epoch": 0.6171829339567504, "grad_norm": 4.337844456783807, "learning_rate": 1.9806763285024155e-06, "loss": 0.7605, "step": 264 }, { "epoch": 0.619520748100526, "grad_norm": 4.281102087431204, "learning_rate": 1.9685990338164253e-06, "loss": 0.7393, "step": 265 }, { "epoch": 0.6218585622443016, "grad_norm": 4.23207914041172, "learning_rate": 1.956521739130435e-06, "loss": 0.7794, "step": 266 }, { "epoch": 0.6241963763880771, "grad_norm": 3.9282868393703896, "learning_rate": 1.944444444444445e-06, "loss": 0.7782, "step": 267 }, { "epoch": 0.6265341905318527, "grad_norm": 4.098138917146235, "learning_rate": 1.932367149758454e-06, "loss": 0.7725, "step": 268 }, { "epoch": 0.6288720046756283, "grad_norm": 4.141313603560724, "learning_rate": 1.920289855072464e-06, "loss": 0.7785, "step": 269 }, { "epoch": 0.6312098188194039, "grad_norm": 4.611198038918517, "learning_rate": 1.9082125603864736e-06, "loss": 0.8185, "step": 270 }, { "epoch": 0.6335476329631794, "grad_norm": 4.452172749748544, "learning_rate": 1.8961352657004834e-06, "loss": 0.7703, "step": 271 }, { "epoch": 0.635885447106955, "grad_norm": 4.454099100217199, "learning_rate": 1.884057971014493e-06, "loss": 0.7756, "step": 272 }, { "epoch": 0.6382232612507306, "grad_norm": 4.159216947583455, "learning_rate": 1.8719806763285025e-06, "loss": 0.7358, "step": 273 }, { "epoch": 0.6405610753945061, "grad_norm": 4.0088196320012885, "learning_rate": 1.8599033816425122e-06, "loss": 0.8002, "step": 274 }, { "epoch": 0.6428988895382817, "grad_norm": 4.197686175636046, "learning_rate": 1.8478260869565218e-06, "loss": 0.7998, "step": 275 }, { "epoch": 0.6452367036820573, "grad_norm": 4.373828840174765, "learning_rate": 1.8357487922705318e-06, "loss": 0.742, "step": 276 }, { "epoch": 0.6475745178258329, "grad_norm": 4.212073348085054, "learning_rate": 1.8236714975845413e-06, "loss": 0.7678, "step": 277 }, { "epoch": 0.6499123319696084, "grad_norm": 3.972532257275605, "learning_rate": 1.8115942028985508e-06, "loss": 0.7757, "step": 278 }, { "epoch": 0.6522501461133839, "grad_norm": 4.141324887414669, "learning_rate": 1.7995169082125604e-06, "loss": 0.7447, "step": 279 }, { "epoch": 0.6545879602571596, "grad_norm": 4.319306461683, "learning_rate": 1.7874396135265702e-06, "loss": 0.7669, "step": 280 }, { "epoch": 0.6569257744009351, "grad_norm": 4.13159761798667, "learning_rate": 1.77536231884058e-06, "loss": 0.753, "step": 281 }, { "epoch": 0.6592635885447107, "grad_norm": 4.261205598617194, "learning_rate": 1.7632850241545897e-06, "loss": 0.7867, "step": 282 }, { "epoch": 0.6616014026884862, "grad_norm": 4.043224440888056, "learning_rate": 1.7512077294685992e-06, "loss": 0.7634, "step": 283 }, { "epoch": 0.6639392168322619, "grad_norm": 4.221366014724788, "learning_rate": 1.7391304347826088e-06, "loss": 0.8032, "step": 284 }, { "epoch": 0.6662770309760374, "grad_norm": 4.2120362159497935, "learning_rate": 1.7270531400966183e-06, "loss": 0.7449, "step": 285 }, { "epoch": 0.668614845119813, "grad_norm": 4.330019099169185, "learning_rate": 1.7149758454106283e-06, "loss": 0.7641, "step": 286 }, { "epoch": 0.6709526592635885, "grad_norm": 4.234551345137344, "learning_rate": 1.7028985507246378e-06, "loss": 0.7785, "step": 287 }, { "epoch": 0.6732904734073641, "grad_norm": 4.789360597178873, "learning_rate": 1.6908212560386476e-06, "loss": 0.7517, "step": 288 }, { "epoch": 0.6756282875511397, "grad_norm": 4.087545337483895, "learning_rate": 1.6787439613526571e-06, "loss": 0.7398, "step": 289 }, { "epoch": 0.6779661016949152, "grad_norm": 4.048928229662754, "learning_rate": 1.6666666666666667e-06, "loss": 0.7759, "step": 290 }, { "epoch": 0.6803039158386909, "grad_norm": 4.258228190717208, "learning_rate": 1.6545893719806766e-06, "loss": 0.7816, "step": 291 }, { "epoch": 0.6826417299824664, "grad_norm": 4.207730290983508, "learning_rate": 1.6425120772946862e-06, "loss": 0.7492, "step": 292 }, { "epoch": 0.684979544126242, "grad_norm": 4.211632269620855, "learning_rate": 1.6304347826086957e-06, "loss": 0.8045, "step": 293 }, { "epoch": 0.6873173582700175, "grad_norm": 4.2791266083196575, "learning_rate": 1.6183574879227055e-06, "loss": 0.7686, "step": 294 }, { "epoch": 0.6896551724137931, "grad_norm": 4.400251918863611, "learning_rate": 1.606280193236715e-06, "loss": 0.7346, "step": 295 }, { "epoch": 0.6919929865575687, "grad_norm": 3.8930859729711, "learning_rate": 1.5942028985507246e-06, "loss": 0.7476, "step": 296 }, { "epoch": 0.6943308007013442, "grad_norm": 4.179140087181349, "learning_rate": 1.5821256038647345e-06, "loss": 0.7758, "step": 297 }, { "epoch": 0.6966686148451198, "grad_norm": 4.1025982230247005, "learning_rate": 1.570048309178744e-06, "loss": 0.764, "step": 298 }, { "epoch": 0.6990064289888954, "grad_norm": 4.54359763623282, "learning_rate": 1.5579710144927536e-06, "loss": 0.813, "step": 299 }, { "epoch": 0.701344243132671, "grad_norm": 3.8868646182191333, "learning_rate": 1.5458937198067634e-06, "loss": 0.7809, "step": 300 }, { "epoch": 0.7036820572764465, "grad_norm": 4.027087287618028, "learning_rate": 1.533816425120773e-06, "loss": 0.7421, "step": 301 }, { "epoch": 0.706019871420222, "grad_norm": 4.221180533576584, "learning_rate": 1.521739130434783e-06, "loss": 0.7437, "step": 302 }, { "epoch": 0.7083576855639977, "grad_norm": 4.025585601097397, "learning_rate": 1.5096618357487924e-06, "loss": 0.7587, "step": 303 }, { "epoch": 0.7106954997077732, "grad_norm": 4.082415548970675, "learning_rate": 1.497584541062802e-06, "loss": 0.7437, "step": 304 }, { "epoch": 0.7130333138515488, "grad_norm": 3.9885030268207764, "learning_rate": 1.4855072463768117e-06, "loss": 0.7342, "step": 305 }, { "epoch": 0.7153711279953243, "grad_norm": 4.110847006439374, "learning_rate": 1.4734299516908213e-06, "loss": 0.7643, "step": 306 }, { "epoch": 0.7177089421391, "grad_norm": 4.018479338411149, "learning_rate": 1.461352657004831e-06, "loss": 0.7524, "step": 307 }, { "epoch": 0.7200467562828755, "grad_norm": 3.8679633701250835, "learning_rate": 1.4492753623188408e-06, "loss": 0.7854, "step": 308 }, { "epoch": 0.722384570426651, "grad_norm": 4.308222321507237, "learning_rate": 1.4371980676328504e-06, "loss": 0.7805, "step": 309 }, { "epoch": 0.7247223845704267, "grad_norm": 3.8916559653506018, "learning_rate": 1.42512077294686e-06, "loss": 0.6789, "step": 310 }, { "epoch": 0.7270601987142022, "grad_norm": 4.208472724847014, "learning_rate": 1.4130434782608697e-06, "loss": 0.7624, "step": 311 }, { "epoch": 0.7293980128579778, "grad_norm": 4.541098999570629, "learning_rate": 1.4009661835748794e-06, "loss": 0.7754, "step": 312 }, { "epoch": 0.7317358270017533, "grad_norm": 3.894542881557041, "learning_rate": 1.3888888888888892e-06, "loss": 0.7327, "step": 313 }, { "epoch": 0.734073641145529, "grad_norm": 4.316419064602019, "learning_rate": 1.3768115942028987e-06, "loss": 0.7785, "step": 314 }, { "epoch": 0.7364114552893045, "grad_norm": 3.840444616763943, "learning_rate": 1.3647342995169083e-06, "loss": 0.7296, "step": 315 }, { "epoch": 0.7387492694330801, "grad_norm": 4.0101608921412835, "learning_rate": 1.3526570048309178e-06, "loss": 0.7199, "step": 316 }, { "epoch": 0.7410870835768556, "grad_norm": 4.02178577481216, "learning_rate": 1.3405797101449278e-06, "loss": 0.7662, "step": 317 }, { "epoch": 0.7434248977206313, "grad_norm": 3.955088131738884, "learning_rate": 1.3285024154589373e-06, "loss": 0.7196, "step": 318 }, { "epoch": 0.7457627118644068, "grad_norm": 4.130879922008592, "learning_rate": 1.316425120772947e-06, "loss": 0.787, "step": 319 }, { "epoch": 0.7481005260081823, "grad_norm": 4.0739088224040705, "learning_rate": 1.3043478260869566e-06, "loss": 0.7509, "step": 320 }, { "epoch": 0.7504383401519579, "grad_norm": 4.2499948389358595, "learning_rate": 1.2922705314009662e-06, "loss": 0.7373, "step": 321 }, { "epoch": 0.7527761542957335, "grad_norm": 4.048557241149405, "learning_rate": 1.2801932367149761e-06, "loss": 0.781, "step": 322 }, { "epoch": 0.7551139684395091, "grad_norm": 4.2499198906024205, "learning_rate": 1.2681159420289857e-06, "loss": 0.7674, "step": 323 }, { "epoch": 0.7574517825832846, "grad_norm": 4.1878094914635255, "learning_rate": 1.2560386473429952e-06, "loss": 0.726, "step": 324 }, { "epoch": 0.7597895967270601, "grad_norm": 4.531895242987001, "learning_rate": 1.243961352657005e-06, "loss": 0.7849, "step": 325 }, { "epoch": 0.7621274108708358, "grad_norm": 4.042779330179229, "learning_rate": 1.2318840579710147e-06, "loss": 0.7532, "step": 326 }, { "epoch": 0.7644652250146113, "grad_norm": 3.9930786810311254, "learning_rate": 1.2198067632850243e-06, "loss": 0.7286, "step": 327 }, { "epoch": 0.7668030391583869, "grad_norm": 5.948998810978814, "learning_rate": 1.2077294685990338e-06, "loss": 0.8127, "step": 328 }, { "epoch": 0.7691408533021625, "grad_norm": 4.144487299852383, "learning_rate": 1.1956521739130436e-06, "loss": 0.7691, "step": 329 }, { "epoch": 0.7714786674459381, "grad_norm": 4.128733708034505, "learning_rate": 1.1835748792270531e-06, "loss": 0.768, "step": 330 }, { "epoch": 0.7738164815897136, "grad_norm": 4.530767375631303, "learning_rate": 1.1714975845410629e-06, "loss": 0.7798, "step": 331 }, { "epoch": 0.7761542957334892, "grad_norm": 3.9238729668993835, "learning_rate": 1.1594202898550726e-06, "loss": 0.7642, "step": 332 }, { "epoch": 0.7784921098772648, "grad_norm": 4.111531783109019, "learning_rate": 1.1473429951690822e-06, "loss": 0.7616, "step": 333 }, { "epoch": 0.7808299240210403, "grad_norm": 4.140234356572554, "learning_rate": 1.135265700483092e-06, "loss": 0.8308, "step": 334 }, { "epoch": 0.7831677381648159, "grad_norm": 4.5225578335616845, "learning_rate": 1.1231884057971015e-06, "loss": 0.7688, "step": 335 }, { "epoch": 0.7855055523085914, "grad_norm": 4.253055596048113, "learning_rate": 1.111111111111111e-06, "loss": 0.7823, "step": 336 }, { "epoch": 0.7878433664523671, "grad_norm": 4.214973850734774, "learning_rate": 1.0990338164251208e-06, "loss": 0.7015, "step": 337 }, { "epoch": 0.7901811805961426, "grad_norm": 4.242093529547378, "learning_rate": 1.0869565217391306e-06, "loss": 0.7902, "step": 338 }, { "epoch": 0.7925189947399182, "grad_norm": 4.27860016507252, "learning_rate": 1.0748792270531403e-06, "loss": 0.7893, "step": 339 }, { "epoch": 0.7948568088836937, "grad_norm": 4.193517659739712, "learning_rate": 1.0628019323671499e-06, "loss": 0.7932, "step": 340 }, { "epoch": 0.7971946230274694, "grad_norm": 3.861888360541971, "learning_rate": 1.0507246376811594e-06, "loss": 0.7271, "step": 341 }, { "epoch": 0.7995324371712449, "grad_norm": 4.044324859369637, "learning_rate": 1.0386473429951692e-06, "loss": 0.7651, "step": 342 }, { "epoch": 0.8018702513150204, "grad_norm": 4.143848474405527, "learning_rate": 1.026570048309179e-06, "loss": 0.7991, "step": 343 }, { "epoch": 0.804208065458796, "grad_norm": 4.543740361976109, "learning_rate": 1.0144927536231885e-06, "loss": 0.7871, "step": 344 }, { "epoch": 0.8065458796025716, "grad_norm": 4.053324740509495, "learning_rate": 1.0024154589371982e-06, "loss": 0.7181, "step": 345 }, { "epoch": 0.8088836937463472, "grad_norm": 3.91170761323185, "learning_rate": 9.903381642512078e-07, "loss": 0.7167, "step": 346 }, { "epoch": 0.8112215078901227, "grad_norm": 3.9769619064751174, "learning_rate": 9.782608695652175e-07, "loss": 0.7152, "step": 347 }, { "epoch": 0.8135593220338984, "grad_norm": 4.141477101296879, "learning_rate": 9.66183574879227e-07, "loss": 0.806, "step": 348 }, { "epoch": 0.8158971361776739, "grad_norm": 3.9266793661338566, "learning_rate": 9.541062801932368e-07, "loss": 0.74, "step": 349 }, { "epoch": 0.8182349503214494, "grad_norm": 3.905819434278297, "learning_rate": 9.420289855072465e-07, "loss": 0.7621, "step": 350 }, { "epoch": 0.820572764465225, "grad_norm": 4.271457136544383, "learning_rate": 9.299516908212561e-07, "loss": 0.7108, "step": 351 }, { "epoch": 0.8229105786090006, "grad_norm": 3.9018935668444907, "learning_rate": 9.178743961352659e-07, "loss": 0.7326, "step": 352 }, { "epoch": 0.8252483927527762, "grad_norm": 3.842764627332658, "learning_rate": 9.057971014492754e-07, "loss": 0.769, "step": 353 }, { "epoch": 0.8275862068965517, "grad_norm": 4.12270406926976, "learning_rate": 8.937198067632851e-07, "loss": 0.7462, "step": 354 }, { "epoch": 0.8299240210403273, "grad_norm": 4.25238665717318, "learning_rate": 8.816425120772948e-07, "loss": 0.7417, "step": 355 }, { "epoch": 0.8322618351841029, "grad_norm": 4.10405871770544, "learning_rate": 8.695652173913044e-07, "loss": 0.7769, "step": 356 }, { "epoch": 0.8345996493278784, "grad_norm": 4.132898802117579, "learning_rate": 8.574879227053141e-07, "loss": 0.7334, "step": 357 }, { "epoch": 0.836937463471654, "grad_norm": 3.9812833871444573, "learning_rate": 8.454106280193238e-07, "loss": 0.7437, "step": 358 }, { "epoch": 0.8392752776154295, "grad_norm": 4.819360178352156, "learning_rate": 8.333333333333333e-07, "loss": 0.7594, "step": 359 }, { "epoch": 0.8416130917592052, "grad_norm": 4.27077723520544, "learning_rate": 8.212560386473431e-07, "loss": 0.7282, "step": 360 }, { "epoch": 0.8439509059029807, "grad_norm": 4.234704486935872, "learning_rate": 8.091787439613527e-07, "loss": 0.7844, "step": 361 }, { "epoch": 0.8462887200467563, "grad_norm": 3.660518143878683, "learning_rate": 7.971014492753623e-07, "loss": 0.6846, "step": 362 }, { "epoch": 0.8486265341905318, "grad_norm": 4.382898231252646, "learning_rate": 7.85024154589372e-07, "loss": 0.7378, "step": 363 }, { "epoch": 0.8509643483343075, "grad_norm": 4.03693007471031, "learning_rate": 7.729468599033817e-07, "loss": 0.7321, "step": 364 }, { "epoch": 0.853302162478083, "grad_norm": 4.061417655548705, "learning_rate": 7.608695652173914e-07, "loss": 0.7427, "step": 365 }, { "epoch": 0.8556399766218585, "grad_norm": 4.033537459659518, "learning_rate": 7.48792270531401e-07, "loss": 0.7631, "step": 366 }, { "epoch": 0.8579777907656342, "grad_norm": 3.8672964986217377, "learning_rate": 7.367149758454106e-07, "loss": 0.7277, "step": 367 }, { "epoch": 0.8603156049094097, "grad_norm": 4.1614750880483795, "learning_rate": 7.246376811594204e-07, "loss": 0.7821, "step": 368 }, { "epoch": 0.8626534190531853, "grad_norm": 4.0347237221296846, "learning_rate": 7.1256038647343e-07, "loss": 0.7229, "step": 369 }, { "epoch": 0.8649912331969608, "grad_norm": 4.419235250329394, "learning_rate": 7.004830917874397e-07, "loss": 0.7912, "step": 370 }, { "epoch": 0.8673290473407365, "grad_norm": 4.0395927745176925, "learning_rate": 6.884057971014494e-07, "loss": 0.7781, "step": 371 }, { "epoch": 0.869666861484512, "grad_norm": 4.323154501136669, "learning_rate": 6.763285024154589e-07, "loss": 0.7489, "step": 372 }, { "epoch": 0.8720046756282875, "grad_norm": 4.0036925914792, "learning_rate": 6.642512077294687e-07, "loss": 0.7488, "step": 373 }, { "epoch": 0.8743424897720631, "grad_norm": 4.081792943103691, "learning_rate": 6.521739130434783e-07, "loss": 0.7506, "step": 374 }, { "epoch": 0.8766803039158387, "grad_norm": 3.961593904598705, "learning_rate": 6.400966183574881e-07, "loss": 0.7365, "step": 375 }, { "epoch": 0.8790181180596143, "grad_norm": 5.343637922572841, "learning_rate": 6.280193236714976e-07, "loss": 0.8142, "step": 376 }, { "epoch": 0.8813559322033898, "grad_norm": 4.234613953777181, "learning_rate": 6.159420289855074e-07, "loss": 0.7685, "step": 377 }, { "epoch": 0.8836937463471654, "grad_norm": 3.914888154011919, "learning_rate": 6.038647342995169e-07, "loss": 0.7442, "step": 378 }, { "epoch": 0.886031560490941, "grad_norm": 3.998960956090034, "learning_rate": 5.917874396135266e-07, "loss": 0.7724, "step": 379 }, { "epoch": 0.8883693746347165, "grad_norm": 3.7467228875291885, "learning_rate": 5.797101449275363e-07, "loss": 0.7157, "step": 380 }, { "epoch": 0.8907071887784921, "grad_norm": 3.921411494491602, "learning_rate": 5.67632850241546e-07, "loss": 0.7604, "step": 381 }, { "epoch": 0.8930450029222676, "grad_norm": 4.171395377831423, "learning_rate": 5.555555555555555e-07, "loss": 0.7498, "step": 382 }, { "epoch": 0.8953828170660433, "grad_norm": 4.1347642411133725, "learning_rate": 5.434782608695653e-07, "loss": 0.7472, "step": 383 }, { "epoch": 0.8977206312098188, "grad_norm": 4.092973708302494, "learning_rate": 5.314009661835749e-07, "loss": 0.7237, "step": 384 }, { "epoch": 0.9000584453535944, "grad_norm": 3.9933326706118875, "learning_rate": 5.193236714975846e-07, "loss": 0.7389, "step": 385 }, { "epoch": 0.90239625949737, "grad_norm": 3.8068860103615174, "learning_rate": 5.072463768115942e-07, "loss": 0.7177, "step": 386 }, { "epoch": 0.9047340736411456, "grad_norm": 4.25980749026596, "learning_rate": 4.951690821256039e-07, "loss": 0.758, "step": 387 }, { "epoch": 0.9070718877849211, "grad_norm": 3.8688206778681278, "learning_rate": 4.830917874396135e-07, "loss": 0.7577, "step": 388 }, { "epoch": 0.9094097019286966, "grad_norm": 4.072604714599362, "learning_rate": 4.7101449275362324e-07, "loss": 0.7655, "step": 389 }, { "epoch": 0.9117475160724723, "grad_norm": 4.216731514011164, "learning_rate": 4.5893719806763294e-07, "loss": 0.7572, "step": 390 }, { "epoch": 0.9140853302162478, "grad_norm": 4.204400645393741, "learning_rate": 4.4685990338164254e-07, "loss": 0.7595, "step": 391 }, { "epoch": 0.9164231443600234, "grad_norm": 4.327014987328045, "learning_rate": 4.347826086956522e-07, "loss": 0.7347, "step": 392 }, { "epoch": 0.9187609585037989, "grad_norm": 4.381847799007514, "learning_rate": 4.227053140096619e-07, "loss": 0.7505, "step": 393 }, { "epoch": 0.9210987726475746, "grad_norm": 4.019350453750999, "learning_rate": 4.1062801932367154e-07, "loss": 0.7488, "step": 394 }, { "epoch": 0.9234365867913501, "grad_norm": 3.958102022071496, "learning_rate": 3.9855072463768114e-07, "loss": 0.7436, "step": 395 }, { "epoch": 0.9257744009351256, "grad_norm": 4.3569068621437745, "learning_rate": 3.8647342995169085e-07, "loss": 0.7323, "step": 396 }, { "epoch": 0.9281122150789012, "grad_norm": 3.9242746982918777, "learning_rate": 3.743961352657005e-07, "loss": 0.7255, "step": 397 }, { "epoch": 0.9304500292226768, "grad_norm": 3.91121815410949, "learning_rate": 3.623188405797102e-07, "loss": 0.7471, "step": 398 }, { "epoch": 0.9327878433664524, "grad_norm": 3.973005041304068, "learning_rate": 3.5024154589371985e-07, "loss": 0.6823, "step": 399 }, { "epoch": 0.9351256575102279, "grad_norm": 3.988161090830406, "learning_rate": 3.3816425120772945e-07, "loss": 0.6871, "step": 400 }, { "epoch": 0.9374634716540035, "grad_norm": 4.296337191130102, "learning_rate": 3.2608695652173915e-07, "loss": 0.7236, "step": 401 }, { "epoch": 0.9398012857977791, "grad_norm": 4.3179225277967515, "learning_rate": 3.140096618357488e-07, "loss": 0.7582, "step": 402 }, { "epoch": 0.9421390999415546, "grad_norm": 4.191674727829652, "learning_rate": 3.0193236714975846e-07, "loss": 0.7238, "step": 403 }, { "epoch": 0.9444769140853302, "grad_norm": 3.8257966103380765, "learning_rate": 2.8985507246376816e-07, "loss": 0.7475, "step": 404 }, { "epoch": 0.9468147282291058, "grad_norm": 4.06630469936539, "learning_rate": 2.7777777777777776e-07, "loss": 0.7109, "step": 405 }, { "epoch": 0.9491525423728814, "grad_norm": 4.583718694034358, "learning_rate": 2.6570048309178746e-07, "loss": 0.7623, "step": 406 }, { "epoch": 0.9514903565166569, "grad_norm": 3.9553370864295694, "learning_rate": 2.536231884057971e-07, "loss": 0.7911, "step": 407 }, { "epoch": 0.9538281706604325, "grad_norm": 4.221184826167876, "learning_rate": 2.4154589371980677e-07, "loss": 0.7322, "step": 408 }, { "epoch": 0.9561659848042081, "grad_norm": 4.196761181297048, "learning_rate": 2.2946859903381647e-07, "loss": 0.7476, "step": 409 }, { "epoch": 0.9585037989479837, "grad_norm": 4.185489684411542, "learning_rate": 2.173913043478261e-07, "loss": 0.7548, "step": 410 }, { "epoch": 0.9608416130917592, "grad_norm": 4.371686498083367, "learning_rate": 2.0531400966183577e-07, "loss": 0.7328, "step": 411 }, { "epoch": 0.9631794272355347, "grad_norm": 4.314986686818614, "learning_rate": 1.9323671497584542e-07, "loss": 0.7304, "step": 412 }, { "epoch": 0.9655172413793104, "grad_norm": 3.9822912414587806, "learning_rate": 1.811594202898551e-07, "loss": 0.7395, "step": 413 }, { "epoch": 0.9678550555230859, "grad_norm": 4.218523033535868, "learning_rate": 1.6908212560386473e-07, "loss": 0.7302, "step": 414 }, { "epoch": 0.9701928696668615, "grad_norm": 4.092187481356195, "learning_rate": 1.570048309178744e-07, "loss": 0.7351, "step": 415 }, { "epoch": 0.972530683810637, "grad_norm": 4.184125537002853, "learning_rate": 1.4492753623188408e-07, "loss": 0.7413, "step": 416 }, { "epoch": 0.9748684979544127, "grad_norm": 3.889649663413063, "learning_rate": 1.3285024154589373e-07, "loss": 0.7365, "step": 417 }, { "epoch": 0.9772063120981882, "grad_norm": 4.139378543594781, "learning_rate": 1.2077294685990338e-07, "loss": 0.7626, "step": 418 }, { "epoch": 0.9795441262419637, "grad_norm": 4.016007817051792, "learning_rate": 1.0869565217391305e-07, "loss": 0.7428, "step": 419 }, { "epoch": 0.9818819403857393, "grad_norm": 4.31935746465498, "learning_rate": 9.661835748792271e-08, "loss": 0.7886, "step": 420 }, { "epoch": 0.9842197545295149, "grad_norm": 4.305755648868578, "learning_rate": 8.454106280193236e-08, "loss": 0.7552, "step": 421 }, { "epoch": 0.9865575686732905, "grad_norm": 4.324910095691635, "learning_rate": 7.246376811594204e-08, "loss": 0.7465, "step": 422 }, { "epoch": 0.988895382817066, "grad_norm": 3.694300442393254, "learning_rate": 6.038647342995169e-08, "loss": 0.7093, "step": 423 }, { "epoch": 0.9912331969608417, "grad_norm": 3.829444377626212, "learning_rate": 4.8309178743961356e-08, "loss": 0.7241, "step": 424 }, { "epoch": 0.9935710111046172, "grad_norm": 4.179549227414933, "learning_rate": 3.623188405797102e-08, "loss": 0.7663, "step": 425 }, { "epoch": 0.9959088252483927, "grad_norm": 3.832323286806212, "learning_rate": 2.4154589371980678e-08, "loss": 0.7859, "step": 426 }, { "epoch": 0.9982466393921683, "grad_norm": 4.123264294362188, "learning_rate": 1.2077294685990339e-08, "loss": 0.7678, "step": 427 }, { "epoch": 0.9982466393921683, "step": 427, "total_flos": 77746305761280.0, "train_loss": 0.8145518043281323, "train_runtime": 5184.5045, "train_samples_per_second": 10.56, "train_steps_per_second": 0.082 } ], "logging_steps": 1.0, "max_steps": 427, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 77746305761280.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }