random_VNKC0sAx1rO2dpBi / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
efbbc3d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016806722689075631,
"grad_norm": 8.209144697154139,
"learning_rate": 9.99998257609161e-06,
"loss": 0.2445,
"step": 1
},
{
"epoch": 0.0033613445378151263,
"grad_norm": 7.387136868109575,
"learning_rate": 9.999930304487874e-06,
"loss": 0.232,
"step": 2
},
{
"epoch": 0.005042016806722689,
"grad_norm": 4.703733759728634,
"learning_rate": 9.999843185553106e-06,
"loss": 0.1499,
"step": 3
},
{
"epoch": 0.0067226890756302525,
"grad_norm": 4.59672828172244,
"learning_rate": 9.999721219894482e-06,
"loss": 0.1312,
"step": 4
},
{
"epoch": 0.008403361344537815,
"grad_norm": 7.869163994125804,
"learning_rate": 9.999564408362054e-06,
"loss": 0.2678,
"step": 5
},
{
"epoch": 0.010084033613445379,
"grad_norm": 5.069247100993632,
"learning_rate": 9.999372752048729e-06,
"loss": 0.1643,
"step": 6
},
{
"epoch": 0.011764705882352941,
"grad_norm": 4.971040093251534,
"learning_rate": 9.999146252290264e-06,
"loss": 0.1881,
"step": 7
},
{
"epoch": 0.013445378151260505,
"grad_norm": 4.612745404018306,
"learning_rate": 9.998884910665267e-06,
"loss": 0.2474,
"step": 8
},
{
"epoch": 0.015126050420168067,
"grad_norm": 5.582880543106663,
"learning_rate": 9.998588728995176e-06,
"loss": 0.2341,
"step": 9
},
{
"epoch": 0.01680672268907563,
"grad_norm": 5.10777402267266,
"learning_rate": 9.998257709344246e-06,
"loss": 0.2462,
"step": 10
},
{
"epoch": 0.018487394957983194,
"grad_norm": 3.912332856239327,
"learning_rate": 9.997891854019538e-06,
"loss": 0.2171,
"step": 11
},
{
"epoch": 0.020168067226890758,
"grad_norm": 4.514978773458917,
"learning_rate": 9.997491165570907e-06,
"loss": 0.2079,
"step": 12
},
{
"epoch": 0.021848739495798318,
"grad_norm": 4.982439292936562,
"learning_rate": 9.997055646790974e-06,
"loss": 0.2801,
"step": 13
},
{
"epoch": 0.023529411764705882,
"grad_norm": 6.199005044554472,
"learning_rate": 9.996585300715117e-06,
"loss": 0.2362,
"step": 14
},
{
"epoch": 0.025210084033613446,
"grad_norm": 4.2318843117354,
"learning_rate": 9.99608013062144e-06,
"loss": 0.2437,
"step": 15
},
{
"epoch": 0.02689075630252101,
"grad_norm": 4.678511423998445,
"learning_rate": 9.995540140030759e-06,
"loss": 0.2413,
"step": 16
},
{
"epoch": 0.02857142857142857,
"grad_norm": 5.019790515416029,
"learning_rate": 9.994965332706574e-06,
"loss": 0.2763,
"step": 17
},
{
"epoch": 0.030252100840336135,
"grad_norm": 3.7745510637286572,
"learning_rate": 9.99435571265504e-06,
"loss": 0.2413,
"step": 18
},
{
"epoch": 0.031932773109243695,
"grad_norm": 4.954591562450657,
"learning_rate": 9.993711284124943e-06,
"loss": 0.2748,
"step": 19
},
{
"epoch": 0.03361344537815126,
"grad_norm": 4.187965473046551,
"learning_rate": 9.99303205160767e-06,
"loss": 0.274,
"step": 20
},
{
"epoch": 0.03529411764705882,
"grad_norm": 4.6295824547220095,
"learning_rate": 9.992318019837171e-06,
"loss": 0.2448,
"step": 21
},
{
"epoch": 0.03697478991596639,
"grad_norm": 5.621884903621694,
"learning_rate": 9.991569193789938e-06,
"loss": 0.2579,
"step": 22
},
{
"epoch": 0.03865546218487395,
"grad_norm": 5.123406042716482,
"learning_rate": 9.990785578684963e-06,
"loss": 0.2391,
"step": 23
},
{
"epoch": 0.040336134453781515,
"grad_norm": 5.074868330604315,
"learning_rate": 9.989967179983699e-06,
"loss": 0.2584,
"step": 24
},
{
"epoch": 0.04201680672268908,
"grad_norm": 5.376525151914496,
"learning_rate": 9.989114003390028e-06,
"loss": 0.277,
"step": 25
},
{
"epoch": 0.043697478991596636,
"grad_norm": 4.0946093021497845,
"learning_rate": 9.988226054850218e-06,
"loss": 0.215,
"step": 26
},
{
"epoch": 0.0453781512605042,
"grad_norm": 4.552724543313885,
"learning_rate": 9.987303340552885e-06,
"loss": 0.2744,
"step": 27
},
{
"epoch": 0.047058823529411764,
"grad_norm": 5.012152565084418,
"learning_rate": 9.98634586692894e-06,
"loss": 0.385,
"step": 28
},
{
"epoch": 0.04873949579831933,
"grad_norm": 5.423054570055711,
"learning_rate": 9.985353640651563e-06,
"loss": 0.2702,
"step": 29
},
{
"epoch": 0.05042016806722689,
"grad_norm": 5.127889256940364,
"learning_rate": 9.984326668636131e-06,
"loss": 0.3168,
"step": 30
},
{
"epoch": 0.052100840336134456,
"grad_norm": 3.679450662065014,
"learning_rate": 9.983264958040194e-06,
"loss": 0.2874,
"step": 31
},
{
"epoch": 0.05378151260504202,
"grad_norm": 3.3004814033041585,
"learning_rate": 9.98216851626341e-06,
"loss": 0.2216,
"step": 32
},
{
"epoch": 0.05546218487394958,
"grad_norm": 3.6744435985276502,
"learning_rate": 9.981037350947503e-06,
"loss": 0.1946,
"step": 33
},
{
"epoch": 0.05714285714285714,
"grad_norm": 4.945817900667547,
"learning_rate": 9.979871469976197e-06,
"loss": 0.2352,
"step": 34
},
{
"epoch": 0.058823529411764705,
"grad_norm": 4.199740458372265,
"learning_rate": 9.978670881475173e-06,
"loss": 0.2658,
"step": 35
},
{
"epoch": 0.06050420168067227,
"grad_norm": 4.098117482797779,
"learning_rate": 9.977435593812013e-06,
"loss": 0.2907,
"step": 36
},
{
"epoch": 0.06218487394957983,
"grad_norm": 4.042885470534792,
"learning_rate": 9.976165615596128e-06,
"loss": 0.2,
"step": 37
},
{
"epoch": 0.06386554621848739,
"grad_norm": 4.617103574848525,
"learning_rate": 9.974860955678715e-06,
"loss": 0.2306,
"step": 38
},
{
"epoch": 0.06554621848739496,
"grad_norm": 4.4388689219941675,
"learning_rate": 9.973521623152682e-06,
"loss": 0.2407,
"step": 39
},
{
"epoch": 0.06722689075630252,
"grad_norm": 6.477467686130754,
"learning_rate": 9.972147627352593e-06,
"loss": 0.3914,
"step": 40
},
{
"epoch": 0.06890756302521009,
"grad_norm": 4.607889040112,
"learning_rate": 9.970738977854597e-06,
"loss": 0.2606,
"step": 41
},
{
"epoch": 0.07058823529411765,
"grad_norm": 5.8414772767780665,
"learning_rate": 9.96929568447637e-06,
"loss": 0.2878,
"step": 42
},
{
"epoch": 0.07226890756302522,
"grad_norm": 4.530793945019326,
"learning_rate": 9.967817757277031e-06,
"loss": 0.222,
"step": 43
},
{
"epoch": 0.07394957983193277,
"grad_norm": 5.7019752450601295,
"learning_rate": 9.966305206557092e-06,
"loss": 0.3215,
"step": 44
},
{
"epoch": 0.07563025210084033,
"grad_norm": 4.976614437008921,
"learning_rate": 9.964758042858368e-06,
"loss": 0.3344,
"step": 45
},
{
"epoch": 0.0773109243697479,
"grad_norm": 4.411005767374718,
"learning_rate": 9.963176276963916e-06,
"loss": 0.2107,
"step": 46
},
{
"epoch": 0.07899159663865546,
"grad_norm": 5.409158327460203,
"learning_rate": 9.961559919897954e-06,
"loss": 0.2534,
"step": 47
},
{
"epoch": 0.08067226890756303,
"grad_norm": 4.484574670660062,
"learning_rate": 9.959908982925783e-06,
"loss": 0.2586,
"step": 48
},
{
"epoch": 0.08235294117647059,
"grad_norm": 4.587259020764486,
"learning_rate": 9.958223477553715e-06,
"loss": 0.2533,
"step": 49
},
{
"epoch": 0.08403361344537816,
"grad_norm": 4.405358710771115,
"learning_rate": 9.956503415528984e-06,
"loss": 0.2559,
"step": 50
},
{
"epoch": 0.08571428571428572,
"grad_norm": 4.967732320368905,
"learning_rate": 9.954748808839675e-06,
"loss": 0.2503,
"step": 51
},
{
"epoch": 0.08739495798319327,
"grad_norm": 5.218196839992478,
"learning_rate": 9.952959669714627e-06,
"loss": 0.2862,
"step": 52
},
{
"epoch": 0.08907563025210084,
"grad_norm": 4.8935636677407475,
"learning_rate": 9.951136010623359e-06,
"loss": 0.3151,
"step": 53
},
{
"epoch": 0.0907563025210084,
"grad_norm": 3.9500115872361072,
"learning_rate": 9.94927784427598e-06,
"loss": 0.2711,
"step": 54
},
{
"epoch": 0.09243697478991597,
"grad_norm": 4.817134300002951,
"learning_rate": 9.947385183623099e-06,
"loss": 0.2697,
"step": 55
},
{
"epoch": 0.09411764705882353,
"grad_norm": 5.486354681435603,
"learning_rate": 9.945458041855732e-06,
"loss": 0.3244,
"step": 56
},
{
"epoch": 0.0957983193277311,
"grad_norm": 3.924865124104986,
"learning_rate": 9.943496432405213e-06,
"loss": 0.2126,
"step": 57
},
{
"epoch": 0.09747899159663866,
"grad_norm": 4.059739131992801,
"learning_rate": 9.941500368943111e-06,
"loss": 0.2209,
"step": 58
},
{
"epoch": 0.09915966386554621,
"grad_norm": 3.925936844915292,
"learning_rate": 9.939469865381111e-06,
"loss": 0.2238,
"step": 59
},
{
"epoch": 0.10084033613445378,
"grad_norm": 4.929638111711791,
"learning_rate": 9.937404935870938e-06,
"loss": 0.2876,
"step": 60
},
{
"epoch": 0.10252100840336134,
"grad_norm": 4.333643541029462,
"learning_rate": 9.935305594804247e-06,
"loss": 0.2622,
"step": 61
},
{
"epoch": 0.10420168067226891,
"grad_norm": 6.551835764229068,
"learning_rate": 9.933171856812533e-06,
"loss": 0.3608,
"step": 62
},
{
"epoch": 0.10588235294117647,
"grad_norm": 4.370528317389317,
"learning_rate": 9.931003736767013e-06,
"loss": 0.2189,
"step": 63
},
{
"epoch": 0.10756302521008404,
"grad_norm": 3.9034215243844557,
"learning_rate": 9.92880124977854e-06,
"loss": 0.2076,
"step": 64
},
{
"epoch": 0.1092436974789916,
"grad_norm": 6.462115529099049,
"learning_rate": 9.926564411197488e-06,
"loss": 0.2704,
"step": 65
},
{
"epoch": 0.11092436974789915,
"grad_norm": 4.974959833762651,
"learning_rate": 9.924293236613643e-06,
"loss": 0.2586,
"step": 66
},
{
"epoch": 0.11260504201680673,
"grad_norm": 5.397773235075291,
"learning_rate": 9.921987741856099e-06,
"loss": 0.3708,
"step": 67
},
{
"epoch": 0.11428571428571428,
"grad_norm": 4.290267842379414,
"learning_rate": 9.91964794299315e-06,
"loss": 0.297,
"step": 68
},
{
"epoch": 0.11596638655462185,
"grad_norm": 4.297436061861156,
"learning_rate": 9.91727385633217e-06,
"loss": 0.2723,
"step": 69
},
{
"epoch": 0.11764705882352941,
"grad_norm": 3.6926079145905097,
"learning_rate": 9.91486549841951e-06,
"loss": 0.2431,
"step": 70
},
{
"epoch": 0.11932773109243698,
"grad_norm": 4.305921925067097,
"learning_rate": 9.91242288604037e-06,
"loss": 0.2779,
"step": 71
},
{
"epoch": 0.12100840336134454,
"grad_norm": 4.517120872670056,
"learning_rate": 9.909946036218694e-06,
"loss": 0.2961,
"step": 72
},
{
"epoch": 0.1226890756302521,
"grad_norm": 4.785215176674839,
"learning_rate": 9.907434966217041e-06,
"loss": 0.2819,
"step": 73
},
{
"epoch": 0.12436974789915967,
"grad_norm": 4.214559462932771,
"learning_rate": 9.904889693536475e-06,
"loss": 0.2109,
"step": 74
},
{
"epoch": 0.12605042016806722,
"grad_norm": 4.389318542382428,
"learning_rate": 9.902310235916435e-06,
"loss": 0.2522,
"step": 75
},
{
"epoch": 0.12773109243697478,
"grad_norm": 3.8325771771754664,
"learning_rate": 9.899696611334612e-06,
"loss": 0.2598,
"step": 76
},
{
"epoch": 0.12941176470588237,
"grad_norm": 4.860313041028263,
"learning_rate": 9.89704883800683e-06,
"loss": 0.3235,
"step": 77
},
{
"epoch": 0.13109243697478992,
"grad_norm": 4.910636980849552,
"learning_rate": 9.894366934386913e-06,
"loss": 0.2905,
"step": 78
},
{
"epoch": 0.13277310924369748,
"grad_norm": 5.308926346498934,
"learning_rate": 9.891650919166558e-06,
"loss": 0.3174,
"step": 79
},
{
"epoch": 0.13445378151260504,
"grad_norm": 5.932647263725452,
"learning_rate": 9.888900811275205e-06,
"loss": 0.3704,
"step": 80
},
{
"epoch": 0.1361344537815126,
"grad_norm": 4.809268256405562,
"learning_rate": 9.886116629879906e-06,
"loss": 0.3038,
"step": 81
},
{
"epoch": 0.13781512605042018,
"grad_norm": 4.03544501605081,
"learning_rate": 9.883298394385186e-06,
"loss": 0.262,
"step": 82
},
{
"epoch": 0.13949579831932774,
"grad_norm": 4.712835324738283,
"learning_rate": 9.880446124432921e-06,
"loss": 0.2692,
"step": 83
},
{
"epoch": 0.1411764705882353,
"grad_norm": 4.563081327265746,
"learning_rate": 9.877559839902185e-06,
"loss": 0.3518,
"step": 84
},
{
"epoch": 0.14285714285714285,
"grad_norm": 4.028984316421756,
"learning_rate": 9.874639560909118e-06,
"loss": 0.277,
"step": 85
},
{
"epoch": 0.14453781512605043,
"grad_norm": 4.3533769418599055,
"learning_rate": 9.871685307806796e-06,
"loss": 0.2605,
"step": 86
},
{
"epoch": 0.146218487394958,
"grad_norm": 4.32320216003599,
"learning_rate": 9.868697101185066e-06,
"loss": 0.2483,
"step": 87
},
{
"epoch": 0.14789915966386555,
"grad_norm": 4.252161322978531,
"learning_rate": 9.865674961870428e-06,
"loss": 0.2292,
"step": 88
},
{
"epoch": 0.1495798319327731,
"grad_norm": 3.7472917316130707,
"learning_rate": 9.862618910925873e-06,
"loss": 0.2314,
"step": 89
},
{
"epoch": 0.15126050420168066,
"grad_norm": 3.4288032918623017,
"learning_rate": 9.859528969650739e-06,
"loss": 0.2511,
"step": 90
},
{
"epoch": 0.15294117647058825,
"grad_norm": 4.6329004322037335,
"learning_rate": 9.85640515958057e-06,
"loss": 0.2949,
"step": 91
},
{
"epoch": 0.1546218487394958,
"grad_norm": 3.5652517822054794,
"learning_rate": 9.853247502486957e-06,
"loss": 0.2423,
"step": 92
},
{
"epoch": 0.15630252100840336,
"grad_norm": 5.123085560371222,
"learning_rate": 9.850056020377392e-06,
"loss": 0.326,
"step": 93
},
{
"epoch": 0.15798319327731092,
"grad_norm": 4.810628133927044,
"learning_rate": 9.846830735495112e-06,
"loss": 0.2907,
"step": 94
},
{
"epoch": 0.15966386554621848,
"grad_norm": 4.024916887888118,
"learning_rate": 9.843571670318943e-06,
"loss": 0.3123,
"step": 95
},
{
"epoch": 0.16134453781512606,
"grad_norm": 7.985561798082157,
"learning_rate": 9.840278847563147e-06,
"loss": 0.3295,
"step": 96
},
{
"epoch": 0.16302521008403362,
"grad_norm": 3.7710475419246254,
"learning_rate": 9.836952290177261e-06,
"loss": 0.2212,
"step": 97
},
{
"epoch": 0.16470588235294117,
"grad_norm": 4.423657356806522,
"learning_rate": 9.833592021345938e-06,
"loss": 0.31,
"step": 98
},
{
"epoch": 0.16638655462184873,
"grad_norm": 4.222677175561096,
"learning_rate": 9.830198064488783e-06,
"loss": 0.2842,
"step": 99
},
{
"epoch": 0.16806722689075632,
"grad_norm": 4.168091860783304,
"learning_rate": 9.826770443260193e-06,
"loss": 0.252,
"step": 100
},
{
"epoch": 0.16974789915966387,
"grad_norm": 6.3620832971275085,
"learning_rate": 9.823309181549194e-06,
"loss": 0.2902,
"step": 101
},
{
"epoch": 0.17142857142857143,
"grad_norm": 6.17536019379638,
"learning_rate": 9.819814303479268e-06,
"loss": 0.2962,
"step": 102
},
{
"epoch": 0.173109243697479,
"grad_norm": 4.438145668562799,
"learning_rate": 9.816285833408185e-06,
"loss": 0.3056,
"step": 103
},
{
"epoch": 0.17478991596638654,
"grad_norm": 3.916870643757942,
"learning_rate": 9.812723795927848e-06,
"loss": 0.2413,
"step": 104
},
{
"epoch": 0.17647058823529413,
"grad_norm": 4.137088990144677,
"learning_rate": 9.809128215864096e-06,
"loss": 0.2926,
"step": 105
},
{
"epoch": 0.1781512605042017,
"grad_norm": 4.803813178194908,
"learning_rate": 9.805499118276555e-06,
"loss": 0.3213,
"step": 106
},
{
"epoch": 0.17983193277310924,
"grad_norm": 4.13461633288206,
"learning_rate": 9.801836528458453e-06,
"loss": 0.2581,
"step": 107
},
{
"epoch": 0.1815126050420168,
"grad_norm": 4.0431954077083,
"learning_rate": 9.798140471936437e-06,
"loss": 0.2011,
"step": 108
},
{
"epoch": 0.18319327731092436,
"grad_norm": 4.983771680315847,
"learning_rate": 9.79441097447041e-06,
"loss": 0.304,
"step": 109
},
{
"epoch": 0.18487394957983194,
"grad_norm": 4.001740838488305,
"learning_rate": 9.790648062053341e-06,
"loss": 0.3108,
"step": 110
},
{
"epoch": 0.1865546218487395,
"grad_norm": 4.286049724314065,
"learning_rate": 9.786851760911084e-06,
"loss": 0.222,
"step": 111
},
{
"epoch": 0.18823529411764706,
"grad_norm": 4.087795367081155,
"learning_rate": 9.783022097502204e-06,
"loss": 0.3072,
"step": 112
},
{
"epoch": 0.1899159663865546,
"grad_norm": 3.543885725592475,
"learning_rate": 9.779159098517781e-06,
"loss": 0.2562,
"step": 113
},
{
"epoch": 0.1915966386554622,
"grad_norm": 4.331258375709638,
"learning_rate": 9.77526279088123e-06,
"loss": 0.3,
"step": 114
},
{
"epoch": 0.19327731092436976,
"grad_norm": 5.020497063978092,
"learning_rate": 9.771333201748116e-06,
"loss": 0.2768,
"step": 115
},
{
"epoch": 0.1949579831932773,
"grad_norm": 3.3790105404739355,
"learning_rate": 9.767370358505958e-06,
"loss": 0.2275,
"step": 116
},
{
"epoch": 0.19663865546218487,
"grad_norm": 5.114492285734836,
"learning_rate": 9.763374288774043e-06,
"loss": 0.3194,
"step": 117
},
{
"epoch": 0.19831932773109243,
"grad_norm": 4.147457648794141,
"learning_rate": 9.759345020403233e-06,
"loss": 0.2384,
"step": 118
},
{
"epoch": 0.2,
"grad_norm": 4.002739168668194,
"learning_rate": 9.755282581475769e-06,
"loss": 0.2595,
"step": 119
},
{
"epoch": 0.20168067226890757,
"grad_norm": 3.665748728737787,
"learning_rate": 9.751187000305076e-06,
"loss": 0.2456,
"step": 120
},
{
"epoch": 0.20336134453781513,
"grad_norm": 4.317063513127341,
"learning_rate": 9.747058305435566e-06,
"loss": 0.3031,
"step": 121
},
{
"epoch": 0.20504201680672268,
"grad_norm": 4.491629737709359,
"learning_rate": 9.742896525642442e-06,
"loss": 0.3095,
"step": 122
},
{
"epoch": 0.20672268907563024,
"grad_norm": 4.5060463998958245,
"learning_rate": 9.738701689931488e-06,
"loss": 0.309,
"step": 123
},
{
"epoch": 0.20840336134453782,
"grad_norm": 3.8391374970872487,
"learning_rate": 9.734473827538881e-06,
"loss": 0.2701,
"step": 124
},
{
"epoch": 0.21008403361344538,
"grad_norm": 5.610803912721021,
"learning_rate": 9.730212967930974e-06,
"loss": 0.3006,
"step": 125
},
{
"epoch": 0.21176470588235294,
"grad_norm": 3.865567382675256,
"learning_rate": 9.7259191408041e-06,
"loss": 0.2494,
"step": 126
},
{
"epoch": 0.2134453781512605,
"grad_norm": 4.157071231774723,
"learning_rate": 9.721592376084355e-06,
"loss": 0.2929,
"step": 127
},
{
"epoch": 0.21512605042016808,
"grad_norm": 4.69641419523669,
"learning_rate": 9.717232703927402e-06,
"loss": 0.2922,
"step": 128
},
{
"epoch": 0.21680672268907564,
"grad_norm": 3.9492507253815137,
"learning_rate": 9.712840154718253e-06,
"loss": 0.2449,
"step": 129
},
{
"epoch": 0.2184873949579832,
"grad_norm": 4.438337881075212,
"learning_rate": 9.70841475907106e-06,
"loss": 0.2738,
"step": 130
},
{
"epoch": 0.22016806722689075,
"grad_norm": 4.515382778757333,
"learning_rate": 9.703956547828893e-06,
"loss": 0.264,
"step": 131
},
{
"epoch": 0.2218487394957983,
"grad_norm": 3.2827204648104185,
"learning_rate": 9.69946555206354e-06,
"loss": 0.2103,
"step": 132
},
{
"epoch": 0.2235294117647059,
"grad_norm": 6.877887492871786,
"learning_rate": 9.694941803075285e-06,
"loss": 0.4638,
"step": 133
},
{
"epoch": 0.22521008403361345,
"grad_norm": 3.826504365543723,
"learning_rate": 9.690385332392676e-06,
"loss": 0.2347,
"step": 134
},
{
"epoch": 0.226890756302521,
"grad_norm": 4.474707023359214,
"learning_rate": 9.685796171772327e-06,
"loss": 0.4061,
"step": 135
},
{
"epoch": 0.22857142857142856,
"grad_norm": 4.228391017498601,
"learning_rate": 9.681174353198687e-06,
"loss": 0.3177,
"step": 136
},
{
"epoch": 0.23025210084033612,
"grad_norm": 3.7515452786239196,
"learning_rate": 9.67651990888381e-06,
"loss": 0.3122,
"step": 137
},
{
"epoch": 0.2319327731092437,
"grad_norm": 3.8337555170576305,
"learning_rate": 9.67183287126714e-06,
"loss": 0.2236,
"step": 138
},
{
"epoch": 0.23361344537815126,
"grad_norm": 3.575500732595517,
"learning_rate": 9.667113273015283e-06,
"loss": 0.257,
"step": 139
},
{
"epoch": 0.23529411764705882,
"grad_norm": 4.960211439105124,
"learning_rate": 9.66236114702178e-06,
"loss": 0.3158,
"step": 140
},
{
"epoch": 0.23697478991596638,
"grad_norm": 3.7297597163549945,
"learning_rate": 9.657576526406872e-06,
"loss": 0.2612,
"step": 141
},
{
"epoch": 0.23865546218487396,
"grad_norm": 5.716132411582617,
"learning_rate": 9.652759444517276e-06,
"loss": 0.2984,
"step": 142
},
{
"epoch": 0.24033613445378152,
"grad_norm": 3.9938378467188556,
"learning_rate": 9.647909934925952e-06,
"loss": 0.2578,
"step": 143
},
{
"epoch": 0.24201680672268908,
"grad_norm": 4.768607552808638,
"learning_rate": 9.64302803143186e-06,
"loss": 0.3166,
"step": 144
},
{
"epoch": 0.24369747899159663,
"grad_norm": 4.36886602623328,
"learning_rate": 9.63811376805974e-06,
"loss": 0.2879,
"step": 145
},
{
"epoch": 0.2453781512605042,
"grad_norm": 5.159036155402597,
"learning_rate": 9.633167179059859e-06,
"loss": 0.3313,
"step": 146
},
{
"epoch": 0.24705882352941178,
"grad_norm": 3.9079551640082775,
"learning_rate": 9.628188298907782e-06,
"loss": 0.2307,
"step": 147
},
{
"epoch": 0.24873949579831933,
"grad_norm": 4.972815617641453,
"learning_rate": 9.623177162304132e-06,
"loss": 0.3627,
"step": 148
},
{
"epoch": 0.2504201680672269,
"grad_norm": 3.7182475542942637,
"learning_rate": 9.618133804174341e-06,
"loss": 0.2088,
"step": 149
},
{
"epoch": 0.25210084033613445,
"grad_norm": 5.105115930829285,
"learning_rate": 9.613058259668416e-06,
"loss": 0.3468,
"step": 150
},
{
"epoch": 0.253781512605042,
"grad_norm": 4.2243880410090515,
"learning_rate": 9.607950564160682e-06,
"loss": 0.278,
"step": 151
},
{
"epoch": 0.25546218487394956,
"grad_norm": 3.5788756892431417,
"learning_rate": 9.602810753249549e-06,
"loss": 0.2613,
"step": 152
},
{
"epoch": 0.2571428571428571,
"grad_norm": 3.6146625169924502,
"learning_rate": 9.597638862757255e-06,
"loss": 0.2197,
"step": 153
},
{
"epoch": 0.25882352941176473,
"grad_norm": 5.192851016189807,
"learning_rate": 9.592434928729617e-06,
"loss": 0.2943,
"step": 154
},
{
"epoch": 0.2605042016806723,
"grad_norm": 5.012984899286897,
"learning_rate": 9.587198987435782e-06,
"loss": 0.2742,
"step": 155
},
{
"epoch": 0.26218487394957984,
"grad_norm": 4.243030317874798,
"learning_rate": 9.581931075367979e-06,
"loss": 0.3106,
"step": 156
},
{
"epoch": 0.2638655462184874,
"grad_norm": 4.352999777089954,
"learning_rate": 9.576631229241248e-06,
"loss": 0.2602,
"step": 157
},
{
"epoch": 0.26554621848739496,
"grad_norm": 3.8746827574175016,
"learning_rate": 9.57129948599321e-06,
"loss": 0.2375,
"step": 158
},
{
"epoch": 0.2672268907563025,
"grad_norm": 4.370512006750166,
"learning_rate": 9.565935882783784e-06,
"loss": 0.3138,
"step": 159
},
{
"epoch": 0.2689075630252101,
"grad_norm": 4.530164031043821,
"learning_rate": 9.56054045699494e-06,
"loss": 0.2808,
"step": 160
},
{
"epoch": 0.27058823529411763,
"grad_norm": 4.240473901674803,
"learning_rate": 9.555113246230443e-06,
"loss": 0.2534,
"step": 161
},
{
"epoch": 0.2722689075630252,
"grad_norm": 4.518086654583603,
"learning_rate": 9.54965428831558e-06,
"loss": 0.338,
"step": 162
},
{
"epoch": 0.2739495798319328,
"grad_norm": 4.703674977488523,
"learning_rate": 9.544163621296906e-06,
"loss": 0.3264,
"step": 163
},
{
"epoch": 0.27563025210084036,
"grad_norm": 3.5692443136214327,
"learning_rate": 9.538641283441974e-06,
"loss": 0.3219,
"step": 164
},
{
"epoch": 0.2773109243697479,
"grad_norm": 3.6543200856527696,
"learning_rate": 9.533087313239065e-06,
"loss": 0.2259,
"step": 165
},
{
"epoch": 0.27899159663865547,
"grad_norm": 3.215902167038249,
"learning_rate": 9.527501749396924e-06,
"loss": 0.2681,
"step": 166
},
{
"epoch": 0.280672268907563,
"grad_norm": 3.63082759717829,
"learning_rate": 9.521884630844498e-06,
"loss": 0.3603,
"step": 167
},
{
"epoch": 0.2823529411764706,
"grad_norm": 4.395327561528559,
"learning_rate": 9.516235996730645e-06,
"loss": 0.3001,
"step": 168
},
{
"epoch": 0.28403361344537814,
"grad_norm": 3.402290374462376,
"learning_rate": 9.510555886423883e-06,
"loss": 0.2243,
"step": 169
},
{
"epoch": 0.2857142857142857,
"grad_norm": 4.229267629127821,
"learning_rate": 9.504844339512096e-06,
"loss": 0.2396,
"step": 170
},
{
"epoch": 0.28739495798319326,
"grad_norm": 5.626539496937702,
"learning_rate": 9.499101395802277e-06,
"loss": 0.3552,
"step": 171
},
{
"epoch": 0.28907563025210087,
"grad_norm": 4.261902788104125,
"learning_rate": 9.493327095320231e-06,
"loss": 0.2933,
"step": 172
},
{
"epoch": 0.2907563025210084,
"grad_norm": 4.315799314745412,
"learning_rate": 9.487521478310316e-06,
"loss": 0.337,
"step": 173
},
{
"epoch": 0.292436974789916,
"grad_norm": 3.2957622174585466,
"learning_rate": 9.481684585235145e-06,
"loss": 0.2044,
"step": 174
},
{
"epoch": 0.29411764705882354,
"grad_norm": 3.7840993226552264,
"learning_rate": 9.475816456775313e-06,
"loss": 0.355,
"step": 175
},
{
"epoch": 0.2957983193277311,
"grad_norm": 4.315595576291231,
"learning_rate": 9.469917133829114e-06,
"loss": 0.2866,
"step": 176
},
{
"epoch": 0.29747899159663865,
"grad_norm": 3.3039529791986375,
"learning_rate": 9.463986657512254e-06,
"loss": 0.2351,
"step": 177
},
{
"epoch": 0.2991596638655462,
"grad_norm": 3.501475550122351,
"learning_rate": 9.458025069157563e-06,
"loss": 0.2393,
"step": 178
},
{
"epoch": 0.30084033613445377,
"grad_norm": 2.96105228086886,
"learning_rate": 9.452032410314709e-06,
"loss": 0.1883,
"step": 179
},
{
"epoch": 0.3025210084033613,
"grad_norm": 2.680755297383461,
"learning_rate": 9.446008722749906e-06,
"loss": 0.1813,
"step": 180
},
{
"epoch": 0.3042016806722689,
"grad_norm": 4.321387391660858,
"learning_rate": 9.439954048445628e-06,
"loss": 0.2939,
"step": 181
},
{
"epoch": 0.3058823529411765,
"grad_norm": 3.0370917327024367,
"learning_rate": 9.43386842960031e-06,
"loss": 0.2264,
"step": 182
},
{
"epoch": 0.30756302521008405,
"grad_norm": 3.310352207725729,
"learning_rate": 9.427751908628059e-06,
"loss": 0.2324,
"step": 183
},
{
"epoch": 0.3092436974789916,
"grad_norm": 4.805046115785464,
"learning_rate": 9.421604528158355e-06,
"loss": 0.3299,
"step": 184
},
{
"epoch": 0.31092436974789917,
"grad_norm": 4.739559492527739,
"learning_rate": 9.415426331035754e-06,
"loss": 0.3299,
"step": 185
},
{
"epoch": 0.3126050420168067,
"grad_norm": 4.490880194826058,
"learning_rate": 9.409217360319594e-06,
"loss": 0.3312,
"step": 186
},
{
"epoch": 0.3142857142857143,
"grad_norm": 4.700636400248507,
"learning_rate": 9.40297765928369e-06,
"loss": 0.3269,
"step": 187
},
{
"epoch": 0.31596638655462184,
"grad_norm": 4.776516740186767,
"learning_rate": 9.396707271416035e-06,
"loss": 0.4301,
"step": 188
},
{
"epoch": 0.3176470588235294,
"grad_norm": 4.1685832195956465,
"learning_rate": 9.39040624041849e-06,
"loss": 0.2683,
"step": 189
},
{
"epoch": 0.31932773109243695,
"grad_norm": 3.746172051394499,
"learning_rate": 9.384074610206495e-06,
"loss": 0.2646,
"step": 190
},
{
"epoch": 0.32100840336134456,
"grad_norm": 4.379779974988923,
"learning_rate": 9.377712424908743e-06,
"loss": 0.2967,
"step": 191
},
{
"epoch": 0.3226890756302521,
"grad_norm": 4.319433814312415,
"learning_rate": 9.371319728866892e-06,
"loss": 0.3311,
"step": 192
},
{
"epoch": 0.3243697478991597,
"grad_norm": 4.861684100170174,
"learning_rate": 9.36489656663524e-06,
"loss": 0.354,
"step": 193
},
{
"epoch": 0.32605042016806723,
"grad_norm": 4.50859409722337,
"learning_rate": 9.35844298298042e-06,
"loss": 0.3593,
"step": 194
},
{
"epoch": 0.3277310924369748,
"grad_norm": 3.5195246430906493,
"learning_rate": 9.351959022881098e-06,
"loss": 0.2799,
"step": 195
},
{
"epoch": 0.32941176470588235,
"grad_norm": 4.0869900443685045,
"learning_rate": 9.345444731527642e-06,
"loss": 0.2529,
"step": 196
},
{
"epoch": 0.3310924369747899,
"grad_norm": 3.546421899021016,
"learning_rate": 9.338900154321818e-06,
"loss": 0.2463,
"step": 197
},
{
"epoch": 0.33277310924369746,
"grad_norm": 3.9484095318347627,
"learning_rate": 9.332325336876472e-06,
"loss": 0.2371,
"step": 198
},
{
"epoch": 0.334453781512605,
"grad_norm": 4.483956700532791,
"learning_rate": 9.325720325015211e-06,
"loss": 0.3235,
"step": 199
},
{
"epoch": 0.33613445378151263,
"grad_norm": 4.211527014803536,
"learning_rate": 9.319085164772082e-06,
"loss": 0.2591,
"step": 200
},
{
"epoch": 0.3378151260504202,
"grad_norm": 3.622584139095102,
"learning_rate": 9.312419902391256e-06,
"loss": 0.2483,
"step": 201
},
{
"epoch": 0.33949579831932775,
"grad_norm": 3.88188639438146,
"learning_rate": 9.305724584326702e-06,
"loss": 0.2586,
"step": 202
},
{
"epoch": 0.3411764705882353,
"grad_norm": 3.9405976888051475,
"learning_rate": 9.298999257241862e-06,
"loss": 0.2274,
"step": 203
},
{
"epoch": 0.34285714285714286,
"grad_norm": 4.146224854604681,
"learning_rate": 9.292243968009332e-06,
"loss": 0.2594,
"step": 204
},
{
"epoch": 0.3445378151260504,
"grad_norm": 3.989185944129595,
"learning_rate": 9.285458763710524e-06,
"loss": 0.2683,
"step": 205
},
{
"epoch": 0.346218487394958,
"grad_norm": 5.608784350311355,
"learning_rate": 9.278643691635352e-06,
"loss": 0.3709,
"step": 206
},
{
"epoch": 0.34789915966386553,
"grad_norm": 5.532967872431193,
"learning_rate": 9.271798799281893e-06,
"loss": 0.297,
"step": 207
},
{
"epoch": 0.3495798319327731,
"grad_norm": 4.155027058077883,
"learning_rate": 9.264924134356057e-06,
"loss": 0.2531,
"step": 208
},
{
"epoch": 0.35126050420168065,
"grad_norm": 3.754241120066514,
"learning_rate": 9.258019744771256e-06,
"loss": 0.2435,
"step": 209
},
{
"epoch": 0.35294117647058826,
"grad_norm": 4.052985466968748,
"learning_rate": 9.251085678648072e-06,
"loss": 0.3153,
"step": 210
},
{
"epoch": 0.3546218487394958,
"grad_norm": 4.148768432508325,
"learning_rate": 9.244121984313916e-06,
"loss": 0.2732,
"step": 211
},
{
"epoch": 0.3563025210084034,
"grad_norm": 5.068122975449146,
"learning_rate": 9.2371287103027e-06,
"loss": 0.3473,
"step": 212
},
{
"epoch": 0.35798319327731093,
"grad_norm": 4.006936512374535,
"learning_rate": 9.23010590535449e-06,
"loss": 0.315,
"step": 213
},
{
"epoch": 0.3596638655462185,
"grad_norm": 3.547781254170819,
"learning_rate": 9.223053618415168e-06,
"loss": 0.2365,
"step": 214
},
{
"epoch": 0.36134453781512604,
"grad_norm": 4.36880445829326,
"learning_rate": 9.215971898636094e-06,
"loss": 0.2967,
"step": 215
},
{
"epoch": 0.3630252100840336,
"grad_norm": 4.496167262693602,
"learning_rate": 9.208860795373765e-06,
"loss": 0.2878,
"step": 216
},
{
"epoch": 0.36470588235294116,
"grad_norm": 3.887663796616842,
"learning_rate": 9.201720358189464e-06,
"loss": 0.2525,
"step": 217
},
{
"epoch": 0.3663865546218487,
"grad_norm": 4.233661308098312,
"learning_rate": 9.194550636848923e-06,
"loss": 0.2519,
"step": 218
},
{
"epoch": 0.3680672268907563,
"grad_norm": 4.54484885962988,
"learning_rate": 9.187351681321965e-06,
"loss": 0.3203,
"step": 219
},
{
"epoch": 0.3697478991596639,
"grad_norm": 3.4662216920786424,
"learning_rate": 9.180123541782172e-06,
"loss": 0.2214,
"step": 220
},
{
"epoch": 0.37142857142857144,
"grad_norm": 3.5300962901500506,
"learning_rate": 9.172866268606514e-06,
"loss": 0.2963,
"step": 221
},
{
"epoch": 0.373109243697479,
"grad_norm": 4.206031375820797,
"learning_rate": 9.16557991237502e-06,
"loss": 0.2203,
"step": 222
},
{
"epoch": 0.37478991596638656,
"grad_norm": 3.757840416808929,
"learning_rate": 9.158264523870413e-06,
"loss": 0.3239,
"step": 223
},
{
"epoch": 0.3764705882352941,
"grad_norm": 3.597090675392685,
"learning_rate": 9.150920154077753e-06,
"loss": 0.2665,
"step": 224
},
{
"epoch": 0.37815126050420167,
"grad_norm": 4.446089735200772,
"learning_rate": 9.143546854184095e-06,
"loss": 0.2796,
"step": 225
},
{
"epoch": 0.3798319327731092,
"grad_norm": 4.073434401093653,
"learning_rate": 9.136144675578114e-06,
"loss": 0.3209,
"step": 226
},
{
"epoch": 0.3815126050420168,
"grad_norm": 5.293045962034339,
"learning_rate": 9.128713669849767e-06,
"loss": 0.2809,
"step": 227
},
{
"epoch": 0.3831932773109244,
"grad_norm": 3.680410107923094,
"learning_rate": 9.121253888789916e-06,
"loss": 0.2351,
"step": 228
},
{
"epoch": 0.38487394957983195,
"grad_norm": 4.088899341235213,
"learning_rate": 9.113765384389984e-06,
"loss": 0.241,
"step": 229
},
{
"epoch": 0.3865546218487395,
"grad_norm": 4.210493690445477,
"learning_rate": 9.106248208841568e-06,
"loss": 0.2192,
"step": 230
},
{
"epoch": 0.38823529411764707,
"grad_norm": 3.587405660068709,
"learning_rate": 9.098702414536107e-06,
"loss": 0.2711,
"step": 231
},
{
"epoch": 0.3899159663865546,
"grad_norm": 3.825397280710117,
"learning_rate": 9.091128054064487e-06,
"loss": 0.3151,
"step": 232
},
{
"epoch": 0.3915966386554622,
"grad_norm": 4.285084366810055,
"learning_rate": 9.083525180216697e-06,
"loss": 0.3844,
"step": 233
},
{
"epoch": 0.39327731092436974,
"grad_norm": 4.755201193219765,
"learning_rate": 9.075893845981445e-06,
"loss": 0.3369,
"step": 234
},
{
"epoch": 0.3949579831932773,
"grad_norm": 4.281159237147637,
"learning_rate": 9.0682341045458e-06,
"loss": 0.2796,
"step": 235
},
{
"epoch": 0.39663865546218485,
"grad_norm": 4.368844736645276,
"learning_rate": 9.060546009294818e-06,
"loss": 0.2487,
"step": 236
},
{
"epoch": 0.3983193277310924,
"grad_norm": 4.6683321618853135,
"learning_rate": 9.05282961381116e-06,
"loss": 0.3492,
"step": 237
},
{
"epoch": 0.4,
"grad_norm": 5.315996126677612,
"learning_rate": 9.045084971874738e-06,
"loss": 0.3656,
"step": 238
},
{
"epoch": 0.4016806722689076,
"grad_norm": 3.858049576778821,
"learning_rate": 9.037312137462323e-06,
"loss": 0.2944,
"step": 239
},
{
"epoch": 0.40336134453781514,
"grad_norm": 3.759649106675766,
"learning_rate": 9.029511164747175e-06,
"loss": 0.2459,
"step": 240
},
{
"epoch": 0.4050420168067227,
"grad_norm": 3.202243641118359,
"learning_rate": 9.021682108098671e-06,
"loss": 0.2363,
"step": 241
},
{
"epoch": 0.40672268907563025,
"grad_norm": 3.6191519137994645,
"learning_rate": 9.013825022081915e-06,
"loss": 0.2047,
"step": 242
},
{
"epoch": 0.4084033613445378,
"grad_norm": 3.6644452904006033,
"learning_rate": 9.005939961457366e-06,
"loss": 0.262,
"step": 243
},
{
"epoch": 0.41008403361344536,
"grad_norm": 3.7496680659825086,
"learning_rate": 8.998026981180454e-06,
"loss": 0.2771,
"step": 244
},
{
"epoch": 0.4117647058823529,
"grad_norm": 3.8508524533519846,
"learning_rate": 8.990086136401199e-06,
"loss": 0.2579,
"step": 245
},
{
"epoch": 0.4134453781512605,
"grad_norm": 3.7111410813135364,
"learning_rate": 8.982117482463817e-06,
"loss": 0.2416,
"step": 246
},
{
"epoch": 0.4151260504201681,
"grad_norm": 3.989630255920349,
"learning_rate": 8.97412107490635e-06,
"loss": 0.298,
"step": 247
},
{
"epoch": 0.41680672268907565,
"grad_norm": 4.732255647732948,
"learning_rate": 8.966096969460263e-06,
"loss": 0.32,
"step": 248
},
{
"epoch": 0.4184873949579832,
"grad_norm": 4.411373768817184,
"learning_rate": 8.958045222050073e-06,
"loss": 0.2691,
"step": 249
},
{
"epoch": 0.42016806722689076,
"grad_norm": 3.1311996398393442,
"learning_rate": 8.94996588879294e-06,
"loss": 0.221,
"step": 250
},
{
"epoch": 0.4218487394957983,
"grad_norm": 4.469878722633478,
"learning_rate": 8.94185902599829e-06,
"loss": 0.2651,
"step": 251
},
{
"epoch": 0.4235294117647059,
"grad_norm": 4.365076580209745,
"learning_rate": 8.933724690167417e-06,
"loss": 0.2595,
"step": 252
},
{
"epoch": 0.42521008403361343,
"grad_norm": 4.633047058160098,
"learning_rate": 8.92556293799309e-06,
"loss": 0.3017,
"step": 253
},
{
"epoch": 0.426890756302521,
"grad_norm": 5.2053731159390155,
"learning_rate": 8.917373826359156e-06,
"loss": 0.299,
"step": 254
},
{
"epoch": 0.42857142857142855,
"grad_norm": 4.447949765493168,
"learning_rate": 8.90915741234015e-06,
"loss": 0.3193,
"step": 255
},
{
"epoch": 0.43025210084033616,
"grad_norm": 3.8152143367175513,
"learning_rate": 8.900913753200887e-06,
"loss": 0.3044,
"step": 256
},
{
"epoch": 0.4319327731092437,
"grad_norm": 4.174809354409252,
"learning_rate": 8.892642906396076e-06,
"loss": 0.2842,
"step": 257
},
{
"epoch": 0.4336134453781513,
"grad_norm": 3.77208471800023,
"learning_rate": 8.884344929569905e-06,
"loss": 0.2609,
"step": 258
},
{
"epoch": 0.43529411764705883,
"grad_norm": 4.447034923610601,
"learning_rate": 8.87601988055565e-06,
"loss": 0.2726,
"step": 259
},
{
"epoch": 0.4369747899159664,
"grad_norm": 5.8600808097772195,
"learning_rate": 8.867667817375266e-06,
"loss": 0.346,
"step": 260
},
{
"epoch": 0.43865546218487395,
"grad_norm": 4.540496751153247,
"learning_rate": 8.859288798238988e-06,
"loss": 0.3182,
"step": 261
},
{
"epoch": 0.4403361344537815,
"grad_norm": 2.8122638765624397,
"learning_rate": 8.850882881544923e-06,
"loss": 0.1844,
"step": 262
},
{
"epoch": 0.44201680672268906,
"grad_norm": 3.5705823570496684,
"learning_rate": 8.842450125878634e-06,
"loss": 0.2554,
"step": 263
},
{
"epoch": 0.4436974789915966,
"grad_norm": 3.455189753673065,
"learning_rate": 8.833990590012749e-06,
"loss": 0.2733,
"step": 264
},
{
"epoch": 0.44537815126050423,
"grad_norm": 4.535794944046676,
"learning_rate": 8.825504332906542e-06,
"loss": 0.3123,
"step": 265
},
{
"epoch": 0.4470588235294118,
"grad_norm": 4.501300539384694,
"learning_rate": 8.816991413705515e-06,
"loss": 0.3499,
"step": 266
},
{
"epoch": 0.44873949579831934,
"grad_norm": 3.772508646537353,
"learning_rate": 8.808451891741001e-06,
"loss": 0.2276,
"step": 267
},
{
"epoch": 0.4504201680672269,
"grad_norm": 4.46019811065151,
"learning_rate": 8.799885826529736e-06,
"loss": 0.3519,
"step": 268
},
{
"epoch": 0.45210084033613446,
"grad_norm": 3.836911860530734,
"learning_rate": 8.79129327777346e-06,
"loss": 0.311,
"step": 269
},
{
"epoch": 0.453781512605042,
"grad_norm": 4.17176003502619,
"learning_rate": 8.782674305358481e-06,
"loss": 0.43,
"step": 270
},
{
"epoch": 0.45546218487394957,
"grad_norm": 4.563428438365214,
"learning_rate": 8.774028969355273e-06,
"loss": 0.3458,
"step": 271
},
{
"epoch": 0.45714285714285713,
"grad_norm": 3.1154650936868693,
"learning_rate": 8.765357330018056e-06,
"loss": 0.2571,
"step": 272
},
{
"epoch": 0.4588235294117647,
"grad_norm": 4.122139441670632,
"learning_rate": 8.756659447784367e-06,
"loss": 0.262,
"step": 273
},
{
"epoch": 0.46050420168067224,
"grad_norm": 3.8315294608965855,
"learning_rate": 8.74793538327465e-06,
"loss": 0.2821,
"step": 274
},
{
"epoch": 0.46218487394957986,
"grad_norm": 3.486277326170669,
"learning_rate": 8.739185197291824e-06,
"loss": 0.2609,
"step": 275
},
{
"epoch": 0.4638655462184874,
"grad_norm": 3.9161973496705804,
"learning_rate": 8.730408950820864e-06,
"loss": 0.2659,
"step": 276
},
{
"epoch": 0.46554621848739497,
"grad_norm": 3.9755393105316026,
"learning_rate": 8.721606705028376e-06,
"loss": 0.3386,
"step": 277
},
{
"epoch": 0.4672268907563025,
"grad_norm": 4.113828914872,
"learning_rate": 8.71277852126217e-06,
"loss": 0.3051,
"step": 278
},
{
"epoch": 0.4689075630252101,
"grad_norm": 4.36471703505004,
"learning_rate": 8.703924461050832e-06,
"loss": 0.3559,
"step": 279
},
{
"epoch": 0.47058823529411764,
"grad_norm": 3.2200963840010313,
"learning_rate": 8.695044586103297e-06,
"loss": 0.2411,
"step": 280
},
{
"epoch": 0.4722689075630252,
"grad_norm": 3.703533034697428,
"learning_rate": 8.686138958308415e-06,
"loss": 0.2494,
"step": 281
},
{
"epoch": 0.47394957983193275,
"grad_norm": 4.893742503645441,
"learning_rate": 8.67720763973452e-06,
"loss": 0.3223,
"step": 282
},
{
"epoch": 0.4756302521008403,
"grad_norm": 3.4635478444534953,
"learning_rate": 8.668250692629008e-06,
"loss": 0.2756,
"step": 283
},
{
"epoch": 0.4773109243697479,
"grad_norm": 3.6201745404997614,
"learning_rate": 8.659268179417886e-06,
"loss": 0.2908,
"step": 284
},
{
"epoch": 0.4789915966386555,
"grad_norm": 3.782243262021975,
"learning_rate": 8.65026016270535e-06,
"loss": 0.234,
"step": 285
},
{
"epoch": 0.48067226890756304,
"grad_norm": 3.5478171634722813,
"learning_rate": 8.641226705273344e-06,
"loss": 0.2861,
"step": 286
},
{
"epoch": 0.4823529411764706,
"grad_norm": 4.037626445390373,
"learning_rate": 8.632167870081122e-06,
"loss": 0.2432,
"step": 287
},
{
"epoch": 0.48403361344537815,
"grad_norm": 3.2184403921642213,
"learning_rate": 8.623083720264806e-06,
"loss": 0.2363,
"step": 288
},
{
"epoch": 0.4857142857142857,
"grad_norm": 5.052805269729211,
"learning_rate": 8.613974319136959e-06,
"loss": 0.3165,
"step": 289
},
{
"epoch": 0.48739495798319327,
"grad_norm": 3.218456415544736,
"learning_rate": 8.604839730186125e-06,
"loss": 0.2267,
"step": 290
},
{
"epoch": 0.4890756302521008,
"grad_norm": 4.252440000738233,
"learning_rate": 8.595680017076403e-06,
"loss": 0.2729,
"step": 291
},
{
"epoch": 0.4907563025210084,
"grad_norm": 3.1859679446297022,
"learning_rate": 8.586495243646992e-06,
"loss": 0.251,
"step": 292
},
{
"epoch": 0.492436974789916,
"grad_norm": 3.66415139207086,
"learning_rate": 8.577285473911753e-06,
"loss": 0.2324,
"step": 293
},
{
"epoch": 0.49411764705882355,
"grad_norm": 4.099894312537824,
"learning_rate": 8.568050772058763e-06,
"loss": 0.2386,
"step": 294
},
{
"epoch": 0.4957983193277311,
"grad_norm": 3.9209399542308607,
"learning_rate": 8.558791202449857e-06,
"loss": 0.3748,
"step": 295
},
{
"epoch": 0.49747899159663866,
"grad_norm": 3.3376430249824636,
"learning_rate": 8.549506829620193e-06,
"loss": 0.2286,
"step": 296
},
{
"epoch": 0.4991596638655462,
"grad_norm": 3.062020828679799,
"learning_rate": 8.540197718277797e-06,
"loss": 0.2708,
"step": 297
},
{
"epoch": 0.5008403361344538,
"grad_norm": 3.1937828441308334,
"learning_rate": 8.530863933303108e-06,
"loss": 0.2086,
"step": 298
},
{
"epoch": 0.5025210084033613,
"grad_norm": 3.521499964472593,
"learning_rate": 8.521505539748535e-06,
"loss": 0.2447,
"step": 299
},
{
"epoch": 0.5042016806722689,
"grad_norm": 4.642967511496677,
"learning_rate": 8.512122602837993e-06,
"loss": 0.2472,
"step": 300
},
{
"epoch": 0.5058823529411764,
"grad_norm": 4.284360904697484,
"learning_rate": 8.502715187966455e-06,
"loss": 0.3206,
"step": 301
},
{
"epoch": 0.507563025210084,
"grad_norm": 3.465500865378122,
"learning_rate": 8.493283360699496e-06,
"loss": 0.2294,
"step": 302
},
{
"epoch": 0.5092436974789916,
"grad_norm": 3.4457509437221177,
"learning_rate": 8.483827186772832e-06,
"loss": 0.2912,
"step": 303
},
{
"epoch": 0.5109243697478991,
"grad_norm": 3.808581617500537,
"learning_rate": 8.47434673209187e-06,
"loss": 0.2446,
"step": 304
},
{
"epoch": 0.5126050420168067,
"grad_norm": 3.3314215181937095,
"learning_rate": 8.464842062731235e-06,
"loss": 0.2212,
"step": 305
},
{
"epoch": 0.5142857142857142,
"grad_norm": 4.039124686139426,
"learning_rate": 8.455313244934324e-06,
"loss": 0.2455,
"step": 306
},
{
"epoch": 0.5159663865546219,
"grad_norm": 4.193722450586983,
"learning_rate": 8.445760345112836e-06,
"loss": 0.2481,
"step": 307
},
{
"epoch": 0.5176470588235295,
"grad_norm": 3.889796309337689,
"learning_rate": 8.436183429846314e-06,
"loss": 0.2859,
"step": 308
},
{
"epoch": 0.519327731092437,
"grad_norm": 3.9169255433081664,
"learning_rate": 8.426582565881674e-06,
"loss": 0.263,
"step": 309
},
{
"epoch": 0.5210084033613446,
"grad_norm": 3.418188058193833,
"learning_rate": 8.416957820132743e-06,
"loss": 0.2356,
"step": 310
},
{
"epoch": 0.5226890756302521,
"grad_norm": 3.2839702099179067,
"learning_rate": 8.407309259679801e-06,
"loss": 0.2555,
"step": 311
},
{
"epoch": 0.5243697478991597,
"grad_norm": 3.9981862118067957,
"learning_rate": 8.397636951769099e-06,
"loss": 0.2468,
"step": 312
},
{
"epoch": 0.5260504201680672,
"grad_norm": 4.1369216765373595,
"learning_rate": 8.387940963812398e-06,
"loss": 0.2481,
"step": 313
},
{
"epoch": 0.5277310924369748,
"grad_norm": 4.270925434895349,
"learning_rate": 8.378221363386506e-06,
"loss": 0.2558,
"step": 314
},
{
"epoch": 0.5294117647058824,
"grad_norm": 3.240664369248623,
"learning_rate": 8.368478218232787e-06,
"loss": 0.2583,
"step": 315
},
{
"epoch": 0.5310924369747899,
"grad_norm": 3.1933231590113142,
"learning_rate": 8.358711596256712e-06,
"loss": 0.215,
"step": 316
},
{
"epoch": 0.5327731092436975,
"grad_norm": 4.167983149331361,
"learning_rate": 8.348921565527373e-06,
"loss": 0.3324,
"step": 317
},
{
"epoch": 0.534453781512605,
"grad_norm": 3.4358411113634646,
"learning_rate": 8.339108194277006e-06,
"loss": 0.2589,
"step": 318
},
{
"epoch": 0.5361344537815126,
"grad_norm": 4.164325821445187,
"learning_rate": 8.329271550900528e-06,
"loss": 0.298,
"step": 319
},
{
"epoch": 0.5378151260504201,
"grad_norm": 3.5852432229900613,
"learning_rate": 8.319411703955042e-06,
"loss": 0.2575,
"step": 320
},
{
"epoch": 0.5394957983193277,
"grad_norm": 4.392211699734561,
"learning_rate": 8.309528722159383e-06,
"loss": 0.2755,
"step": 321
},
{
"epoch": 0.5411764705882353,
"grad_norm": 4.185705868785948,
"learning_rate": 8.299622674393615e-06,
"loss": 0.2893,
"step": 322
},
{
"epoch": 0.5428571428571428,
"grad_norm": 4.547143925765926,
"learning_rate": 8.289693629698564e-06,
"loss": 0.3375,
"step": 323
},
{
"epoch": 0.5445378151260504,
"grad_norm": 3.9462639969766977,
"learning_rate": 8.27974165727534e-06,
"loss": 0.3053,
"step": 324
},
{
"epoch": 0.5462184873949579,
"grad_norm": 3.972277144129639,
"learning_rate": 8.269766826484841e-06,
"loss": 0.2827,
"step": 325
},
{
"epoch": 0.5478991596638656,
"grad_norm": 3.7091300891124526,
"learning_rate": 8.259769206847286e-06,
"loss": 0.2751,
"step": 326
},
{
"epoch": 0.5495798319327732,
"grad_norm": 3.8728056075092736,
"learning_rate": 8.249748868041717e-06,
"loss": 0.2629,
"step": 327
},
{
"epoch": 0.5512605042016807,
"grad_norm": 3.230198623667927,
"learning_rate": 8.239705879905519e-06,
"loss": 0.2557,
"step": 328
},
{
"epoch": 0.5529411764705883,
"grad_norm": 4.162100326672927,
"learning_rate": 8.229640312433938e-06,
"loss": 0.2249,
"step": 329
},
{
"epoch": 0.5546218487394958,
"grad_norm": 3.030431828388766,
"learning_rate": 8.219552235779578e-06,
"loss": 0.2382,
"step": 330
},
{
"epoch": 0.5563025210084034,
"grad_norm": 3.2618326842590446,
"learning_rate": 8.209441720251934e-06,
"loss": 0.2562,
"step": 331
},
{
"epoch": 0.5579831932773109,
"grad_norm": 4.60003136005453,
"learning_rate": 8.199308836316883e-06,
"loss": 0.2635,
"step": 332
},
{
"epoch": 0.5596638655462185,
"grad_norm": 5.124504363614122,
"learning_rate": 8.189153654596199e-06,
"loss": 0.2848,
"step": 333
},
{
"epoch": 0.561344537815126,
"grad_norm": 3.093400315613663,
"learning_rate": 8.178976245867068e-06,
"loss": 0.2058,
"step": 334
},
{
"epoch": 0.5630252100840336,
"grad_norm": 3.393259880948687,
"learning_rate": 8.168776681061583e-06,
"loss": 0.2009,
"step": 335
},
{
"epoch": 0.5647058823529412,
"grad_norm": 3.9012793474429226,
"learning_rate": 8.158555031266255e-06,
"loss": 0.2728,
"step": 336
},
{
"epoch": 0.5663865546218487,
"grad_norm": 4.010363842008162,
"learning_rate": 8.148311367721524e-06,
"loss": 0.2487,
"step": 337
},
{
"epoch": 0.5680672268907563,
"grad_norm": 3.313090386559345,
"learning_rate": 8.138045761821252e-06,
"loss": 0.2392,
"step": 338
},
{
"epoch": 0.5697478991596638,
"grad_norm": 3.003737875930333,
"learning_rate": 8.127758285112226e-06,
"loss": 0.196,
"step": 339
},
{
"epoch": 0.5714285714285714,
"grad_norm": 4.813841107800995,
"learning_rate": 8.117449009293668e-06,
"loss": 0.2345,
"step": 340
},
{
"epoch": 0.573109243697479,
"grad_norm": 3.820262645541521,
"learning_rate": 8.107118006216732e-06,
"loss": 0.3774,
"step": 341
},
{
"epoch": 0.5747899159663865,
"grad_norm": 3.5798699267278256,
"learning_rate": 8.096765347883995e-06,
"loss": 0.323,
"step": 342
},
{
"epoch": 0.5764705882352941,
"grad_norm": 3.7918963101834695,
"learning_rate": 8.086391106448965e-06,
"loss": 0.2771,
"step": 343
},
{
"epoch": 0.5781512605042017,
"grad_norm": 3.710871164871179,
"learning_rate": 8.075995354215578e-06,
"loss": 0.2296,
"step": 344
},
{
"epoch": 0.5798319327731093,
"grad_norm": 3.081675164930532,
"learning_rate": 8.065578163637686e-06,
"loss": 0.239,
"step": 345
},
{
"epoch": 0.5815126050420169,
"grad_norm": 3.472850199568395,
"learning_rate": 8.055139607318558e-06,
"loss": 0.2581,
"step": 346
},
{
"epoch": 0.5831932773109244,
"grad_norm": 3.5895910105236988,
"learning_rate": 8.044679758010376e-06,
"loss": 0.25,
"step": 347
},
{
"epoch": 0.584873949579832,
"grad_norm": 5.03178448321192,
"learning_rate": 8.03419868861372e-06,
"loss": 0.1916,
"step": 348
},
{
"epoch": 0.5865546218487395,
"grad_norm": 3.821763309544136,
"learning_rate": 8.023696472177068e-06,
"loss": 0.2909,
"step": 349
},
{
"epoch": 0.5882352941176471,
"grad_norm": 4.0276823826082095,
"learning_rate": 8.013173181896283e-06,
"loss": 0.2604,
"step": 350
},
{
"epoch": 0.5899159663865546,
"grad_norm": 2.9398477769509705,
"learning_rate": 8.002628891114104e-06,
"loss": 0.2271,
"step": 351
},
{
"epoch": 0.5915966386554622,
"grad_norm": 4.554336270535579,
"learning_rate": 7.992063673319632e-06,
"loss": 0.3412,
"step": 352
},
{
"epoch": 0.5932773109243697,
"grad_norm": 4.154201529208768,
"learning_rate": 7.981477602147823e-06,
"loss": 0.265,
"step": 353
},
{
"epoch": 0.5949579831932773,
"grad_norm": 2.665436237812907,
"learning_rate": 7.97087075137897e-06,
"loss": 0.1938,
"step": 354
},
{
"epoch": 0.5966386554621849,
"grad_norm": 3.3626195738967577,
"learning_rate": 7.960243194938192e-06,
"loss": 0.228,
"step": 355
},
{
"epoch": 0.5983193277310924,
"grad_norm": 3.8286004476475335,
"learning_rate": 7.949595006894917e-06,
"loss": 0.257,
"step": 356
},
{
"epoch": 0.6,
"grad_norm": 3.0558429385007275,
"learning_rate": 7.938926261462366e-06,
"loss": 0.3105,
"step": 357
},
{
"epoch": 0.6016806722689075,
"grad_norm": 4.545539695509062,
"learning_rate": 7.928237032997037e-06,
"loss": 0.2787,
"step": 358
},
{
"epoch": 0.6033613445378151,
"grad_norm": 3.4144872596036326,
"learning_rate": 7.917527395998183e-06,
"loss": 0.2107,
"step": 359
},
{
"epoch": 0.6050420168067226,
"grad_norm": 3.197576947653472,
"learning_rate": 7.9067974251073e-06,
"loss": 0.2402,
"step": 360
},
{
"epoch": 0.6067226890756302,
"grad_norm": 4.953522954492192,
"learning_rate": 7.896047195107599e-06,
"loss": 0.4079,
"step": 361
},
{
"epoch": 0.6084033613445378,
"grad_norm": 3.913644693006596,
"learning_rate": 7.885276780923488e-06,
"loss": 0.2446,
"step": 362
},
{
"epoch": 0.6100840336134454,
"grad_norm": 4.162061152262566,
"learning_rate": 7.87448625762005e-06,
"loss": 0.2775,
"step": 363
},
{
"epoch": 0.611764705882353,
"grad_norm": 3.9118577002779915,
"learning_rate": 7.863675700402527e-06,
"loss": 0.2945,
"step": 364
},
{
"epoch": 0.6134453781512605,
"grad_norm": 3.835742001918319,
"learning_rate": 7.852845184615776e-06,
"loss": 0.3243,
"step": 365
},
{
"epoch": 0.6151260504201681,
"grad_norm": 3.811064930664159,
"learning_rate": 7.841994785743765e-06,
"loss": 0.275,
"step": 366
},
{
"epoch": 0.6168067226890757,
"grad_norm": 3.638636507424936,
"learning_rate": 7.831124579409036e-06,
"loss": 0.2385,
"step": 367
},
{
"epoch": 0.6184873949579832,
"grad_norm": 3.7590755941327263,
"learning_rate": 7.820234641372182e-06,
"loss": 0.2615,
"step": 368
},
{
"epoch": 0.6201680672268908,
"grad_norm": 4.262512697571422,
"learning_rate": 7.809325047531315e-06,
"loss": 0.2305,
"step": 369
},
{
"epoch": 0.6218487394957983,
"grad_norm": 3.5789208550854137,
"learning_rate": 7.798395873921542e-06,
"loss": 0.2829,
"step": 370
},
{
"epoch": 0.6235294117647059,
"grad_norm": 3.6770787617731657,
"learning_rate": 7.787447196714428e-06,
"loss": 0.3103,
"step": 371
},
{
"epoch": 0.6252100840336134,
"grad_norm": 4.681057793290432,
"learning_rate": 7.776479092217475e-06,
"loss": 0.3245,
"step": 372
},
{
"epoch": 0.626890756302521,
"grad_norm": 4.585623869647911,
"learning_rate": 7.76549163687358e-06,
"loss": 0.2892,
"step": 373
},
{
"epoch": 0.6285714285714286,
"grad_norm": 3.820748783778272,
"learning_rate": 7.754484907260513e-06,
"loss": 0.2256,
"step": 374
},
{
"epoch": 0.6302521008403361,
"grad_norm": 3.9111468680258086,
"learning_rate": 7.743458980090371e-06,
"loss": 0.2549,
"step": 375
},
{
"epoch": 0.6319327731092437,
"grad_norm": 3.1067953029914888,
"learning_rate": 7.73241393220905e-06,
"loss": 0.2132,
"step": 376
},
{
"epoch": 0.6336134453781512,
"grad_norm": 3.164293144904975,
"learning_rate": 7.721349840595713e-06,
"loss": 0.2092,
"step": 377
},
{
"epoch": 0.6352941176470588,
"grad_norm": 3.734593960662017,
"learning_rate": 7.710266782362248e-06,
"loss": 0.2976,
"step": 378
},
{
"epoch": 0.6369747899159663,
"grad_norm": 3.1220699118832407,
"learning_rate": 7.69916483475273e-06,
"loss": 0.2478,
"step": 379
},
{
"epoch": 0.6386554621848739,
"grad_norm": 3.2574034724313177,
"learning_rate": 7.688044075142888e-06,
"loss": 0.1939,
"step": 380
},
{
"epoch": 0.6403361344537815,
"grad_norm": 3.4284207068175907,
"learning_rate": 7.676904581039559e-06,
"loss": 0.2866,
"step": 381
},
{
"epoch": 0.6420168067226891,
"grad_norm": 3.040455959047564,
"learning_rate": 7.665746430080155e-06,
"loss": 0.217,
"step": 382
},
{
"epoch": 0.6436974789915967,
"grad_norm": 3.224524280206461,
"learning_rate": 7.654569700032112e-06,
"loss": 0.2332,
"step": 383
},
{
"epoch": 0.6453781512605042,
"grad_norm": 3.6681207305304744,
"learning_rate": 7.643374468792364e-06,
"loss": 0.23,
"step": 384
},
{
"epoch": 0.6470588235294118,
"grad_norm": 2.76803712271111,
"learning_rate": 7.63216081438678e-06,
"loss": 0.1768,
"step": 385
},
{
"epoch": 0.6487394957983194,
"grad_norm": 3.172049754177004,
"learning_rate": 7.620928814969636e-06,
"loss": 0.2458,
"step": 386
},
{
"epoch": 0.6504201680672269,
"grad_norm": 4.021324325221757,
"learning_rate": 7.609678548823065e-06,
"loss": 0.2878,
"step": 387
},
{
"epoch": 0.6521008403361345,
"grad_norm": 4.45631664976017,
"learning_rate": 7.5984100943565055e-06,
"loss": 0.3669,
"step": 388
},
{
"epoch": 0.653781512605042,
"grad_norm": 4.350536114923283,
"learning_rate": 7.587123530106171e-06,
"loss": 0.3114,
"step": 389
},
{
"epoch": 0.6554621848739496,
"grad_norm": 4.110763780984955,
"learning_rate": 7.57581893473448e-06,
"loss": 0.3002,
"step": 390
},
{
"epoch": 0.6571428571428571,
"grad_norm": 3.824638074318695,
"learning_rate": 7.564496387029532e-06,
"loss": 0.2555,
"step": 391
},
{
"epoch": 0.6588235294117647,
"grad_norm": 4.166305509765287,
"learning_rate": 7.553155965904535e-06,
"loss": 0.2617,
"step": 392
},
{
"epoch": 0.6605042016806723,
"grad_norm": 4.290626462011601,
"learning_rate": 7.541797750397277e-06,
"loss": 0.2565,
"step": 393
},
{
"epoch": 0.6621848739495798,
"grad_norm": 3.688859506019526,
"learning_rate": 7.530421819669558e-06,
"loss": 0.2639,
"step": 394
},
{
"epoch": 0.6638655462184874,
"grad_norm": 3.6789226419918446,
"learning_rate": 7.519028253006649e-06,
"loss": 0.2741,
"step": 395
},
{
"epoch": 0.6655462184873949,
"grad_norm": 3.3382701043360647,
"learning_rate": 7.507617129816733e-06,
"loss": 0.2206,
"step": 396
},
{
"epoch": 0.6672268907563025,
"grad_norm": 3.3066950479168153,
"learning_rate": 7.496188529630359e-06,
"loss": 0.2993,
"step": 397
},
{
"epoch": 0.66890756302521,
"grad_norm": 3.893635706930087,
"learning_rate": 7.484742532099878e-06,
"loss": 0.2326,
"step": 398
},
{
"epoch": 0.6705882352941176,
"grad_norm": 4.099621838646233,
"learning_rate": 7.473279216998896e-06,
"loss": 0.2828,
"step": 399
},
{
"epoch": 0.6722689075630253,
"grad_norm": 5.661766299326251,
"learning_rate": 7.461798664221711e-06,
"loss": 0.2377,
"step": 400
},
{
"epoch": 0.6739495798319328,
"grad_norm": 2.870347406882497,
"learning_rate": 7.450300953782768e-06,
"loss": 0.151,
"step": 401
},
{
"epoch": 0.6756302521008404,
"grad_norm": 3.530732621066666,
"learning_rate": 7.438786165816084e-06,
"loss": 0.3204,
"step": 402
},
{
"epoch": 0.6773109243697479,
"grad_norm": 4.951946205032062,
"learning_rate": 7.427254380574705e-06,
"loss": 0.3073,
"step": 403
},
{
"epoch": 0.6789915966386555,
"grad_norm": 4.425642984242483,
"learning_rate": 7.415705678430138e-06,
"loss": 0.2526,
"step": 404
},
{
"epoch": 0.680672268907563,
"grad_norm": 4.575800850868975,
"learning_rate": 7.404140139871797e-06,
"loss": 0.3725,
"step": 405
},
{
"epoch": 0.6823529411764706,
"grad_norm": 4.0365196047687535,
"learning_rate": 7.392557845506433e-06,
"loss": 0.3002,
"step": 406
},
{
"epoch": 0.6840336134453782,
"grad_norm": 5.981792145534908,
"learning_rate": 7.380958876057581e-06,
"loss": 0.2435,
"step": 407
},
{
"epoch": 0.6857142857142857,
"grad_norm": 4.363575458393627,
"learning_rate": 7.369343312364994e-06,
"loss": 0.2212,
"step": 408
},
{
"epoch": 0.6873949579831933,
"grad_norm": 4.340605315487809,
"learning_rate": 7.357711235384079e-06,
"loss": 0.2507,
"step": 409
},
{
"epoch": 0.6890756302521008,
"grad_norm": 3.6485734876832274,
"learning_rate": 7.346062726185332e-06,
"loss": 0.2597,
"step": 410
},
{
"epoch": 0.6907563025210084,
"grad_norm": 4.0943244771611,
"learning_rate": 7.3343978659537775e-06,
"loss": 0.2611,
"step": 411
},
{
"epoch": 0.692436974789916,
"grad_norm": 3.5101040552518206,
"learning_rate": 7.3227167359883964e-06,
"loss": 0.2051,
"step": 412
},
{
"epoch": 0.6941176470588235,
"grad_norm": 4.3439206313504535,
"learning_rate": 7.311019417701567e-06,
"loss": 0.2752,
"step": 413
},
{
"epoch": 0.6957983193277311,
"grad_norm": 3.9550057956306093,
"learning_rate": 7.299305992618488e-06,
"loss": 0.2956,
"step": 414
},
{
"epoch": 0.6974789915966386,
"grad_norm": 3.1277606523059927,
"learning_rate": 7.287576542376616e-06,
"loss": 0.2466,
"step": 415
},
{
"epoch": 0.6991596638655462,
"grad_norm": 3.1005601888199292,
"learning_rate": 7.275831148725101e-06,
"loss": 0.2144,
"step": 416
},
{
"epoch": 0.7008403361344537,
"grad_norm": 3.9059268145411625,
"learning_rate": 7.264069893524207e-06,
"loss": 0.3389,
"step": 417
},
{
"epoch": 0.7025210084033613,
"grad_norm": 3.377596564727831,
"learning_rate": 7.252292858744747e-06,
"loss": 0.2157,
"step": 418
},
{
"epoch": 0.704201680672269,
"grad_norm": 4.187645692769235,
"learning_rate": 7.24050012646751e-06,
"loss": 0.3017,
"step": 419
},
{
"epoch": 0.7058823529411765,
"grad_norm": 3.610783613896857,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.2494,
"step": 420
},
{
"epoch": 0.7075630252100841,
"grad_norm": 3.048155560726766,
"learning_rate": 7.216867898289319e-06,
"loss": 0.2092,
"step": 421
},
{
"epoch": 0.7092436974789916,
"grad_norm": 3.416863835083985,
"learning_rate": 7.2050285670946776e-06,
"loss": 0.2012,
"step": 422
},
{
"epoch": 0.7109243697478992,
"grad_norm": 3.6205625981668983,
"learning_rate": 7.193173867813735e-06,
"loss": 0.1958,
"step": 423
},
{
"epoch": 0.7126050420168067,
"grad_norm": 3.2909907100719495,
"learning_rate": 7.181303883068569e-06,
"loss": 0.2534,
"step": 424
},
{
"epoch": 0.7142857142857143,
"grad_norm": 4.094896807207073,
"learning_rate": 7.169418695587791e-06,
"loss": 0.2834,
"step": 425
},
{
"epoch": 0.7159663865546219,
"grad_norm": 3.7570453300774975,
"learning_rate": 7.157518388205969e-06,
"loss": 0.2357,
"step": 426
},
{
"epoch": 0.7176470588235294,
"grad_norm": 4.34455688743933,
"learning_rate": 7.145603043863045e-06,
"loss": 0.4079,
"step": 427
},
{
"epoch": 0.719327731092437,
"grad_norm": 3.9605795228359453,
"learning_rate": 7.1336727456037716e-06,
"loss": 0.2724,
"step": 428
},
{
"epoch": 0.7210084033613445,
"grad_norm": 4.50690190414524,
"learning_rate": 7.121727576577116e-06,
"loss": 0.2715,
"step": 429
},
{
"epoch": 0.7226890756302521,
"grad_norm": 3.4305670733319635,
"learning_rate": 7.109767620035689e-06,
"loss": 0.2527,
"step": 430
},
{
"epoch": 0.7243697478991596,
"grad_norm": 3.7556549613232155,
"learning_rate": 7.097792959335169e-06,
"loss": 0.272,
"step": 431
},
{
"epoch": 0.7260504201680672,
"grad_norm": 4.331780366651023,
"learning_rate": 7.0858036779337095e-06,
"loss": 0.269,
"step": 432
},
{
"epoch": 0.7277310924369748,
"grad_norm": 2.9644967270995464,
"learning_rate": 7.0737998593913686e-06,
"loss": 0.2009,
"step": 433
},
{
"epoch": 0.7294117647058823,
"grad_norm": 3.424245481423386,
"learning_rate": 7.061781587369518e-06,
"loss": 0.1963,
"step": 434
},
{
"epoch": 0.7310924369747899,
"grad_norm": 3.5553808491409975,
"learning_rate": 7.049748945630269e-06,
"loss": 0.2204,
"step": 435
},
{
"epoch": 0.7327731092436974,
"grad_norm": 2.988384045794864,
"learning_rate": 7.037702018035878e-06,
"loss": 0.255,
"step": 436
},
{
"epoch": 0.7344537815126051,
"grad_norm": 3.877729077495976,
"learning_rate": 7.0256408885481694e-06,
"loss": 0.2379,
"step": 437
},
{
"epoch": 0.7361344537815127,
"grad_norm": 3.8152006801676293,
"learning_rate": 7.013565641227954e-06,
"loss": 0.2795,
"step": 438
},
{
"epoch": 0.7378151260504202,
"grad_norm": 3.8197950101640408,
"learning_rate": 7.001476360234428e-06,
"loss": 0.2441,
"step": 439
},
{
"epoch": 0.7394957983193278,
"grad_norm": 3.653213590928426,
"learning_rate": 6.989373129824605e-06,
"loss": 0.2333,
"step": 440
},
{
"epoch": 0.7411764705882353,
"grad_norm": 3.1988797850199986,
"learning_rate": 6.977256034352713e-06,
"loss": 0.1969,
"step": 441
},
{
"epoch": 0.7428571428571429,
"grad_norm": 3.6369472387733315,
"learning_rate": 6.965125158269619e-06,
"loss": 0.2039,
"step": 442
},
{
"epoch": 0.7445378151260504,
"grad_norm": 3.504132707054677,
"learning_rate": 6.952980586122231e-06,
"loss": 0.2294,
"step": 443
},
{
"epoch": 0.746218487394958,
"grad_norm": 3.9801905625390166,
"learning_rate": 6.940822402552914e-06,
"loss": 0.2557,
"step": 444
},
{
"epoch": 0.7478991596638656,
"grad_norm": 3.596393116732762,
"learning_rate": 6.928650692298898e-06,
"loss": 0.226,
"step": 445
},
{
"epoch": 0.7495798319327731,
"grad_norm": 3.0142886185199065,
"learning_rate": 6.916465540191692e-06,
"loss": 0.2098,
"step": 446
},
{
"epoch": 0.7512605042016807,
"grad_norm": 4.0997116691327316,
"learning_rate": 6.904267031156482e-06,
"loss": 0.2141,
"step": 447
},
{
"epoch": 0.7529411764705882,
"grad_norm": 3.2816445631622164,
"learning_rate": 6.892055250211552e-06,
"loss": 0.2118,
"step": 448
},
{
"epoch": 0.7546218487394958,
"grad_norm": 2.898884385242415,
"learning_rate": 6.879830282467682e-06,
"loss": 0.2249,
"step": 449
},
{
"epoch": 0.7563025210084033,
"grad_norm": 3.701834773955598,
"learning_rate": 6.867592213127559e-06,
"loss": 0.2975,
"step": 450
},
{
"epoch": 0.7579831932773109,
"grad_norm": 3.3905113341532824,
"learning_rate": 6.855341127485183e-06,
"loss": 0.2285,
"step": 451
},
{
"epoch": 0.7596638655462185,
"grad_norm": 4.432808727705175,
"learning_rate": 6.84307711092527e-06,
"loss": 0.2872,
"step": 452
},
{
"epoch": 0.761344537815126,
"grad_norm": 3.8277259460611974,
"learning_rate": 6.8308002489226645e-06,
"loss": 0.2294,
"step": 453
},
{
"epoch": 0.7630252100840336,
"grad_norm": 3.425696594498262,
"learning_rate": 6.81851062704173e-06,
"loss": 0.1943,
"step": 454
},
{
"epoch": 0.7647058823529411,
"grad_norm": 3.112228899870348,
"learning_rate": 6.806208330935766e-06,
"loss": 0.2217,
"step": 455
},
{
"epoch": 0.7663865546218488,
"grad_norm": 3.6268572961078602,
"learning_rate": 6.793893446346405e-06,
"loss": 0.207,
"step": 456
},
{
"epoch": 0.7680672268907563,
"grad_norm": 3.1270143931104,
"learning_rate": 6.7815660591030155e-06,
"loss": 0.2389,
"step": 457
},
{
"epoch": 0.7697478991596639,
"grad_norm": 3.343441373055428,
"learning_rate": 6.769226255122104e-06,
"loss": 0.1803,
"step": 458
},
{
"epoch": 0.7714285714285715,
"grad_norm": 3.881868079906461,
"learning_rate": 6.7568741204067145e-06,
"loss": 0.264,
"step": 459
},
{
"epoch": 0.773109243697479,
"grad_norm": 3.9170195071144396,
"learning_rate": 6.744509741045835e-06,
"loss": 0.2717,
"step": 460
},
{
"epoch": 0.7747899159663866,
"grad_norm": 3.597422471752963,
"learning_rate": 6.7321332032137875e-06,
"loss": 0.2373,
"step": 461
},
{
"epoch": 0.7764705882352941,
"grad_norm": 3.6326736789138248,
"learning_rate": 6.719744593169642e-06,
"loss": 0.2809,
"step": 462
},
{
"epoch": 0.7781512605042017,
"grad_norm": 3.2242024990634177,
"learning_rate": 6.7073439972565955e-06,
"loss": 0.2738,
"step": 463
},
{
"epoch": 0.7798319327731092,
"grad_norm": 4.097362370899167,
"learning_rate": 6.6949315019013895e-06,
"loss": 0.2422,
"step": 464
},
{
"epoch": 0.7815126050420168,
"grad_norm": 3.0129633820408532,
"learning_rate": 6.682507193613697e-06,
"loss": 0.2308,
"step": 465
},
{
"epoch": 0.7831932773109244,
"grad_norm": 3.3831749684902728,
"learning_rate": 6.670071158985521e-06,
"loss": 0.2536,
"step": 466
},
{
"epoch": 0.7848739495798319,
"grad_norm": 4.06561523313803,
"learning_rate": 6.657623484690593e-06,
"loss": 0.2635,
"step": 467
},
{
"epoch": 0.7865546218487395,
"grad_norm": 3.954232561509824,
"learning_rate": 6.645164257483767e-06,
"loss": 0.2617,
"step": 468
},
{
"epoch": 0.788235294117647,
"grad_norm": 3.2864328535168528,
"learning_rate": 6.6326935642004165e-06,
"loss": 0.2467,
"step": 469
},
{
"epoch": 0.7899159663865546,
"grad_norm": 4.756041222608582,
"learning_rate": 6.62021149175583e-06,
"loss": 0.3166,
"step": 470
},
{
"epoch": 0.7915966386554621,
"grad_norm": 3.2133137441191364,
"learning_rate": 6.607718127144601e-06,
"loss": 0.1957,
"step": 471
},
{
"epoch": 0.7932773109243697,
"grad_norm": 3.9946779468067057,
"learning_rate": 6.595213557440026e-06,
"loss": 0.2561,
"step": 472
},
{
"epoch": 0.7949579831932773,
"grad_norm": 4.178112022529024,
"learning_rate": 6.582697869793495e-06,
"loss": 0.2841,
"step": 473
},
{
"epoch": 0.7966386554621848,
"grad_norm": 3.831343610559717,
"learning_rate": 6.570171151433887e-06,
"loss": 0.204,
"step": 474
},
{
"epoch": 0.7983193277310925,
"grad_norm": 3.6832346179871545,
"learning_rate": 6.557633489666958e-06,
"loss": 0.189,
"step": 475
},
{
"epoch": 0.8,
"grad_norm": 2.917218404225396,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1833,
"step": 476
},
{
"epoch": 0.8016806722689076,
"grad_norm": 2.9993597464552337,
"learning_rate": 6.532525685514915e-06,
"loss": 0.221,
"step": 477
},
{
"epoch": 0.8033613445378152,
"grad_norm": 5.077471303123534,
"learning_rate": 6.519955718120231e-06,
"loss": 0.3785,
"step": 478
},
{
"epoch": 0.8050420168067227,
"grad_norm": 3.44058389211428,
"learning_rate": 6.507375157297872e-06,
"loss": 0.2357,
"step": 479
},
{
"epoch": 0.8067226890756303,
"grad_norm": 3.0384174021471475,
"learning_rate": 6.494784090728852e-06,
"loss": 0.2069,
"step": 480
},
{
"epoch": 0.8084033613445378,
"grad_norm": 4.2110013520640335,
"learning_rate": 6.482182606167409e-06,
"loss": 0.2847,
"step": 481
},
{
"epoch": 0.8100840336134454,
"grad_norm": 3.4294578019501825,
"learning_rate": 6.469570791440385e-06,
"loss": 0.1842,
"step": 482
},
{
"epoch": 0.8117647058823529,
"grad_norm": 3.609094506614432,
"learning_rate": 6.456948734446624e-06,
"loss": 0.2636,
"step": 483
},
{
"epoch": 0.8134453781512605,
"grad_norm": 3.4896585628244656,
"learning_rate": 6.444316523156352e-06,
"loss": 0.2057,
"step": 484
},
{
"epoch": 0.8151260504201681,
"grad_norm": 3.79357908294218,
"learning_rate": 6.4316742456105645e-06,
"loss": 0.2387,
"step": 485
},
{
"epoch": 0.8168067226890756,
"grad_norm": 3.769325505088469,
"learning_rate": 6.419021989920416e-06,
"loss": 0.3014,
"step": 486
},
{
"epoch": 0.8184873949579832,
"grad_norm": 3.361574274750808,
"learning_rate": 6.406359844266607e-06,
"loss": 0.2333,
"step": 487
},
{
"epoch": 0.8201680672268907,
"grad_norm": 3.957312301077667,
"learning_rate": 6.393687896898759e-06,
"loss": 0.2738,
"step": 488
},
{
"epoch": 0.8218487394957983,
"grad_norm": 3.6962832913581245,
"learning_rate": 6.381006236134817e-06,
"loss": 0.202,
"step": 489
},
{
"epoch": 0.8235294117647058,
"grad_norm": 4.3843424567306695,
"learning_rate": 6.368314950360416e-06,
"loss": 0.3356,
"step": 490
},
{
"epoch": 0.8252100840336134,
"grad_norm": 3.988796144368344,
"learning_rate": 6.355614128028277e-06,
"loss": 0.2188,
"step": 491
},
{
"epoch": 0.826890756302521,
"grad_norm": 3.546665934297024,
"learning_rate": 6.342903857657585e-06,
"loss": 0.3062,
"step": 492
},
{
"epoch": 0.8285714285714286,
"grad_norm": 3.464405372369805,
"learning_rate": 6.330184227833376e-06,
"loss": 0.302,
"step": 493
},
{
"epoch": 0.8302521008403362,
"grad_norm": 3.2217174248829505,
"learning_rate": 6.317455327205916e-06,
"loss": 0.1587,
"step": 494
},
{
"epoch": 0.8319327731092437,
"grad_norm": 3.101235516793264,
"learning_rate": 6.304717244490084e-06,
"loss": 0.1999,
"step": 495
},
{
"epoch": 0.8336134453781513,
"grad_norm": 3.5091112695722333,
"learning_rate": 6.291970068464755e-06,
"loss": 0.2298,
"step": 496
},
{
"epoch": 0.8352941176470589,
"grad_norm": 3.327856429000738,
"learning_rate": 6.279213887972179e-06,
"loss": 0.2582,
"step": 497
},
{
"epoch": 0.8369747899159664,
"grad_norm": 3.62144154961139,
"learning_rate": 6.266448791917364e-06,
"loss": 0.3269,
"step": 498
},
{
"epoch": 0.838655462184874,
"grad_norm": 2.953403477082121,
"learning_rate": 6.253674869267457e-06,
"loss": 0.1917,
"step": 499
},
{
"epoch": 0.8403361344537815,
"grad_norm": 3.011235509544136,
"learning_rate": 6.24089220905112e-06,
"loss": 0.2309,
"step": 500
},
{
"epoch": 0.8403361344537815,
"eval_loss": 0.24230201542377472,
"eval_runtime": 7.6191,
"eval_samples_per_second": 6.431,
"eval_steps_per_second": 1.706,
"step": 500
},
{
"epoch": 0.8420168067226891,
"grad_norm": 3.64049833442948,
"learning_rate": 6.228100900357914e-06,
"loss": 0.277,
"step": 501
},
{
"epoch": 0.8436974789915966,
"grad_norm": 3.360168756218691,
"learning_rate": 6.215301032337674e-06,
"loss": 0.1947,
"step": 502
},
{
"epoch": 0.8453781512605042,
"grad_norm": 3.0764894114372976,
"learning_rate": 6.202492694199893e-06,
"loss": 0.2248,
"step": 503
},
{
"epoch": 0.8470588235294118,
"grad_norm": 3.425507539967475,
"learning_rate": 6.189675975213094e-06,
"loss": 0.211,
"step": 504
},
{
"epoch": 0.8487394957983193,
"grad_norm": 3.537768398824082,
"learning_rate": 6.176850964704213e-06,
"loss": 0.2887,
"step": 505
},
{
"epoch": 0.8504201680672269,
"grad_norm": 3.3295569480928515,
"learning_rate": 6.164017752057972e-06,
"loss": 0.2512,
"step": 506
},
{
"epoch": 0.8521008403361344,
"grad_norm": 2.8205057844309795,
"learning_rate": 6.151176426716261e-06,
"loss": 0.2132,
"step": 507
},
{
"epoch": 0.853781512605042,
"grad_norm": 3.313681962254624,
"learning_rate": 6.13832707817751e-06,
"loss": 0.2558,
"step": 508
},
{
"epoch": 0.8554621848739495,
"grad_norm": 2.8394597548886447,
"learning_rate": 6.125469795996065e-06,
"loss": 0.1857,
"step": 509
},
{
"epoch": 0.8571428571428571,
"grad_norm": 3.619219896462597,
"learning_rate": 6.112604669781572e-06,
"loss": 0.3292,
"step": 510
},
{
"epoch": 0.8588235294117647,
"grad_norm": 3.2255068003001677,
"learning_rate": 6.099731789198344e-06,
"loss": 0.1973,
"step": 511
},
{
"epoch": 0.8605042016806723,
"grad_norm": 3.226780905842978,
"learning_rate": 6.0868512439647345e-06,
"loss": 0.2322,
"step": 512
},
{
"epoch": 0.8621848739495799,
"grad_norm": 3.0735886445268297,
"learning_rate": 6.073963123852522e-06,
"loss": 0.2253,
"step": 513
},
{
"epoch": 0.8638655462184874,
"grad_norm": 3.2508167941737205,
"learning_rate": 6.061067518686277e-06,
"loss": 0.2204,
"step": 514
},
{
"epoch": 0.865546218487395,
"grad_norm": 3.48647312788438,
"learning_rate": 6.048164518342734e-06,
"loss": 0.2611,
"step": 515
},
{
"epoch": 0.8672268907563025,
"grad_norm": 3.9021723359330163,
"learning_rate": 6.035254212750172e-06,
"loss": 0.2766,
"step": 516
},
{
"epoch": 0.8689075630252101,
"grad_norm": 3.848856259653677,
"learning_rate": 6.022336691887785e-06,
"loss": 0.2467,
"step": 517
},
{
"epoch": 0.8705882352941177,
"grad_norm": 3.2427477527780995,
"learning_rate": 6.009412045785051e-06,
"loss": 0.2417,
"step": 518
},
{
"epoch": 0.8722689075630252,
"grad_norm": 3.186572269145269,
"learning_rate": 5.996480364521114e-06,
"loss": 0.2362,
"step": 519
},
{
"epoch": 0.8739495798319328,
"grad_norm": 3.4881108893153194,
"learning_rate": 5.983541738224141e-06,
"loss": 0.2492,
"step": 520
},
{
"epoch": 0.8756302521008403,
"grad_norm": 3.451309092320012,
"learning_rate": 5.970596257070711e-06,
"loss": 0.2459,
"step": 521
},
{
"epoch": 0.8773109243697479,
"grad_norm": 3.036808532165684,
"learning_rate": 5.957644011285173e-06,
"loss": 0.2303,
"step": 522
},
{
"epoch": 0.8789915966386554,
"grad_norm": 4.380816137152236,
"learning_rate": 5.944685091139026e-06,
"loss": 0.2671,
"step": 523
},
{
"epoch": 0.880672268907563,
"grad_norm": 3.5404271040433892,
"learning_rate": 5.931719586950286e-06,
"loss": 0.2766,
"step": 524
},
{
"epoch": 0.8823529411764706,
"grad_norm": 3.413794531222058,
"learning_rate": 5.918747589082853e-06,
"loss": 0.2001,
"step": 525
},
{
"epoch": 0.8840336134453781,
"grad_norm": 3.1862842821300834,
"learning_rate": 5.905769187945889e-06,
"loss": 0.2606,
"step": 526
},
{
"epoch": 0.8857142857142857,
"grad_norm": 3.0322653693123347,
"learning_rate": 5.892784473993184e-06,
"loss": 0.183,
"step": 527
},
{
"epoch": 0.8873949579831932,
"grad_norm": 3.160034382975318,
"learning_rate": 5.879793537722525e-06,
"loss": 0.2651,
"step": 528
},
{
"epoch": 0.8890756302521008,
"grad_norm": 3.8858225696631323,
"learning_rate": 5.8667964696750625e-06,
"loss": 0.2103,
"step": 529
},
{
"epoch": 0.8907563025210085,
"grad_norm": 4.046308603443172,
"learning_rate": 5.853793360434687e-06,
"loss": 0.2899,
"step": 530
},
{
"epoch": 0.892436974789916,
"grad_norm": 3.490280050594676,
"learning_rate": 5.840784300627396e-06,
"loss": 0.2757,
"step": 531
},
{
"epoch": 0.8941176470588236,
"grad_norm": 3.4886595872843698,
"learning_rate": 5.82776938092065e-06,
"loss": 0.2226,
"step": 532
},
{
"epoch": 0.8957983193277311,
"grad_norm": 3.459447687093064,
"learning_rate": 5.814748692022761e-06,
"loss": 0.2096,
"step": 533
},
{
"epoch": 0.8974789915966387,
"grad_norm": 3.4503094466900843,
"learning_rate": 5.801722324682243e-06,
"loss": 0.2201,
"step": 534
},
{
"epoch": 0.8991596638655462,
"grad_norm": 6.057225321208548,
"learning_rate": 5.788690369687188e-06,
"loss": 0.2534,
"step": 535
},
{
"epoch": 0.9008403361344538,
"grad_norm": 3.993936335248651,
"learning_rate": 5.775652917864633e-06,
"loss": 0.2468,
"step": 536
},
{
"epoch": 0.9025210084033614,
"grad_norm": 3.8240034384038957,
"learning_rate": 5.762610060079926e-06,
"loss": 0.2468,
"step": 537
},
{
"epoch": 0.9042016806722689,
"grad_norm": 3.506076848751469,
"learning_rate": 5.749561887236088e-06,
"loss": 0.2564,
"step": 538
},
{
"epoch": 0.9058823529411765,
"grad_norm": 3.4486725300170327,
"learning_rate": 5.736508490273189e-06,
"loss": 0.2775,
"step": 539
},
{
"epoch": 0.907563025210084,
"grad_norm": 3.6027114982004584,
"learning_rate": 5.723449960167703e-06,
"loss": 0.2172,
"step": 540
},
{
"epoch": 0.9092436974789916,
"grad_norm": 3.922464109691482,
"learning_rate": 5.710386387931886e-06,
"loss": 0.2995,
"step": 541
},
{
"epoch": 0.9109243697478991,
"grad_norm": 4.202065560048001,
"learning_rate": 5.697317864613127e-06,
"loss": 0.3745,
"step": 542
},
{
"epoch": 0.9126050420168067,
"grad_norm": 4.135603360147499,
"learning_rate": 5.684244481293335e-06,
"loss": 0.3435,
"step": 543
},
{
"epoch": 0.9142857142857143,
"grad_norm": 3.4093772473414066,
"learning_rate": 5.671166329088278e-06,
"loss": 0.2263,
"step": 544
},
{
"epoch": 0.9159663865546218,
"grad_norm": 3.5010536567581023,
"learning_rate": 5.658083499146968e-06,
"loss": 0.2573,
"step": 545
},
{
"epoch": 0.9176470588235294,
"grad_norm": 3.6193713170190582,
"learning_rate": 5.644996082651018e-06,
"loss": 0.2463,
"step": 546
},
{
"epoch": 0.9193277310924369,
"grad_norm": 3.4389315001827905,
"learning_rate": 5.6319041708140045e-06,
"loss": 0.1654,
"step": 547
},
{
"epoch": 0.9210084033613445,
"grad_norm": 3.269477559777336,
"learning_rate": 5.6188078548808366e-06,
"loss": 0.2372,
"step": 548
},
{
"epoch": 0.9226890756302522,
"grad_norm": 2.949744609372753,
"learning_rate": 5.6057072261271194e-06,
"loss": 0.1651,
"step": 549
},
{
"epoch": 0.9243697478991597,
"grad_norm": 3.284532781268528,
"learning_rate": 5.592602375858515e-06,
"loss": 0.3667,
"step": 550
},
{
"epoch": 0.9260504201680673,
"grad_norm": 3.033910900385078,
"learning_rate": 5.579493395410105e-06,
"loss": 0.2187,
"step": 551
},
{
"epoch": 0.9277310924369748,
"grad_norm": 3.8363812188814275,
"learning_rate": 5.566380376145762e-06,
"loss": 0.2711,
"step": 552
},
{
"epoch": 0.9294117647058824,
"grad_norm": 3.206650579866936,
"learning_rate": 5.553263409457504e-06,
"loss": 0.1997,
"step": 553
},
{
"epoch": 0.9310924369747899,
"grad_norm": 3.935436124623862,
"learning_rate": 5.540142586764862e-06,
"loss": 0.2661,
"step": 554
},
{
"epoch": 0.9327731092436975,
"grad_norm": 3.6180857295008395,
"learning_rate": 5.527017999514239e-06,
"loss": 0.2474,
"step": 555
},
{
"epoch": 0.934453781512605,
"grad_norm": 3.459755458614064,
"learning_rate": 5.51388973917828e-06,
"loss": 0.2821,
"step": 556
},
{
"epoch": 0.9361344537815126,
"grad_norm": 3.5773382583048527,
"learning_rate": 5.5007578972552246e-06,
"loss": 0.2927,
"step": 557
},
{
"epoch": 0.9378151260504202,
"grad_norm": 4.6363961630236785,
"learning_rate": 5.4876225652682776e-06,
"loss": 0.3551,
"step": 558
},
{
"epoch": 0.9394957983193277,
"grad_norm": 3.379098842911059,
"learning_rate": 5.474483834764968e-06,
"loss": 0.2059,
"step": 559
},
{
"epoch": 0.9411764705882353,
"grad_norm": 3.9316584340639245,
"learning_rate": 5.46134179731651e-06,
"loss": 0.2953,
"step": 560
},
{
"epoch": 0.9428571428571428,
"grad_norm": 3.3722167978017867,
"learning_rate": 5.448196544517168e-06,
"loss": 0.2451,
"step": 561
},
{
"epoch": 0.9445378151260504,
"grad_norm": 3.1185440228092887,
"learning_rate": 5.435048167983613e-06,
"loss": 0.2173,
"step": 562
},
{
"epoch": 0.946218487394958,
"grad_norm": 3.9485848008617523,
"learning_rate": 5.421896759354288e-06,
"loss": 0.2261,
"step": 563
},
{
"epoch": 0.9478991596638655,
"grad_norm": 2.9170922035357094,
"learning_rate": 5.408742410288769e-06,
"loss": 0.2287,
"step": 564
},
{
"epoch": 0.9495798319327731,
"grad_norm": 3.9244596118525674,
"learning_rate": 5.395585212467124e-06,
"loss": 0.2734,
"step": 565
},
{
"epoch": 0.9512605042016806,
"grad_norm": 4.603168201003037,
"learning_rate": 5.382425257589277e-06,
"loss": 0.3128,
"step": 566
},
{
"epoch": 0.9529411764705882,
"grad_norm": 3.818573488204182,
"learning_rate": 5.36926263737437e-06,
"loss": 0.3068,
"step": 567
},
{
"epoch": 0.9546218487394958,
"grad_norm": 2.467008222654659,
"learning_rate": 5.356097443560116e-06,
"loss": 0.2085,
"step": 568
},
{
"epoch": 0.9563025210084034,
"grad_norm": 3.8596036485613427,
"learning_rate": 5.342929767902168e-06,
"loss": 0.2432,
"step": 569
},
{
"epoch": 0.957983193277311,
"grad_norm": 4.001034423912602,
"learning_rate": 5.329759702173477e-06,
"loss": 0.2557,
"step": 570
},
{
"epoch": 0.9596638655462185,
"grad_norm": 3.4761405695641145,
"learning_rate": 5.316587338163649e-06,
"loss": 0.2165,
"step": 571
},
{
"epoch": 0.9613445378151261,
"grad_norm": 3.1581349516312316,
"learning_rate": 5.30341276767831e-06,
"loss": 0.2046,
"step": 572
},
{
"epoch": 0.9630252100840336,
"grad_norm": 3.717769612339884,
"learning_rate": 5.290236082538464e-06,
"loss": 0.2878,
"step": 573
},
{
"epoch": 0.9647058823529412,
"grad_norm": 3.8206293915711544,
"learning_rate": 5.27705737457985e-06,
"loss": 0.284,
"step": 574
},
{
"epoch": 0.9663865546218487,
"grad_norm": 3.5987474263840293,
"learning_rate": 5.2638767356523125e-06,
"loss": 0.1862,
"step": 575
},
{
"epoch": 0.9680672268907563,
"grad_norm": 2.628708627577784,
"learning_rate": 5.2506942576191466e-06,
"loss": 0.1986,
"step": 576
},
{
"epoch": 0.9697478991596639,
"grad_norm": 3.00948049798796,
"learning_rate": 5.23751003235647e-06,
"loss": 0.2272,
"step": 577
},
{
"epoch": 0.9714285714285714,
"grad_norm": 3.897035347258241,
"learning_rate": 5.224324151752575e-06,
"loss": 0.2377,
"step": 578
},
{
"epoch": 0.973109243697479,
"grad_norm": 3.1171554333300646,
"learning_rate": 5.211136707707293e-06,
"loss": 0.2222,
"step": 579
},
{
"epoch": 0.9747899159663865,
"grad_norm": 3.0039037163365983,
"learning_rate": 5.197947792131348e-06,
"loss": 0.1865,
"step": 580
},
{
"epoch": 0.9764705882352941,
"grad_norm": 6.257110222572669,
"learning_rate": 5.184757496945726e-06,
"loss": 0.2839,
"step": 581
},
{
"epoch": 0.9781512605042016,
"grad_norm": 3.821218218364182,
"learning_rate": 5.1715659140810225e-06,
"loss": 0.2529,
"step": 582
},
{
"epoch": 0.9798319327731092,
"grad_norm": 3.730030674136909,
"learning_rate": 5.158373135476811e-06,
"loss": 0.2524,
"step": 583
},
{
"epoch": 0.9815126050420168,
"grad_norm": 2.840325890140036,
"learning_rate": 5.145179253080997e-06,
"loss": 0.2176,
"step": 584
},
{
"epoch": 0.9831932773109243,
"grad_norm": 3.6607002570737817,
"learning_rate": 5.131984358849182e-06,
"loss": 0.2658,
"step": 585
},
{
"epoch": 0.984873949579832,
"grad_norm": 3.1743768713711322,
"learning_rate": 5.118788544744016e-06,
"loss": 0.2591,
"step": 586
},
{
"epoch": 0.9865546218487395,
"grad_norm": 3.2338897708714507,
"learning_rate": 5.105591902734561e-06,
"loss": 0.1775,
"step": 587
},
{
"epoch": 0.9882352941176471,
"grad_norm": 4.517369770013792,
"learning_rate": 5.09239452479565e-06,
"loss": 0.2587,
"step": 588
},
{
"epoch": 0.9899159663865547,
"grad_norm": 2.6496184847532733,
"learning_rate": 5.079196502907246e-06,
"loss": 0.1447,
"step": 589
},
{
"epoch": 0.9915966386554622,
"grad_norm": 2.7211633275819525,
"learning_rate": 5.065997929053795e-06,
"loss": 0.1926,
"step": 590
},
{
"epoch": 0.9932773109243698,
"grad_norm": 3.0952400745185,
"learning_rate": 5.052798895223597e-06,
"loss": 0.2134,
"step": 591
},
{
"epoch": 0.9949579831932773,
"grad_norm": 3.51837877201114,
"learning_rate": 5.039599493408154e-06,
"loss": 0.2745,
"step": 592
},
{
"epoch": 0.9966386554621849,
"grad_norm": 3.4866227884563026,
"learning_rate": 5.026399815601533e-06,
"loss": 0.2295,
"step": 593
},
{
"epoch": 0.9983193277310924,
"grad_norm": 3.3229672421715644,
"learning_rate": 5.0131999537997235e-06,
"loss": 0.1775,
"step": 594
},
{
"epoch": 1.0,
"grad_norm": 3.6363741518245245,
"learning_rate": 5e-06,
"loss": 0.1833,
"step": 595
},
{
"epoch": 1.0016806722689076,
"grad_norm": 3.0685430970214216,
"learning_rate": 4.986800046200278e-06,
"loss": 0.1295,
"step": 596
},
{
"epoch": 1.0033613445378151,
"grad_norm": 2.642341004416497,
"learning_rate": 4.97360018439847e-06,
"loss": 0.0994,
"step": 597
},
{
"epoch": 1.0050420168067227,
"grad_norm": 2.323376314615566,
"learning_rate": 4.960400506591848e-06,
"loss": 0.075,
"step": 598
},
{
"epoch": 1.0067226890756302,
"grad_norm": 2.1934167410322574,
"learning_rate": 4.947201104776404e-06,
"loss": 0.0863,
"step": 599
},
{
"epoch": 1.0084033613445378,
"grad_norm": 2.779595805074063,
"learning_rate": 4.934002070946206e-06,
"loss": 0.09,
"step": 600
},
{
"epoch": 1.0100840336134453,
"grad_norm": 2.642478794924014,
"learning_rate": 4.920803497092757e-06,
"loss": 0.1072,
"step": 601
},
{
"epoch": 1.011764705882353,
"grad_norm": 2.8206325875789786,
"learning_rate": 4.907605475204352e-06,
"loss": 0.1135,
"step": 602
},
{
"epoch": 1.0134453781512605,
"grad_norm": 2.601945372181,
"learning_rate": 4.894408097265441e-06,
"loss": 0.1026,
"step": 603
},
{
"epoch": 1.015126050420168,
"grad_norm": 2.651228941391384,
"learning_rate": 4.881211455255986e-06,
"loss": 0.0731,
"step": 604
},
{
"epoch": 1.0168067226890756,
"grad_norm": 2.3414792685919994,
"learning_rate": 4.86801564115082e-06,
"loss": 0.0886,
"step": 605
},
{
"epoch": 1.0184873949579831,
"grad_norm": 2.7273995366520243,
"learning_rate": 4.854820746919005e-06,
"loss": 0.0911,
"step": 606
},
{
"epoch": 1.0201680672268907,
"grad_norm": 2.8327315486397113,
"learning_rate": 4.8416268645231915e-06,
"loss": 0.1086,
"step": 607
},
{
"epoch": 1.0218487394957982,
"grad_norm": 2.7069106323936256,
"learning_rate": 4.82843408591898e-06,
"loss": 0.097,
"step": 608
},
{
"epoch": 1.0235294117647058,
"grad_norm": 2.442047706770237,
"learning_rate": 4.815242503054277e-06,
"loss": 0.0607,
"step": 609
},
{
"epoch": 1.0252100840336134,
"grad_norm": 3.033523413937561,
"learning_rate": 4.802052207868654e-06,
"loss": 0.1112,
"step": 610
},
{
"epoch": 1.026890756302521,
"grad_norm": 3.820262837255327,
"learning_rate": 4.78886329229271e-06,
"loss": 0.137,
"step": 611
},
{
"epoch": 1.0285714285714285,
"grad_norm": 2.733397229688364,
"learning_rate": 4.775675848247427e-06,
"loss": 0.0923,
"step": 612
},
{
"epoch": 1.030252100840336,
"grad_norm": 3.4691034150038034,
"learning_rate": 4.762489967643532e-06,
"loss": 0.118,
"step": 613
},
{
"epoch": 1.0319327731092436,
"grad_norm": 3.854013776962153,
"learning_rate": 4.749305742380853e-06,
"loss": 0.1301,
"step": 614
},
{
"epoch": 1.0336134453781514,
"grad_norm": 3.0184243827182833,
"learning_rate": 4.736123264347688e-06,
"loss": 0.0853,
"step": 615
},
{
"epoch": 1.035294117647059,
"grad_norm": 3.7517471305566517,
"learning_rate": 4.7229426254201504e-06,
"loss": 0.1193,
"step": 616
},
{
"epoch": 1.0369747899159665,
"grad_norm": 3.3080181370358566,
"learning_rate": 4.709763917461537e-06,
"loss": 0.1154,
"step": 617
},
{
"epoch": 1.038655462184874,
"grad_norm": 2.2629000826778856,
"learning_rate": 4.696587232321691e-06,
"loss": 0.0932,
"step": 618
},
{
"epoch": 1.0403361344537816,
"grad_norm": 3.1860739367441737,
"learning_rate": 4.683412661836351e-06,
"loss": 0.1039,
"step": 619
},
{
"epoch": 1.0420168067226891,
"grad_norm": 3.0601501008008256,
"learning_rate": 4.6702402978265235e-06,
"loss": 0.089,
"step": 620
},
{
"epoch": 1.0436974789915967,
"grad_norm": 2.603230804321902,
"learning_rate": 4.657070232097832e-06,
"loss": 0.0751,
"step": 621
},
{
"epoch": 1.0453781512605043,
"grad_norm": 2.8411688842957465,
"learning_rate": 4.643902556439885e-06,
"loss": 0.0817,
"step": 622
},
{
"epoch": 1.0470588235294118,
"grad_norm": 2.7986713354700763,
"learning_rate": 4.630737362625631e-06,
"loss": 0.1058,
"step": 623
},
{
"epoch": 1.0487394957983194,
"grad_norm": 3.5321634211727333,
"learning_rate": 4.6175747424107234e-06,
"loss": 0.1169,
"step": 624
},
{
"epoch": 1.050420168067227,
"grad_norm": 2.9302195577434462,
"learning_rate": 4.604414787532877e-06,
"loss": 0.1098,
"step": 625
},
{
"epoch": 1.0521008403361345,
"grad_norm": 2.2686346359628344,
"learning_rate": 4.591257589711233e-06,
"loss": 0.078,
"step": 626
},
{
"epoch": 1.053781512605042,
"grad_norm": 3.2727476625085266,
"learning_rate": 4.578103240645714e-06,
"loss": 0.1095,
"step": 627
},
{
"epoch": 1.0554621848739496,
"grad_norm": 2.341371455709515,
"learning_rate": 4.5649518320163885e-06,
"loss": 0.1126,
"step": 628
},
{
"epoch": 1.0571428571428572,
"grad_norm": 2.738651087640409,
"learning_rate": 4.551803455482833e-06,
"loss": 0.086,
"step": 629
},
{
"epoch": 1.0588235294117647,
"grad_norm": 2.3041964936668555,
"learning_rate": 4.53865820268349e-06,
"loss": 0.0822,
"step": 630
},
{
"epoch": 1.0605042016806723,
"grad_norm": 4.220651913549916,
"learning_rate": 4.525516165235034e-06,
"loss": 0.1141,
"step": 631
},
{
"epoch": 1.0621848739495798,
"grad_norm": 2.9322019645529904,
"learning_rate": 4.512377434731724e-06,
"loss": 0.1477,
"step": 632
},
{
"epoch": 1.0638655462184874,
"grad_norm": 2.5913843452442378,
"learning_rate": 4.499242102744777e-06,
"loss": 0.0929,
"step": 633
},
{
"epoch": 1.065546218487395,
"grad_norm": 2.445571807332056,
"learning_rate": 4.486110260821722e-06,
"loss": 0.0978,
"step": 634
},
{
"epoch": 1.0672268907563025,
"grad_norm": 2.5494116675856366,
"learning_rate": 4.4729820004857625e-06,
"loss": 0.1012,
"step": 635
},
{
"epoch": 1.06890756302521,
"grad_norm": 2.1101261139846783,
"learning_rate": 4.45985741323514e-06,
"loss": 0.0715,
"step": 636
},
{
"epoch": 1.0705882352941176,
"grad_norm": 1.9510646778084884,
"learning_rate": 4.446736590542497e-06,
"loss": 0.0549,
"step": 637
},
{
"epoch": 1.0722689075630252,
"grad_norm": 3.88179760628692,
"learning_rate": 4.4336196238542394e-06,
"loss": 0.1449,
"step": 638
},
{
"epoch": 1.0739495798319327,
"grad_norm": 2.541597748187505,
"learning_rate": 4.420506604589897e-06,
"loss": 0.0913,
"step": 639
},
{
"epoch": 1.0756302521008403,
"grad_norm": 3.519215369238923,
"learning_rate": 4.407397624141487e-06,
"loss": 0.1025,
"step": 640
},
{
"epoch": 1.0773109243697478,
"grad_norm": 2.123778059115166,
"learning_rate": 4.394292773872882e-06,
"loss": 0.0714,
"step": 641
},
{
"epoch": 1.0789915966386554,
"grad_norm": 5.1179373437116285,
"learning_rate": 4.381192145119165e-06,
"loss": 0.1282,
"step": 642
},
{
"epoch": 1.080672268907563,
"grad_norm": 2.8684696011439756,
"learning_rate": 4.368095829185999e-06,
"loss": 0.0791,
"step": 643
},
{
"epoch": 1.0823529411764705,
"grad_norm": 4.102832197816667,
"learning_rate": 4.355003917348985e-06,
"loss": 0.1225,
"step": 644
},
{
"epoch": 1.084033613445378,
"grad_norm": 3.6293736374607013,
"learning_rate": 4.341916500853034e-06,
"loss": 0.1105,
"step": 645
},
{
"epoch": 1.0857142857142856,
"grad_norm": 2.9599263387812873,
"learning_rate": 4.3288336709117246e-06,
"loss": 0.0765,
"step": 646
},
{
"epoch": 1.0873949579831932,
"grad_norm": 2.959898151373633,
"learning_rate": 4.315755518706667e-06,
"loss": 0.1221,
"step": 647
},
{
"epoch": 1.0890756302521007,
"grad_norm": 2.831223871678987,
"learning_rate": 4.302682135386873e-06,
"loss": 0.1047,
"step": 648
},
{
"epoch": 1.0907563025210083,
"grad_norm": 3.883463233494954,
"learning_rate": 4.289613612068118e-06,
"loss": 0.0778,
"step": 649
},
{
"epoch": 1.092436974789916,
"grad_norm": 3.3740664710443315,
"learning_rate": 4.276550039832299e-06,
"loss": 0.1244,
"step": 650
},
{
"epoch": 1.0941176470588236,
"grad_norm": 2.945414543907692,
"learning_rate": 4.263491509726812e-06,
"loss": 0.1062,
"step": 651
},
{
"epoch": 1.0957983193277312,
"grad_norm": 3.5755203026507107,
"learning_rate": 4.250438112763911e-06,
"loss": 0.0765,
"step": 652
},
{
"epoch": 1.0974789915966388,
"grad_norm": 2.864562468571205,
"learning_rate": 4.237389939920075e-06,
"loss": 0.0973,
"step": 653
},
{
"epoch": 1.0991596638655463,
"grad_norm": 3.309117703284289,
"learning_rate": 4.224347082135367e-06,
"loss": 0.1071,
"step": 654
},
{
"epoch": 1.1008403361344539,
"grad_norm": 4.444191677418515,
"learning_rate": 4.211309630312813e-06,
"loss": 0.1255,
"step": 655
},
{
"epoch": 1.1025210084033614,
"grad_norm": 2.6049652407237525,
"learning_rate": 4.198277675317758e-06,
"loss": 0.0953,
"step": 656
},
{
"epoch": 1.104201680672269,
"grad_norm": 2.9301571986545984,
"learning_rate": 4.185251307977241e-06,
"loss": 0.1391,
"step": 657
},
{
"epoch": 1.1058823529411765,
"grad_norm": 2.991359373447863,
"learning_rate": 4.17223061907935e-06,
"loss": 0.1011,
"step": 658
},
{
"epoch": 1.107563025210084,
"grad_norm": 2.908536947858802,
"learning_rate": 4.159215699372605e-06,
"loss": 0.0994,
"step": 659
},
{
"epoch": 1.1092436974789917,
"grad_norm": 3.0562253978122,
"learning_rate": 4.146206639565313e-06,
"loss": 0.0761,
"step": 660
},
{
"epoch": 1.1109243697478992,
"grad_norm": 2.3707071277066865,
"learning_rate": 4.133203530324938e-06,
"loss": 0.0791,
"step": 661
},
{
"epoch": 1.1126050420168068,
"grad_norm": 2.568827912231571,
"learning_rate": 4.120206462277478e-06,
"loss": 0.1079,
"step": 662
},
{
"epoch": 1.1142857142857143,
"grad_norm": 3.3675762016216346,
"learning_rate": 4.107215526006818e-06,
"loss": 0.1004,
"step": 663
},
{
"epoch": 1.1159663865546219,
"grad_norm": 2.711315362662935,
"learning_rate": 4.094230812054113e-06,
"loss": 0.087,
"step": 664
},
{
"epoch": 1.1176470588235294,
"grad_norm": 2.4363571585356296,
"learning_rate": 4.081252410917148e-06,
"loss": 0.0993,
"step": 665
},
{
"epoch": 1.119327731092437,
"grad_norm": 2.2853255955328327,
"learning_rate": 4.068280413049715e-06,
"loss": 0.0951,
"step": 666
},
{
"epoch": 1.1210084033613446,
"grad_norm": 3.1173097538332257,
"learning_rate": 4.0553149088609745e-06,
"loss": 0.1207,
"step": 667
},
{
"epoch": 1.122689075630252,
"grad_norm": 2.3961886874666645,
"learning_rate": 4.042355988714828e-06,
"loss": 0.0984,
"step": 668
},
{
"epoch": 1.1243697478991597,
"grad_norm": 2.6441840658169196,
"learning_rate": 4.02940374292929e-06,
"loss": 0.1055,
"step": 669
},
{
"epoch": 1.1260504201680672,
"grad_norm": 2.589296721112373,
"learning_rate": 4.01645826177586e-06,
"loss": 0.091,
"step": 670
},
{
"epoch": 1.1277310924369748,
"grad_norm": 3.101644326303377,
"learning_rate": 4.003519635478889e-06,
"loss": 0.1136,
"step": 671
},
{
"epoch": 1.1294117647058823,
"grad_norm": 2.085544584261255,
"learning_rate": 3.99058795421495e-06,
"loss": 0.0645,
"step": 672
},
{
"epoch": 1.13109243697479,
"grad_norm": 3.022884264669196,
"learning_rate": 3.977663308112216e-06,
"loss": 0.0982,
"step": 673
},
{
"epoch": 1.1327731092436975,
"grad_norm": 3.0918298593703177,
"learning_rate": 3.9647457872498295e-06,
"loss": 0.1309,
"step": 674
},
{
"epoch": 1.134453781512605,
"grad_norm": 3.097444725372621,
"learning_rate": 3.951835481657268e-06,
"loss": 0.0821,
"step": 675
},
{
"epoch": 1.1361344537815126,
"grad_norm": 3.480720174843856,
"learning_rate": 3.938932481313725e-06,
"loss": 0.1083,
"step": 676
},
{
"epoch": 1.1378151260504201,
"grad_norm": 3.167987925313736,
"learning_rate": 3.926036876147479e-06,
"loss": 0.0947,
"step": 677
},
{
"epoch": 1.1394957983193277,
"grad_norm": 2.9675142893422666,
"learning_rate": 3.913148756035266e-06,
"loss": 0.1074,
"step": 678
},
{
"epoch": 1.1411764705882352,
"grad_norm": 2.88904471583098,
"learning_rate": 3.9002682108016585e-06,
"loss": 0.0947,
"step": 679
},
{
"epoch": 1.1428571428571428,
"grad_norm": 2.3148945814778745,
"learning_rate": 3.887395330218429e-06,
"loss": 0.0543,
"step": 680
},
{
"epoch": 1.1445378151260504,
"grad_norm": 3.4421881750813714,
"learning_rate": 3.874530204003937e-06,
"loss": 0.1261,
"step": 681
},
{
"epoch": 1.146218487394958,
"grad_norm": 2.426782546704889,
"learning_rate": 3.861672921822493e-06,
"loss": 0.0701,
"step": 682
},
{
"epoch": 1.1478991596638655,
"grad_norm": 2.7112499258139673,
"learning_rate": 3.848823573283742e-06,
"loss": 0.0713,
"step": 683
},
{
"epoch": 1.149579831932773,
"grad_norm": 2.0739016767387057,
"learning_rate": 3.83598224794203e-06,
"loss": 0.0649,
"step": 684
},
{
"epoch": 1.1512605042016806,
"grad_norm": 4.360074547910969,
"learning_rate": 3.823149035295789e-06,
"loss": 0.1263,
"step": 685
},
{
"epoch": 1.1529411764705881,
"grad_norm": 3.8208016083874803,
"learning_rate": 3.8103240247869077e-06,
"loss": 0.1078,
"step": 686
},
{
"epoch": 1.1546218487394957,
"grad_norm": 3.136985722111122,
"learning_rate": 3.7975073058001092e-06,
"loss": 0.0801,
"step": 687
},
{
"epoch": 1.1563025210084033,
"grad_norm": 2.7226211242981035,
"learning_rate": 3.7846989676623265e-06,
"loss": 0.0937,
"step": 688
},
{
"epoch": 1.1579831932773108,
"grad_norm": 2.807183390633048,
"learning_rate": 3.7718990996420875e-06,
"loss": 0.0805,
"step": 689
},
{
"epoch": 1.1596638655462184,
"grad_norm": 2.7048742007757234,
"learning_rate": 3.7591077909488817e-06,
"loss": 0.0731,
"step": 690
},
{
"epoch": 1.1613445378151261,
"grad_norm": 2.4917292393625776,
"learning_rate": 3.7463251307325432e-06,
"loss": 0.0814,
"step": 691
},
{
"epoch": 1.1630252100840337,
"grad_norm": 2.863341194127507,
"learning_rate": 3.7335512080826363e-06,
"loss": 0.1151,
"step": 692
},
{
"epoch": 1.1647058823529413,
"grad_norm": 2.9183758099606245,
"learning_rate": 3.720786112027822e-06,
"loss": 0.0995,
"step": 693
},
{
"epoch": 1.1663865546218488,
"grad_norm": 2.5864607722539232,
"learning_rate": 3.708029931535246e-06,
"loss": 0.0796,
"step": 694
},
{
"epoch": 1.1680672268907564,
"grad_norm": 3.468324873932469,
"learning_rate": 3.695282755509917e-06,
"loss": 0.118,
"step": 695
},
{
"epoch": 1.169747899159664,
"grad_norm": 2.1915066606153846,
"learning_rate": 3.682544672794085e-06,
"loss": 0.0823,
"step": 696
},
{
"epoch": 1.1714285714285715,
"grad_norm": 2.8614797614523195,
"learning_rate": 3.669815772166625e-06,
"loss": 0.1117,
"step": 697
},
{
"epoch": 1.173109243697479,
"grad_norm": 3.0243317238966902,
"learning_rate": 3.6570961423424155e-06,
"loss": 0.1382,
"step": 698
},
{
"epoch": 1.1747899159663866,
"grad_norm": 2.5722750997684805,
"learning_rate": 3.644385871971725e-06,
"loss": 0.0901,
"step": 699
},
{
"epoch": 1.1764705882352942,
"grad_norm": 2.2211603476786945,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.0707,
"step": 700
},
{
"epoch": 1.1781512605042017,
"grad_norm": 2.3653294803726754,
"learning_rate": 3.618993763865185e-06,
"loss": 0.0865,
"step": 701
},
{
"epoch": 1.1798319327731093,
"grad_norm": 3.4518859809747746,
"learning_rate": 3.6063121031012417e-06,
"loss": 0.0889,
"step": 702
},
{
"epoch": 1.1815126050420168,
"grad_norm": 3.1717027674030236,
"learning_rate": 3.5936401557333957e-06,
"loss": 0.1078,
"step": 703
},
{
"epoch": 1.1831932773109244,
"grad_norm": 2.5802159610520667,
"learning_rate": 3.5809780100795853e-06,
"loss": 0.0883,
"step": 704
},
{
"epoch": 1.184873949579832,
"grad_norm": 2.51669538204389,
"learning_rate": 3.568325754389438e-06,
"loss": 0.1018,
"step": 705
},
{
"epoch": 1.1865546218487395,
"grad_norm": 3.834199640369919,
"learning_rate": 3.5556834768436498e-06,
"loss": 0.1229,
"step": 706
},
{
"epoch": 1.188235294117647,
"grad_norm": 3.268611445606652,
"learning_rate": 3.5430512655533774e-06,
"loss": 0.1422,
"step": 707
},
{
"epoch": 1.1899159663865546,
"grad_norm": 2.569460634045658,
"learning_rate": 3.5304292085596166e-06,
"loss": 0.0842,
"step": 708
},
{
"epoch": 1.1915966386554622,
"grad_norm": 2.708337321883567,
"learning_rate": 3.517817393832593e-06,
"loss": 0.1071,
"step": 709
},
{
"epoch": 1.1932773109243697,
"grad_norm": 2.295889437868396,
"learning_rate": 3.505215909271149e-06,
"loss": 0.0842,
"step": 710
},
{
"epoch": 1.1949579831932773,
"grad_norm": 2.7494736128846315,
"learning_rate": 3.4926248427021293e-06,
"loss": 0.1067,
"step": 711
},
{
"epoch": 1.1966386554621848,
"grad_norm": 3.037610849894837,
"learning_rate": 3.48004428187977e-06,
"loss": 0.0995,
"step": 712
},
{
"epoch": 1.1983193277310924,
"grad_norm": 3.1209681259023343,
"learning_rate": 3.4674743144850865e-06,
"loss": 0.1054,
"step": 713
},
{
"epoch": 1.2,
"grad_norm": 2.7146088777741353,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.0855,
"step": 714
},
{
"epoch": 1.2016806722689075,
"grad_norm": 2.579928115495318,
"learning_rate": 3.442366510333043e-06,
"loss": 0.1029,
"step": 715
},
{
"epoch": 1.203361344537815,
"grad_norm": 2.051251447279275,
"learning_rate": 3.4298288485661147e-06,
"loss": 0.0724,
"step": 716
},
{
"epoch": 1.2050420168067226,
"grad_norm": 2.25884749164061,
"learning_rate": 3.417302130206507e-06,
"loss": 0.0998,
"step": 717
},
{
"epoch": 1.2067226890756302,
"grad_norm": 2.5203879016401047,
"learning_rate": 3.404786442559976e-06,
"loss": 0.0804,
"step": 718
},
{
"epoch": 1.2084033613445377,
"grad_norm": 2.323136581363694,
"learning_rate": 3.3922818728554008e-06,
"loss": 0.0889,
"step": 719
},
{
"epoch": 1.2100840336134453,
"grad_norm": 2.2224533138933404,
"learning_rate": 3.3797885082441717e-06,
"loss": 0.0809,
"step": 720
},
{
"epoch": 1.2117647058823529,
"grad_norm": 3.4942626722992585,
"learning_rate": 3.3673064357995844e-06,
"loss": 0.1375,
"step": 721
},
{
"epoch": 1.2134453781512604,
"grad_norm": 2.7421302991103182,
"learning_rate": 3.3548357425162347e-06,
"loss": 0.1026,
"step": 722
},
{
"epoch": 1.2151260504201682,
"grad_norm": 2.405476921093656,
"learning_rate": 3.3423765153094097e-06,
"loss": 0.0875,
"step": 723
},
{
"epoch": 1.2168067226890757,
"grad_norm": 1.9474423499915685,
"learning_rate": 3.3299288410144813e-06,
"loss": 0.0645,
"step": 724
},
{
"epoch": 1.2184873949579833,
"grad_norm": 3.3943630595361127,
"learning_rate": 3.3174928063863054e-06,
"loss": 0.0822,
"step": 725
},
{
"epoch": 1.2201680672268909,
"grad_norm": 2.6533504550178177,
"learning_rate": 3.3050684980986105e-06,
"loss": 0.0727,
"step": 726
},
{
"epoch": 1.2218487394957984,
"grad_norm": 2.6778789026347813,
"learning_rate": 3.292656002743405e-06,
"loss": 0.0752,
"step": 727
},
{
"epoch": 1.223529411764706,
"grad_norm": 1.9406512061812329,
"learning_rate": 3.2802554068303595e-06,
"loss": 0.0715,
"step": 728
},
{
"epoch": 1.2252100840336135,
"grad_norm": 2.356227596188501,
"learning_rate": 3.267866796786212e-06,
"loss": 0.0677,
"step": 729
},
{
"epoch": 1.226890756302521,
"grad_norm": 1.937645206088155,
"learning_rate": 3.255490258954167e-06,
"loss": 0.0371,
"step": 730
},
{
"epoch": 1.2285714285714286,
"grad_norm": 3.012214547683442,
"learning_rate": 3.2431258795932863e-06,
"loss": 0.1026,
"step": 731
},
{
"epoch": 1.2302521008403362,
"grad_norm": 3.2105396902791328,
"learning_rate": 3.2307737448778977e-06,
"loss": 0.0909,
"step": 732
},
{
"epoch": 1.2319327731092438,
"grad_norm": 2.599649894232782,
"learning_rate": 3.2184339408969857e-06,
"loss": 0.0682,
"step": 733
},
{
"epoch": 1.2336134453781513,
"grad_norm": 1.761237551093203,
"learning_rate": 3.206106553653596e-06,
"loss": 0.0482,
"step": 734
},
{
"epoch": 1.2352941176470589,
"grad_norm": 3.0540661926673494,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.074,
"step": 735
},
{
"epoch": 1.2369747899159664,
"grad_norm": 2.6423585592436742,
"learning_rate": 3.181489372958272e-06,
"loss": 0.079,
"step": 736
},
{
"epoch": 1.238655462184874,
"grad_norm": 3.5166766786686767,
"learning_rate": 3.1691997510773376e-06,
"loss": 0.0993,
"step": 737
},
{
"epoch": 1.2403361344537815,
"grad_norm": 3.0030664598658494,
"learning_rate": 3.1569228890747305e-06,
"loss": 0.1061,
"step": 738
},
{
"epoch": 1.242016806722689,
"grad_norm": 2.1826647597969906,
"learning_rate": 3.1446588725148186e-06,
"loss": 0.0559,
"step": 739
},
{
"epoch": 1.2436974789915967,
"grad_norm": 3.8308938156876025,
"learning_rate": 3.132407786872442e-06,
"loss": 0.146,
"step": 740
},
{
"epoch": 1.2453781512605042,
"grad_norm": 3.540659391096697,
"learning_rate": 3.120169717532319e-06,
"loss": 0.128,
"step": 741
},
{
"epoch": 1.2470588235294118,
"grad_norm": 2.572082033443637,
"learning_rate": 3.107944749788449e-06,
"loss": 0.1095,
"step": 742
},
{
"epoch": 1.2487394957983193,
"grad_norm": 3.1039972961858466,
"learning_rate": 3.095732968843519e-06,
"loss": 0.1072,
"step": 743
},
{
"epoch": 1.250420168067227,
"grad_norm": 3.433630247978511,
"learning_rate": 3.0835344598083095e-06,
"loss": 0.0995,
"step": 744
},
{
"epoch": 1.2521008403361344,
"grad_norm": 2.9037148814677027,
"learning_rate": 3.0713493077011027e-06,
"loss": 0.0988,
"step": 745
},
{
"epoch": 1.253781512605042,
"grad_norm": 1.6255984404646961,
"learning_rate": 3.059177597447087e-06,
"loss": 0.0472,
"step": 746
},
{
"epoch": 1.2554621848739496,
"grad_norm": 2.8035161644835784,
"learning_rate": 3.0470194138777707e-06,
"loss": 0.0901,
"step": 747
},
{
"epoch": 1.2571428571428571,
"grad_norm": 3.0114191790076346,
"learning_rate": 3.0348748417303826e-06,
"loss": 0.1108,
"step": 748
},
{
"epoch": 1.2588235294117647,
"grad_norm": 2.6244638367291415,
"learning_rate": 3.0227439656472878e-06,
"loss": 0.0622,
"step": 749
},
{
"epoch": 1.2605042016806722,
"grad_norm": 2.0291708665569934,
"learning_rate": 3.0106268701753967e-06,
"loss": 0.0566,
"step": 750
},
{
"epoch": 1.2621848739495798,
"grad_norm": 2.8532612744730907,
"learning_rate": 2.9985236397655726e-06,
"loss": 0.0888,
"step": 751
},
{
"epoch": 1.2638655462184873,
"grad_norm": 2.2036167732649052,
"learning_rate": 2.986434358772048e-06,
"loss": 0.0546,
"step": 752
},
{
"epoch": 1.265546218487395,
"grad_norm": 2.7784477191560573,
"learning_rate": 2.974359111451831e-06,
"loss": 0.1147,
"step": 753
},
{
"epoch": 1.2672268907563025,
"grad_norm": 3.1299626086221033,
"learning_rate": 2.962297981964124e-06,
"loss": 0.1018,
"step": 754
},
{
"epoch": 1.26890756302521,
"grad_norm": 3.1824195697491935,
"learning_rate": 2.950251054369733e-06,
"loss": 0.1057,
"step": 755
},
{
"epoch": 1.2705882352941176,
"grad_norm": 2.9740777866179,
"learning_rate": 2.9382184126304834e-06,
"loss": 0.1037,
"step": 756
},
{
"epoch": 1.2722689075630251,
"grad_norm": 2.6657894597056817,
"learning_rate": 2.926200140608634e-06,
"loss": 0.1039,
"step": 757
},
{
"epoch": 1.2739495798319327,
"grad_norm": 2.264102839980714,
"learning_rate": 2.9141963220662917e-06,
"loss": 0.076,
"step": 758
},
{
"epoch": 1.2756302521008402,
"grad_norm": 3.5306424078481626,
"learning_rate": 2.902207040664834e-06,
"loss": 0.1123,
"step": 759
},
{
"epoch": 1.2773109243697478,
"grad_norm": 3.1919340141274977,
"learning_rate": 2.8902323799643116e-06,
"loss": 0.0979,
"step": 760
},
{
"epoch": 1.2789915966386554,
"grad_norm": 2.3313048943399224,
"learning_rate": 2.8782724234228876e-06,
"loss": 0.0681,
"step": 761
},
{
"epoch": 1.280672268907563,
"grad_norm": 3.747945457345036,
"learning_rate": 2.8663272543962305e-06,
"loss": 0.1077,
"step": 762
},
{
"epoch": 1.2823529411764705,
"grad_norm": 2.8608755161882184,
"learning_rate": 2.8543969561369556e-06,
"loss": 0.0997,
"step": 763
},
{
"epoch": 1.284033613445378,
"grad_norm": 2.903410246959019,
"learning_rate": 2.842481611794032e-06,
"loss": 0.0793,
"step": 764
},
{
"epoch": 1.2857142857142856,
"grad_norm": 2.6778742078682223,
"learning_rate": 2.83058130441221e-06,
"loss": 0.0689,
"step": 765
},
{
"epoch": 1.2873949579831931,
"grad_norm": 2.7827741894771134,
"learning_rate": 2.818696116931431e-06,
"loss": 0.075,
"step": 766
},
{
"epoch": 1.289075630252101,
"grad_norm": 2.128764344141235,
"learning_rate": 2.8068261321862667e-06,
"loss": 0.09,
"step": 767
},
{
"epoch": 1.2907563025210085,
"grad_norm": 3.329548325521387,
"learning_rate": 2.794971432905323e-06,
"loss": 0.0795,
"step": 768
},
{
"epoch": 1.292436974789916,
"grad_norm": 2.2187604305649695,
"learning_rate": 2.7831321017106805e-06,
"loss": 0.0575,
"step": 769
},
{
"epoch": 1.2941176470588236,
"grad_norm": 3.3967436709413836,
"learning_rate": 2.771308221117309e-06,
"loss": 0.1082,
"step": 770
},
{
"epoch": 1.2957983193277312,
"grad_norm": 3.6080260280942316,
"learning_rate": 2.7594998735324905e-06,
"loss": 0.0971,
"step": 771
},
{
"epoch": 1.2974789915966387,
"grad_norm": 2.621544354495556,
"learning_rate": 2.7477071412552554e-06,
"loss": 0.0846,
"step": 772
},
{
"epoch": 1.2991596638655463,
"grad_norm": 2.5632423256672716,
"learning_rate": 2.735930106475794e-06,
"loss": 0.0786,
"step": 773
},
{
"epoch": 1.3008403361344538,
"grad_norm": 2.819748849096385,
"learning_rate": 2.724168851274901e-06,
"loss": 0.0913,
"step": 774
},
{
"epoch": 1.3025210084033614,
"grad_norm": 3.2483306380332384,
"learning_rate": 2.712423457623385e-06,
"loss": 0.1027,
"step": 775
},
{
"epoch": 1.304201680672269,
"grad_norm": 2.607112390166478,
"learning_rate": 2.7006940073815136e-06,
"loss": 0.0852,
"step": 776
},
{
"epoch": 1.3058823529411765,
"grad_norm": 2.550425691687378,
"learning_rate": 2.6889805822984348e-06,
"loss": 0.0782,
"step": 777
},
{
"epoch": 1.307563025210084,
"grad_norm": 2.4162221875946672,
"learning_rate": 2.6772832640116035e-06,
"loss": 0.0997,
"step": 778
},
{
"epoch": 1.3092436974789916,
"grad_norm": 2.8532159540534363,
"learning_rate": 2.6656021340462246e-06,
"loss": 0.1012,
"step": 779
},
{
"epoch": 1.3109243697478992,
"grad_norm": 3.2586342692184638,
"learning_rate": 2.6539372738146696e-06,
"loss": 0.1058,
"step": 780
},
{
"epoch": 1.3126050420168067,
"grad_norm": 2.752956076373071,
"learning_rate": 2.6422887646159234e-06,
"loss": 0.0999,
"step": 781
},
{
"epoch": 1.3142857142857143,
"grad_norm": 2.6282166566722163,
"learning_rate": 2.6306566876350072e-06,
"loss": 0.0904,
"step": 782
},
{
"epoch": 1.3159663865546218,
"grad_norm": 4.459511701783127,
"learning_rate": 2.619041123942419e-06,
"loss": 0.1412,
"step": 783
},
{
"epoch": 1.3176470588235294,
"grad_norm": 3.0171293567872057,
"learning_rate": 2.607442154493568e-06,
"loss": 0.0778,
"step": 784
},
{
"epoch": 1.319327731092437,
"grad_norm": 2.099785903495667,
"learning_rate": 2.5958598601282036e-06,
"loss": 0.0755,
"step": 785
},
{
"epoch": 1.3210084033613445,
"grad_norm": 2.20038597827373,
"learning_rate": 2.584294321569862e-06,
"loss": 0.0977,
"step": 786
},
{
"epoch": 1.322689075630252,
"grad_norm": 2.784330403766364,
"learning_rate": 2.572745619425296e-06,
"loss": 0.0736,
"step": 787
},
{
"epoch": 1.3243697478991596,
"grad_norm": 2.0879911769632886,
"learning_rate": 2.561213834183919e-06,
"loss": 0.0608,
"step": 788
},
{
"epoch": 1.3260504201680672,
"grad_norm": 3.9806688534541843,
"learning_rate": 2.5496990462172344e-06,
"loss": 0.0723,
"step": 789
},
{
"epoch": 1.3277310924369747,
"grad_norm": 2.7580721019333603,
"learning_rate": 2.538201335778289e-06,
"loss": 0.082,
"step": 790
},
{
"epoch": 1.3294117647058823,
"grad_norm": 2.8722823015793146,
"learning_rate": 2.526720783001107e-06,
"loss": 0.1008,
"step": 791
},
{
"epoch": 1.3310924369747898,
"grad_norm": 2.434260993789297,
"learning_rate": 2.5152574679001236e-06,
"loss": 0.0738,
"step": 792
},
{
"epoch": 1.3327731092436974,
"grad_norm": 2.80994760149212,
"learning_rate": 2.503811470369644e-06,
"loss": 0.0794,
"step": 793
},
{
"epoch": 1.334453781512605,
"grad_norm": 2.9346405462655465,
"learning_rate": 2.4923828701832682e-06,
"loss": 0.1039,
"step": 794
},
{
"epoch": 1.3361344537815127,
"grad_norm": 4.00969514669466,
"learning_rate": 2.4809717469933543e-06,
"loss": 0.0713,
"step": 795
},
{
"epoch": 1.3378151260504203,
"grad_norm": 3.3856635475000134,
"learning_rate": 2.469578180330444e-06,
"loss": 0.1224,
"step": 796
},
{
"epoch": 1.3394957983193279,
"grad_norm": 4.141002149954988,
"learning_rate": 2.458202249602726e-06,
"loss": 0.1295,
"step": 797
},
{
"epoch": 1.3411764705882354,
"grad_norm": 2.3909571975188566,
"learning_rate": 2.4468440340954664e-06,
"loss": 0.0858,
"step": 798
},
{
"epoch": 1.342857142857143,
"grad_norm": 2.732623939016431,
"learning_rate": 2.43550361297047e-06,
"loss": 0.1232,
"step": 799
},
{
"epoch": 1.3445378151260505,
"grad_norm": 2.27043539875792,
"learning_rate": 2.4241810652655197e-06,
"loss": 0.0646,
"step": 800
},
{
"epoch": 1.346218487394958,
"grad_norm": 2.800553021390132,
"learning_rate": 2.4128764698938297e-06,
"loss": 0.0856,
"step": 801
},
{
"epoch": 1.3478991596638656,
"grad_norm": 2.468540615484736,
"learning_rate": 2.4015899056434945e-06,
"loss": 0.1067,
"step": 802
},
{
"epoch": 1.3495798319327732,
"grad_norm": 4.025378528695785,
"learning_rate": 2.390321451176936e-06,
"loss": 0.105,
"step": 803
},
{
"epoch": 1.3512605042016808,
"grad_norm": 2.0577272684146872,
"learning_rate": 2.379071185030365e-06,
"loss": 0.067,
"step": 804
},
{
"epoch": 1.3529411764705883,
"grad_norm": 2.7859891346028185,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.0715,
"step": 805
},
{
"epoch": 1.3546218487394959,
"grad_norm": 3.417663771387606,
"learning_rate": 2.356625531207638e-06,
"loss": 0.1134,
"step": 806
},
{
"epoch": 1.3563025210084034,
"grad_norm": 2.9934064584562807,
"learning_rate": 2.345430299967888e-06,
"loss": 0.099,
"step": 807
},
{
"epoch": 1.357983193277311,
"grad_norm": 3.0446906092550727,
"learning_rate": 2.334253569919846e-06,
"loss": 0.0881,
"step": 808
},
{
"epoch": 1.3596638655462185,
"grad_norm": 2.714244890028747,
"learning_rate": 2.323095418960442e-06,
"loss": 0.0651,
"step": 809
},
{
"epoch": 1.361344537815126,
"grad_norm": 2.7908745015689145,
"learning_rate": 2.311955924857113e-06,
"loss": 0.0928,
"step": 810
},
{
"epoch": 1.3630252100840337,
"grad_norm": 2.593550587022856,
"learning_rate": 2.3008351652472714e-06,
"loss": 0.0936,
"step": 811
},
{
"epoch": 1.3647058823529412,
"grad_norm": 2.6262874429380214,
"learning_rate": 2.289733217637753e-06,
"loss": 0.07,
"step": 812
},
{
"epoch": 1.3663865546218488,
"grad_norm": 2.5534127800889346,
"learning_rate": 2.278650159404289e-06,
"loss": 0.0778,
"step": 813
},
{
"epoch": 1.3680672268907563,
"grad_norm": 2.44366168179335,
"learning_rate": 2.267586067790952e-06,
"loss": 0.0825,
"step": 814
},
{
"epoch": 1.3697478991596639,
"grad_norm": 2.768515991865253,
"learning_rate": 2.2565410199096322e-06,
"loss": 0.086,
"step": 815
},
{
"epoch": 1.3714285714285714,
"grad_norm": 2.9765080509774546,
"learning_rate": 2.245515092739488e-06,
"loss": 0.0967,
"step": 816
},
{
"epoch": 1.373109243697479,
"grad_norm": 2.875743137704446,
"learning_rate": 2.234508363126419e-06,
"loss": 0.1058,
"step": 817
},
{
"epoch": 1.3747899159663866,
"grad_norm": 3.542235960340661,
"learning_rate": 2.2235209077825264e-06,
"loss": 0.153,
"step": 818
},
{
"epoch": 1.3764705882352941,
"grad_norm": 2.403232600671078,
"learning_rate": 2.2125528032855727e-06,
"loss": 0.0689,
"step": 819
},
{
"epoch": 1.3781512605042017,
"grad_norm": 2.7232278008796693,
"learning_rate": 2.2016041260784604e-06,
"loss": 0.0523,
"step": 820
},
{
"epoch": 1.3798319327731092,
"grad_norm": 2.854644874148179,
"learning_rate": 2.1906749524686856e-06,
"loss": 0.0783,
"step": 821
},
{
"epoch": 1.3815126050420168,
"grad_norm": 1.8526957378806435,
"learning_rate": 2.1797653586278193e-06,
"loss": 0.0658,
"step": 822
},
{
"epoch": 1.3831932773109243,
"grad_norm": 3.1776681065587606,
"learning_rate": 2.168875420590965e-06,
"loss": 0.1051,
"step": 823
},
{
"epoch": 1.384873949579832,
"grad_norm": 2.5512188793181623,
"learning_rate": 2.158005214256236e-06,
"loss": 0.0708,
"step": 824
},
{
"epoch": 1.3865546218487395,
"grad_norm": 2.6609634186618156,
"learning_rate": 2.147154815384226e-06,
"loss": 0.0768,
"step": 825
},
{
"epoch": 1.388235294117647,
"grad_norm": 2.3931424820403295,
"learning_rate": 2.136324299597474e-06,
"loss": 0.0715,
"step": 826
},
{
"epoch": 1.3899159663865546,
"grad_norm": 2.8679014750294125,
"learning_rate": 2.12551374237995e-06,
"loss": 0.1095,
"step": 827
},
{
"epoch": 1.3915966386554621,
"grad_norm": 2.180768007063552,
"learning_rate": 2.1147232190765137e-06,
"loss": 0.0675,
"step": 828
},
{
"epoch": 1.3932773109243697,
"grad_norm": 2.5066981219224775,
"learning_rate": 2.1039528048924043e-06,
"loss": 0.059,
"step": 829
},
{
"epoch": 1.3949579831932772,
"grad_norm": 3.3803755201148187,
"learning_rate": 2.0932025748927015e-06,
"loss": 0.1068,
"step": 830
},
{
"epoch": 1.3966386554621848,
"grad_norm": 2.371033561602867,
"learning_rate": 2.0824726040018174e-06,
"loss": 0.0728,
"step": 831
},
{
"epoch": 1.3983193277310924,
"grad_norm": 2.774208580231465,
"learning_rate": 2.0717629670029653e-06,
"loss": 0.0661,
"step": 832
},
{
"epoch": 1.4,
"grad_norm": 2.1336928525875836,
"learning_rate": 2.061073738537635e-06,
"loss": 0.0733,
"step": 833
},
{
"epoch": 1.4016806722689075,
"grad_norm": 3.1234467182623975,
"learning_rate": 2.050404993105085e-06,
"loss": 0.0831,
"step": 834
},
{
"epoch": 1.403361344537815,
"grad_norm": 3.091586518307653,
"learning_rate": 2.0397568050618095e-06,
"loss": 0.1086,
"step": 835
},
{
"epoch": 1.4050420168067226,
"grad_norm": 3.002925392888798,
"learning_rate": 2.0291292486210327e-06,
"loss": 0.0994,
"step": 836
},
{
"epoch": 1.4067226890756301,
"grad_norm": 2.4071605199504105,
"learning_rate": 2.018522397852178e-06,
"loss": 0.0642,
"step": 837
},
{
"epoch": 1.4084033613445377,
"grad_norm": 4.850453820458399,
"learning_rate": 2.0079363266803696e-06,
"loss": 0.1209,
"step": 838
},
{
"epoch": 1.4100840336134453,
"grad_norm": 2.5655766253111913,
"learning_rate": 1.9973711088858973e-06,
"loss": 0.0718,
"step": 839
},
{
"epoch": 1.4117647058823528,
"grad_norm": 2.7191226267335726,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.0898,
"step": 840
},
{
"epoch": 1.4134453781512604,
"grad_norm": 2.636556426536022,
"learning_rate": 1.976303527822933e-06,
"loss": 0.0944,
"step": 841
},
{
"epoch": 1.4151260504201681,
"grad_norm": 3.427543761682362,
"learning_rate": 1.9658013113862806e-06,
"loss": 0.0629,
"step": 842
},
{
"epoch": 1.4168067226890757,
"grad_norm": 2.407314737887372,
"learning_rate": 1.9553202419896256e-06,
"loss": 0.0669,
"step": 843
},
{
"epoch": 1.4184873949579833,
"grad_norm": 2.3970562937015605,
"learning_rate": 1.944860392681442e-06,
"loss": 0.0842,
"step": 844
},
{
"epoch": 1.4201680672268908,
"grad_norm": 3.0111009657414436,
"learning_rate": 1.934421836362315e-06,
"loss": 0.0925,
"step": 845
},
{
"epoch": 1.4218487394957984,
"grad_norm": 2.3181211787455287,
"learning_rate": 1.9240046457844223e-06,
"loss": 0.0857,
"step": 846
},
{
"epoch": 1.423529411764706,
"grad_norm": 2.4618319020447057,
"learning_rate": 1.913608893551036e-06,
"loss": 0.0718,
"step": 847
},
{
"epoch": 1.4252100840336135,
"grad_norm": 2.4722571546574206,
"learning_rate": 1.9032346521160066e-06,
"loss": 0.0775,
"step": 848
},
{
"epoch": 1.426890756302521,
"grad_norm": 2.8957411522240957,
"learning_rate": 1.8928819937832689e-06,
"loss": 0.0981,
"step": 849
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.0058263449936424,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.1281,
"step": 850
},
{
"epoch": 1.4302521008403362,
"grad_norm": 2.6662478992952585,
"learning_rate": 1.8722417148877752e-06,
"loss": 0.0808,
"step": 851
},
{
"epoch": 1.4319327731092437,
"grad_norm": 2.9792508981015247,
"learning_rate": 1.8619542381787508e-06,
"loss": 0.0999,
"step": 852
},
{
"epoch": 1.4336134453781513,
"grad_norm": 2.1089140232124155,
"learning_rate": 1.851688632278476e-06,
"loss": 0.0469,
"step": 853
},
{
"epoch": 1.4352941176470588,
"grad_norm": 3.5585819612230334,
"learning_rate": 1.8414449687337467e-06,
"loss": 0.0704,
"step": 854
},
{
"epoch": 1.4369747899159664,
"grad_norm": 2.24926411881091,
"learning_rate": 1.8312233189384194e-06,
"loss": 0.0735,
"step": 855
},
{
"epoch": 1.438655462184874,
"grad_norm": 2.6302186153665947,
"learning_rate": 1.821023754132933e-06,
"loss": 0.074,
"step": 856
},
{
"epoch": 1.4403361344537815,
"grad_norm": 2.8780342261679506,
"learning_rate": 1.8108463454038022e-06,
"loss": 0.0778,
"step": 857
},
{
"epoch": 1.442016806722689,
"grad_norm": 3.7396581161865172,
"learning_rate": 1.800691163683118e-06,
"loss": 0.1022,
"step": 858
},
{
"epoch": 1.4436974789915966,
"grad_norm": 3.60714342413831,
"learning_rate": 1.790558279748067e-06,
"loss": 0.1618,
"step": 859
},
{
"epoch": 1.4453781512605042,
"grad_norm": 2.9595163340771147,
"learning_rate": 1.780447764220422e-06,
"loss": 0.0904,
"step": 860
},
{
"epoch": 1.4470588235294117,
"grad_norm": 2.1689321130662886,
"learning_rate": 1.7703596875660645e-06,
"loss": 0.0719,
"step": 861
},
{
"epoch": 1.4487394957983193,
"grad_norm": 2.1941556772158264,
"learning_rate": 1.7602941200944812e-06,
"loss": 0.0635,
"step": 862
},
{
"epoch": 1.4504201680672268,
"grad_norm": 2.2798654285400737,
"learning_rate": 1.7502511319582855e-06,
"loss": 0.0875,
"step": 863
},
{
"epoch": 1.4521008403361344,
"grad_norm": 2.2334745972248298,
"learning_rate": 1.7402307931527157e-06,
"loss": 0.0687,
"step": 864
},
{
"epoch": 1.453781512605042,
"grad_norm": 2.8004894708871415,
"learning_rate": 1.7302331735151594e-06,
"loss": 0.1048,
"step": 865
},
{
"epoch": 1.4554621848739495,
"grad_norm": 3.108271318580809,
"learning_rate": 1.7202583427246633e-06,
"loss": 0.0753,
"step": 866
},
{
"epoch": 1.457142857142857,
"grad_norm": 2.312999197511707,
"learning_rate": 1.7103063703014372e-06,
"loss": 0.0746,
"step": 867
},
{
"epoch": 1.4588235294117646,
"grad_norm": 2.6351363101768954,
"learning_rate": 1.7003773256063882e-06,
"loss": 0.092,
"step": 868
},
{
"epoch": 1.4605042016806722,
"grad_norm": 2.183847704629778,
"learning_rate": 1.690471277840619e-06,
"loss": 0.0625,
"step": 869
},
{
"epoch": 1.46218487394958,
"grad_norm": 3.064581959699731,
"learning_rate": 1.6805882960449594e-06,
"loss": 0.1215,
"step": 870
},
{
"epoch": 1.4638655462184875,
"grad_norm": 3.1271728558378737,
"learning_rate": 1.6707284490994746e-06,
"loss": 0.0908,
"step": 871
},
{
"epoch": 1.465546218487395,
"grad_norm": 2.9326514691610455,
"learning_rate": 1.6608918057229944e-06,
"loss": 0.0798,
"step": 872
},
{
"epoch": 1.4672268907563026,
"grad_norm": 3.6801264170612193,
"learning_rate": 1.6510784344726294e-06,
"loss": 0.1136,
"step": 873
},
{
"epoch": 1.4689075630252102,
"grad_norm": 2.2001266838315057,
"learning_rate": 1.6412884037432875e-06,
"loss": 0.077,
"step": 874
},
{
"epoch": 1.4705882352941178,
"grad_norm": 2.018537490774913,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.0577,
"step": 875
},
{
"epoch": 1.4722689075630253,
"grad_norm": 2.502467447738802,
"learning_rate": 1.6217786366134953e-06,
"loss": 0.0867,
"step": 876
},
{
"epoch": 1.4739495798319329,
"grad_norm": 2.404093931401216,
"learning_rate": 1.612059036187602e-06,
"loss": 0.0818,
"step": 877
},
{
"epoch": 1.4756302521008404,
"grad_norm": 2.7777757705585366,
"learning_rate": 1.6023630482309017e-06,
"loss": 0.0851,
"step": 878
},
{
"epoch": 1.477310924369748,
"grad_norm": 2.1744066253008025,
"learning_rate": 1.5926907403202001e-06,
"loss": 0.0667,
"step": 879
},
{
"epoch": 1.4789915966386555,
"grad_norm": 3.2942884352254542,
"learning_rate": 1.5830421798672568e-06,
"loss": 0.0714,
"step": 880
},
{
"epoch": 1.480672268907563,
"grad_norm": 3.4466527581467976,
"learning_rate": 1.5734174341183284e-06,
"loss": 0.1096,
"step": 881
},
{
"epoch": 1.4823529411764707,
"grad_norm": 2.530910684223825,
"learning_rate": 1.5638165701536866e-06,
"loss": 0.0786,
"step": 882
},
{
"epoch": 1.4840336134453782,
"grad_norm": 2.949069912774898,
"learning_rate": 1.554239654887163e-06,
"loss": 0.082,
"step": 883
},
{
"epoch": 1.4857142857142858,
"grad_norm": 2.918482254384839,
"learning_rate": 1.544686755065677e-06,
"loss": 0.1056,
"step": 884
},
{
"epoch": 1.4873949579831933,
"grad_norm": 2.9537497761417644,
"learning_rate": 1.5351579372687658e-06,
"loss": 0.0904,
"step": 885
},
{
"epoch": 1.4890756302521009,
"grad_norm": 3.0556323605290676,
"learning_rate": 1.525653267908132e-06,
"loss": 0.0871,
"step": 886
},
{
"epoch": 1.4907563025210084,
"grad_norm": 2.6043633400058006,
"learning_rate": 1.5161728132271674e-06,
"loss": 0.0744,
"step": 887
},
{
"epoch": 1.492436974789916,
"grad_norm": 2.0618324084427804,
"learning_rate": 1.5067166393005055e-06,
"loss": 0.0701,
"step": 888
},
{
"epoch": 1.4941176470588236,
"grad_norm": 2.9896346108157177,
"learning_rate": 1.4972848120335453e-06,
"loss": 0.1136,
"step": 889
},
{
"epoch": 1.495798319327731,
"grad_norm": 2.605228843859927,
"learning_rate": 1.4878773971620076e-06,
"loss": 0.0673,
"step": 890
},
{
"epoch": 1.4974789915966387,
"grad_norm": 2.7905166458094333,
"learning_rate": 1.4784944602514662e-06,
"loss": 0.0809,
"step": 891
},
{
"epoch": 1.4991596638655462,
"grad_norm": 3.1497593827029506,
"learning_rate": 1.4691360666968923e-06,
"loss": 0.1021,
"step": 892
},
{
"epoch": 1.5008403361344538,
"grad_norm": 2.1897838947360735,
"learning_rate": 1.4598022817222058e-06,
"loss": 0.0571,
"step": 893
},
{
"epoch": 1.5025210084033613,
"grad_norm": 2.302331646085917,
"learning_rate": 1.4504931703798086e-06,
"loss": 0.0576,
"step": 894
},
{
"epoch": 1.504201680672269,
"grad_norm": 2.7532766985557857,
"learning_rate": 1.4412087975501459e-06,
"loss": 0.076,
"step": 895
},
{
"epoch": 1.5058823529411764,
"grad_norm": 2.5202671032062582,
"learning_rate": 1.4319492279412388e-06,
"loss": 0.0678,
"step": 896
},
{
"epoch": 1.507563025210084,
"grad_norm": 3.3560869623465535,
"learning_rate": 1.4227145260882463e-06,
"loss": 0.0674,
"step": 897
},
{
"epoch": 1.5092436974789916,
"grad_norm": 2.7037346414852497,
"learning_rate": 1.413504756353009e-06,
"loss": 0.0877,
"step": 898
},
{
"epoch": 1.5109243697478991,
"grad_norm": 2.950076661717238,
"learning_rate": 1.4043199829235983e-06,
"loss": 0.1009,
"step": 899
},
{
"epoch": 1.5126050420168067,
"grad_norm": 2.9724993078538096,
"learning_rate": 1.3951602698138773e-06,
"loss": 0.0998,
"step": 900
},
{
"epoch": 1.5142857142857142,
"grad_norm": 2.171203981414814,
"learning_rate": 1.3860256808630429e-06,
"loss": 0.0649,
"step": 901
},
{
"epoch": 1.5159663865546218,
"grad_norm": 2.3717860449067407,
"learning_rate": 1.3769162797351953e-06,
"loss": 0.0628,
"step": 902
},
{
"epoch": 1.5176470588235293,
"grad_norm": 3.7600443429783352,
"learning_rate": 1.3678321299188802e-06,
"loss": 0.1002,
"step": 903
},
{
"epoch": 1.519327731092437,
"grad_norm": 3.0435496611540467,
"learning_rate": 1.3587732947266557e-06,
"loss": 0.095,
"step": 904
},
{
"epoch": 1.5210084033613445,
"grad_norm": 2.96613726072199,
"learning_rate": 1.34973983729465e-06,
"loss": 0.0914,
"step": 905
},
{
"epoch": 1.522689075630252,
"grad_norm": 2.8435196172045907,
"learning_rate": 1.340731820582114e-06,
"loss": 0.0699,
"step": 906
},
{
"epoch": 1.5243697478991596,
"grad_norm": 2.464476894025319,
"learning_rate": 1.3317493073709936e-06,
"loss": 0.0705,
"step": 907
},
{
"epoch": 1.5260504201680671,
"grad_norm": 1.890126330693869,
"learning_rate": 1.3227923602654808e-06,
"loss": 0.0561,
"step": 908
},
{
"epoch": 1.5277310924369747,
"grad_norm": 2.2602626027892176,
"learning_rate": 1.3138610416915887e-06,
"loss": 0.0733,
"step": 909
},
{
"epoch": 1.5294117647058822,
"grad_norm": 2.5239743120304468,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.0658,
"step": 910
},
{
"epoch": 1.5310924369747898,
"grad_norm": 2.1639616773849344,
"learning_rate": 1.2960755389491703e-06,
"loss": 0.0644,
"step": 911
},
{
"epoch": 1.5327731092436974,
"grad_norm": 2.7172132336504373,
"learning_rate": 1.2872214787378306e-06,
"loss": 0.0806,
"step": 912
},
{
"epoch": 1.534453781512605,
"grad_norm": 2.433240574623036,
"learning_rate": 1.278393294971626e-06,
"loss": 0.0783,
"step": 913
},
{
"epoch": 1.5361344537815125,
"grad_norm": 3.0834984240711565,
"learning_rate": 1.269591049179138e-06,
"loss": 0.0617,
"step": 914
},
{
"epoch": 1.53781512605042,
"grad_norm": 2.397030675922682,
"learning_rate": 1.2608148027081773e-06,
"loss": 0.0818,
"step": 915
},
{
"epoch": 1.5394957983193276,
"grad_norm": 3.0216001414776374,
"learning_rate": 1.2520646167253514e-06,
"loss": 0.0821,
"step": 916
},
{
"epoch": 1.5411764705882351,
"grad_norm": 3.134391572738623,
"learning_rate": 1.2433405522156334e-06,
"loss": 0.0916,
"step": 917
},
{
"epoch": 1.5428571428571427,
"grad_norm": 2.3360210128403636,
"learning_rate": 1.234642669981946e-06,
"loss": 0.0641,
"step": 918
},
{
"epoch": 1.5445378151260503,
"grad_norm": 2.7163994508351217,
"learning_rate": 1.2259710306447275e-06,
"loss": 0.0886,
"step": 919
},
{
"epoch": 1.5462184873949578,
"grad_norm": 2.431875213779764,
"learning_rate": 1.2173256946415214e-06,
"loss": 0.0773,
"step": 920
},
{
"epoch": 1.5478991596638656,
"grad_norm": 2.5611857478832882,
"learning_rate": 1.2087067222265409e-06,
"loss": 0.0628,
"step": 921
},
{
"epoch": 1.5495798319327732,
"grad_norm": 2.9190300690138633,
"learning_rate": 1.2001141734702625e-06,
"loss": 0.077,
"step": 922
},
{
"epoch": 1.5512605042016807,
"grad_norm": 2.6935195739367526,
"learning_rate": 1.1915481082589998e-06,
"loss": 0.0651,
"step": 923
},
{
"epoch": 1.5529411764705883,
"grad_norm": 2.495024421506342,
"learning_rate": 1.1830085862944851e-06,
"loss": 0.0812,
"step": 924
},
{
"epoch": 1.5546218487394958,
"grad_norm": 2.3186901545314353,
"learning_rate": 1.17449566709346e-06,
"loss": 0.0682,
"step": 925
},
{
"epoch": 1.5563025210084034,
"grad_norm": 2.3237588553363944,
"learning_rate": 1.166009409987251e-06,
"loss": 0.0555,
"step": 926
},
{
"epoch": 1.557983193277311,
"grad_norm": 2.8044298597093107,
"learning_rate": 1.1575498741213682e-06,
"loss": 0.0947,
"step": 927
},
{
"epoch": 1.5596638655462185,
"grad_norm": 2.4178880085870396,
"learning_rate": 1.1491171184550799e-06,
"loss": 0.0629,
"step": 928
},
{
"epoch": 1.561344537815126,
"grad_norm": 1.976434008723005,
"learning_rate": 1.1407112017610134e-06,
"loss": 0.052,
"step": 929
},
{
"epoch": 1.5630252100840336,
"grad_norm": 3.483563565061037,
"learning_rate": 1.1323321826247347e-06,
"loss": 0.077,
"step": 930
},
{
"epoch": 1.5647058823529412,
"grad_norm": 1.8972048774489287,
"learning_rate": 1.1239801194443507e-06,
"loss": 0.0494,
"step": 931
},
{
"epoch": 1.5663865546218487,
"grad_norm": 3.1882538432651777,
"learning_rate": 1.115655070430096e-06,
"loss": 0.0779,
"step": 932
},
{
"epoch": 1.5680672268907563,
"grad_norm": 3.1426262134738603,
"learning_rate": 1.107357093603924e-06,
"loss": 0.076,
"step": 933
},
{
"epoch": 1.5697478991596638,
"grad_norm": 3.3491106407225555,
"learning_rate": 1.0990862467991132e-06,
"loss": 0.0939,
"step": 934
},
{
"epoch": 1.5714285714285714,
"grad_norm": 2.3555969578510196,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.0683,
"step": 935
},
{
"epoch": 1.573109243697479,
"grad_norm": 2.4300759069980136,
"learning_rate": 1.082626173640846e-06,
"loss": 0.0788,
"step": 936
},
{
"epoch": 1.5747899159663865,
"grad_norm": 3.0685309864448636,
"learning_rate": 1.0744370620069122e-06,
"loss": 0.0852,
"step": 937
},
{
"epoch": 1.576470588235294,
"grad_norm": 2.568291131776804,
"learning_rate": 1.066275309832584e-06,
"loss": 0.074,
"step": 938
},
{
"epoch": 1.5781512605042018,
"grad_norm": 2.520877566474604,
"learning_rate": 1.0581409740017113e-06,
"loss": 0.0521,
"step": 939
},
{
"epoch": 1.5798319327731094,
"grad_norm": 2.792407855946344,
"learning_rate": 1.0500341112070605e-06,
"loss": 0.0789,
"step": 940
},
{
"epoch": 1.581512605042017,
"grad_norm": 3.1440477930637214,
"learning_rate": 1.0419547779499283e-06,
"loss": 0.111,
"step": 941
},
{
"epoch": 1.5831932773109245,
"grad_norm": 3.08626306661282,
"learning_rate": 1.0339030305397374e-06,
"loss": 0.0773,
"step": 942
},
{
"epoch": 1.584873949579832,
"grad_norm": 2.7818074011761613,
"learning_rate": 1.025878925093653e-06,
"loss": 0.1025,
"step": 943
},
{
"epoch": 1.5865546218487396,
"grad_norm": 2.0707670868256756,
"learning_rate": 1.0178825175361846e-06,
"loss": 0.0543,
"step": 944
},
{
"epoch": 1.5882352941176472,
"grad_norm": 3.277736077714651,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.0886,
"step": 945
},
{
"epoch": 1.5899159663865547,
"grad_norm": 1.7856587713887242,
"learning_rate": 1.0019730188195464e-06,
"loss": 0.0498,
"step": 946
},
{
"epoch": 1.5915966386554623,
"grad_norm": 2.6385746957949787,
"learning_rate": 9.940600385426347e-07,
"loss": 0.0812,
"step": 947
},
{
"epoch": 1.5932773109243699,
"grad_norm": 2.776521241285141,
"learning_rate": 9.861749779180873e-07,
"loss": 0.087,
"step": 948
},
{
"epoch": 1.5949579831932774,
"grad_norm": 2.639723525487175,
"learning_rate": 9.783178919013297e-07,
"loss": 0.1119,
"step": 949
},
{
"epoch": 1.596638655462185,
"grad_norm": 2.7204981405013458,
"learning_rate": 9.704888352528257e-07,
"loss": 0.085,
"step": 950
},
{
"epoch": 1.5983193277310925,
"grad_norm": 2.5248398773851166,
"learning_rate": 9.626878625376784e-07,
"loss": 0.0695,
"step": 951
},
{
"epoch": 1.6,
"grad_norm": 2.670224920366225,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0951,
"step": 952
},
{
"epoch": 1.6016806722689076,
"grad_norm": 2.7422961930525176,
"learning_rate": 9.471703861888398e-07,
"loss": 0.1044,
"step": 953
},
{
"epoch": 1.6033613445378152,
"grad_norm": 2.2888317772986673,
"learning_rate": 9.394539907051837e-07,
"loss": 0.0586,
"step": 954
},
{
"epoch": 1.6050420168067228,
"grad_norm": 2.1943114037634976,
"learning_rate": 9.317658954541992e-07,
"loss": 0.0687,
"step": 955
},
{
"epoch": 1.6067226890756303,
"grad_norm": 3.4427372615358003,
"learning_rate": 9.241061540185547e-07,
"loss": 0.0908,
"step": 956
},
{
"epoch": 1.6084033613445379,
"grad_norm": 2.817393993003906,
"learning_rate": 9.164748197833039e-07,
"loss": 0.0619,
"step": 957
},
{
"epoch": 1.6100840336134454,
"grad_norm": 2.9866238876292415,
"learning_rate": 9.088719459355133e-07,
"loss": 0.0672,
"step": 958
},
{
"epoch": 1.611764705882353,
"grad_norm": 2.785563725722871,
"learning_rate": 9.01297585463895e-07,
"loss": 0.0745,
"step": 959
},
{
"epoch": 1.6134453781512605,
"grad_norm": 2.3105959305373136,
"learning_rate": 8.937517911584321e-07,
"loss": 0.0705,
"step": 960
},
{
"epoch": 1.615126050420168,
"grad_norm": 1.99378117488201,
"learning_rate": 8.862346156100188e-07,
"loss": 0.0466,
"step": 961
},
{
"epoch": 1.6168067226890757,
"grad_norm": 2.6666955685587053,
"learning_rate": 8.787461112100837e-07,
"loss": 0.0656,
"step": 962
},
{
"epoch": 1.6184873949579832,
"grad_norm": 3.3249810625849956,
"learning_rate": 8.712863301502339e-07,
"loss": 0.0678,
"step": 963
},
{
"epoch": 1.6201680672268908,
"grad_norm": 3.353300594203047,
"learning_rate": 8.638553244218872e-07,
"loss": 0.0956,
"step": 964
},
{
"epoch": 1.6218487394957983,
"grad_norm": 2.8496488574356476,
"learning_rate": 8.56453145815907e-07,
"loss": 0.0838,
"step": 965
},
{
"epoch": 1.6235294117647059,
"grad_norm": 3.064310810621701,
"learning_rate": 8.490798459222477e-07,
"loss": 0.0865,
"step": 966
},
{
"epoch": 1.6252100840336134,
"grad_norm": 2.968813736122225,
"learning_rate": 8.417354761295876e-07,
"loss": 0.0728,
"step": 967
},
{
"epoch": 1.626890756302521,
"grad_norm": 2.049286780650219,
"learning_rate": 8.344200876249803e-07,
"loss": 0.0475,
"step": 968
},
{
"epoch": 1.6285714285714286,
"grad_norm": 3.100955143857725,
"learning_rate": 8.271337313934869e-07,
"loss": 0.0849,
"step": 969
},
{
"epoch": 1.6302521008403361,
"grad_norm": 2.9949248571700617,
"learning_rate": 8.198764582178303e-07,
"loss": 0.0859,
"step": 970
},
{
"epoch": 1.6319327731092437,
"grad_norm": 2.853687486256643,
"learning_rate": 8.12648318678036e-07,
"loss": 0.0653,
"step": 971
},
{
"epoch": 1.6336134453781512,
"grad_norm": 2.4758118064736685,
"learning_rate": 8.054493631510785e-07,
"loss": 0.0736,
"step": 972
},
{
"epoch": 1.6352941176470588,
"grad_norm": 2.480419073945819,
"learning_rate": 7.98279641810537e-07,
"loss": 0.0612,
"step": 973
},
{
"epoch": 1.6369747899159663,
"grad_norm": 2.6330025212546073,
"learning_rate": 7.911392046262367e-07,
"loss": 0.0665,
"step": 974
},
{
"epoch": 1.638655462184874,
"grad_norm": 2.6543901706240707,
"learning_rate": 7.840281013639078e-07,
"loss": 0.0508,
"step": 975
},
{
"epoch": 1.6403361344537815,
"grad_norm": 2.701255527964721,
"learning_rate": 7.769463815848344e-07,
"loss": 0.0906,
"step": 976
},
{
"epoch": 1.642016806722689,
"grad_norm": 3.0305303322182335,
"learning_rate": 7.698940946455125e-07,
"loss": 0.076,
"step": 977
},
{
"epoch": 1.6436974789915966,
"grad_norm": 2.560069381692694,
"learning_rate": 7.628712896973006e-07,
"loss": 0.0611,
"step": 978
},
{
"epoch": 1.6453781512605041,
"grad_norm": 2.126677934815069,
"learning_rate": 7.55878015686084e-07,
"loss": 0.0626,
"step": 979
},
{
"epoch": 1.6470588235294117,
"grad_norm": 3.320788842248769,
"learning_rate": 7.489143213519301e-07,
"loss": 0.066,
"step": 980
},
{
"epoch": 1.6487394957983192,
"grad_norm": 2.1324499834476893,
"learning_rate": 7.419802552287453e-07,
"loss": 0.0601,
"step": 981
},
{
"epoch": 1.6504201680672268,
"grad_norm": 1.6451700538742926,
"learning_rate": 7.350758656439455e-07,
"loss": 0.0358,
"step": 982
},
{
"epoch": 1.6521008403361344,
"grad_norm": 2.9089699245304526,
"learning_rate": 7.282012007181083e-07,
"loss": 0.091,
"step": 983
},
{
"epoch": 1.653781512605042,
"grad_norm": 3.3157789811498115,
"learning_rate": 7.213563083646497e-07,
"loss": 0.0808,
"step": 984
},
{
"epoch": 1.6554621848739495,
"grad_norm": 2.504068675080367,
"learning_rate": 7.145412362894771e-07,
"loss": 0.083,
"step": 985
},
{
"epoch": 1.657142857142857,
"grad_norm": 2.965188873541059,
"learning_rate": 7.077560319906696e-07,
"loss": 0.0761,
"step": 986
},
{
"epoch": 1.6588235294117646,
"grad_norm": 3.9651631993982632,
"learning_rate": 7.010007427581378e-07,
"loss": 0.0931,
"step": 987
},
{
"epoch": 1.6605042016806721,
"grad_norm": 2.6828484004077704,
"learning_rate": 6.942754156732978e-07,
"loss": 0.0899,
"step": 988
},
{
"epoch": 1.6621848739495797,
"grad_norm": 2.9178209309323617,
"learning_rate": 6.875800976087444e-07,
"loss": 0.0922,
"step": 989
},
{
"epoch": 1.6638655462184873,
"grad_norm": 2.167420070287885,
"learning_rate": 6.809148352279182e-07,
"loss": 0.0491,
"step": 990
},
{
"epoch": 1.6655462184873948,
"grad_norm": 2.1946934272492538,
"learning_rate": 6.742796749847908e-07,
"loss": 0.059,
"step": 991
},
{
"epoch": 1.6672268907563024,
"grad_norm": 2.312028197457495,
"learning_rate": 6.676746631235282e-07,
"loss": 0.06,
"step": 992
},
{
"epoch": 1.66890756302521,
"grad_norm": 2.6693038248515415,
"learning_rate": 6.61099845678183e-07,
"loss": 0.0644,
"step": 993
},
{
"epoch": 1.6705882352941175,
"grad_norm": 3.164886728441667,
"learning_rate": 6.545552684723583e-07,
"loss": 0.1167,
"step": 994
},
{
"epoch": 1.6722689075630253,
"grad_norm": 3.460096856141393,
"learning_rate": 6.480409771189027e-07,
"loss": 0.092,
"step": 995
},
{
"epoch": 1.6739495798319328,
"grad_norm": 2.0571715576847422,
"learning_rate": 6.415570170195801e-07,
"loss": 0.0583,
"step": 996
},
{
"epoch": 1.6756302521008404,
"grad_norm": 2.5258688350847662,
"learning_rate": 6.351034333647615e-07,
"loss": 0.0716,
"step": 997
},
{
"epoch": 1.677310924369748,
"grad_norm": 2.5066410840729385,
"learning_rate": 6.286802711331097e-07,
"loss": 0.0663,
"step": 998
},
{
"epoch": 1.6789915966386555,
"grad_norm": 2.686273803992896,
"learning_rate": 6.222875750912571e-07,
"loss": 0.0739,
"step": 999
},
{
"epoch": 1.680672268907563,
"grad_norm": 2.3800935639690572,
"learning_rate": 6.159253897935069e-07,
"loss": 0.0645,
"step": 1000
},
{
"epoch": 1.680672268907563,
"eval_loss": 0.21711505949497223,
"eval_runtime": 7.6127,
"eval_samples_per_second": 6.437,
"eval_steps_per_second": 1.708,
"step": 1000
},
{
"epoch": 1.6823529411764706,
"grad_norm": 2.3474068601495777,
"learning_rate": 6.095937595815104e-07,
"loss": 0.059,
"step": 1001
},
{
"epoch": 1.6840336134453782,
"grad_norm": 3.0027968700627143,
"learning_rate": 6.032927285839674e-07,
"loss": 0.11,
"step": 1002
},
{
"epoch": 1.6857142857142857,
"grad_norm": 2.4756615878732755,
"learning_rate": 5.9702234071631e-07,
"loss": 0.0652,
"step": 1003
},
{
"epoch": 1.6873949579831933,
"grad_norm": 2.4849167604857065,
"learning_rate": 5.907826396804062e-07,
"loss": 0.0707,
"step": 1004
},
{
"epoch": 1.6890756302521008,
"grad_norm": 2.527471856677461,
"learning_rate": 5.845736689642472e-07,
"loss": 0.0869,
"step": 1005
},
{
"epoch": 1.6907563025210084,
"grad_norm": 2.8221683255428434,
"learning_rate": 5.783954718416468e-07,
"loss": 0.0819,
"step": 1006
},
{
"epoch": 1.692436974789916,
"grad_norm": 2.631351359370838,
"learning_rate": 5.722480913719425e-07,
"loss": 0.0587,
"step": 1007
},
{
"epoch": 1.6941176470588235,
"grad_norm": 2.6033686025698515,
"learning_rate": 5.661315703996905e-07,
"loss": 0.0766,
"step": 1008
},
{
"epoch": 1.695798319327731,
"grad_norm": 2.543956882885212,
"learning_rate": 5.600459515543733e-07,
"loss": 0.065,
"step": 1009
},
{
"epoch": 1.6974789915966386,
"grad_norm": 3.650192626479405,
"learning_rate": 5.539912772500943e-07,
"loss": 0.0862,
"step": 1010
},
{
"epoch": 1.6991596638655462,
"grad_norm": 3.78077938487044,
"learning_rate": 5.47967589685292e-07,
"loss": 0.0892,
"step": 1011
},
{
"epoch": 1.7008403361344537,
"grad_norm": 2.7394887442627796,
"learning_rate": 5.419749308424377e-07,
"loss": 0.063,
"step": 1012
},
{
"epoch": 1.7025210084033613,
"grad_norm": 2.7156577152398538,
"learning_rate": 5.360133424877467e-07,
"loss": 0.0991,
"step": 1013
},
{
"epoch": 1.704201680672269,
"grad_norm": 3.596863078893733,
"learning_rate": 5.300828661708873e-07,
"loss": 0.124,
"step": 1014
},
{
"epoch": 1.7058823529411766,
"grad_norm": 2.702400026116516,
"learning_rate": 5.241835432246888e-07,
"loss": 0.0779,
"step": 1015
},
{
"epoch": 1.7075630252100842,
"grad_norm": 1.9594421326250522,
"learning_rate": 5.183154147648578e-07,
"loss": 0.0636,
"step": 1016
},
{
"epoch": 1.7092436974789917,
"grad_norm": 2.3716518580813006,
"learning_rate": 5.124785216896854e-07,
"loss": 0.065,
"step": 1017
},
{
"epoch": 1.7109243697478993,
"grad_norm": 1.8907181889521054,
"learning_rate": 5.066729046797692e-07,
"loss": 0.0473,
"step": 1018
},
{
"epoch": 1.7126050420168069,
"grad_norm": 2.4203847174896884,
"learning_rate": 5.008986041977254e-07,
"loss": 0.0544,
"step": 1019
},
{
"epoch": 1.7142857142857144,
"grad_norm": 3.053591851347738,
"learning_rate": 4.951556604879049e-07,
"loss": 0.084,
"step": 1020
},
{
"epoch": 1.715966386554622,
"grad_norm": 2.925477119081493,
"learning_rate": 4.894441135761197e-07,
"loss": 0.0962,
"step": 1021
},
{
"epoch": 1.7176470588235295,
"grad_norm": 2.2302715873385166,
"learning_rate": 4.837640032693558e-07,
"loss": 0.0966,
"step": 1022
},
{
"epoch": 1.719327731092437,
"grad_norm": 3.533404563017714,
"learning_rate": 4.781153691555035e-07,
"loss": 0.163,
"step": 1023
},
{
"epoch": 1.7210084033613446,
"grad_norm": 2.597578370940511,
"learning_rate": 4.724982506030762e-07,
"loss": 0.0756,
"step": 1024
},
{
"epoch": 1.7226890756302522,
"grad_norm": 3.1659295913472554,
"learning_rate": 4.669126867609375e-07,
"loss": 0.085,
"step": 1025
},
{
"epoch": 1.7243697478991598,
"grad_norm": 2.847957423266496,
"learning_rate": 4.613587165580269e-07,
"loss": 0.0917,
"step": 1026
},
{
"epoch": 1.7260504201680673,
"grad_norm": 2.4839081103193683,
"learning_rate": 4.5583637870309397e-07,
"loss": 0.0709,
"step": 1027
},
{
"epoch": 1.7277310924369749,
"grad_norm": 1.9119792986038078,
"learning_rate": 4.503457116844201e-07,
"loss": 0.0636,
"step": 1028
},
{
"epoch": 1.7294117647058824,
"grad_norm": 3.5996944839451395,
"learning_rate": 4.448867537695578e-07,
"loss": 0.0813,
"step": 1029
},
{
"epoch": 1.73109243697479,
"grad_norm": 1.7931626779021408,
"learning_rate": 4.394595430050613e-07,
"loss": 0.0553,
"step": 1030
},
{
"epoch": 1.7327731092436975,
"grad_norm": 2.3675109843091158,
"learning_rate": 4.34064117216218e-07,
"loss": 0.0555,
"step": 1031
},
{
"epoch": 1.734453781512605,
"grad_norm": 3.2733999141078143,
"learning_rate": 4.287005140067912e-07,
"loss": 0.1126,
"step": 1032
},
{
"epoch": 1.7361344537815127,
"grad_norm": 2.0302561800728944,
"learning_rate": 4.2336877075875136e-07,
"loss": 0.047,
"step": 1033
},
{
"epoch": 1.7378151260504202,
"grad_norm": 1.7878895517124131,
"learning_rate": 4.1806892463202353e-07,
"loss": 0.0403,
"step": 1034
},
{
"epoch": 1.7394957983193278,
"grad_norm": 2.517182395401572,
"learning_rate": 4.1280101256421903e-07,
"loss": 0.064,
"step": 1035
},
{
"epoch": 1.7411764705882353,
"grad_norm": 2.8819937038701005,
"learning_rate": 4.0756507127038494e-07,
"loss": 0.0839,
"step": 1036
},
{
"epoch": 1.7428571428571429,
"grad_norm": 2.382102175428615,
"learning_rate": 4.0236113724274716e-07,
"loss": 0.0583,
"step": 1037
},
{
"epoch": 1.7445378151260504,
"grad_norm": 2.9497304360347716,
"learning_rate": 3.971892467504518e-07,
"loss": 0.0689,
"step": 1038
},
{
"epoch": 1.746218487394958,
"grad_norm": 2.0320898349329886,
"learning_rate": 3.9204943583931953e-07,
"loss": 0.0682,
"step": 1039
},
{
"epoch": 1.7478991596638656,
"grad_norm": 3.004623695811176,
"learning_rate": 3.869417403315856e-07,
"loss": 0.0925,
"step": 1040
},
{
"epoch": 1.749579831932773,
"grad_norm": 2.747110324380183,
"learning_rate": 3.8186619582565974e-07,
"loss": 0.0586,
"step": 1041
},
{
"epoch": 1.7512605042016807,
"grad_norm": 2.094632480179336,
"learning_rate": 3.7682283769586883e-07,
"loss": 0.0538,
"step": 1042
},
{
"epoch": 1.7529411764705882,
"grad_norm": 2.7621486553756034,
"learning_rate": 3.71811701092219e-07,
"loss": 0.1123,
"step": 1043
},
{
"epoch": 1.7546218487394958,
"grad_norm": 2.7045998618097435,
"learning_rate": 3.6683282094014285e-07,
"loss": 0.0691,
"step": 1044
},
{
"epoch": 1.7563025210084033,
"grad_norm": 3.439936853763029,
"learning_rate": 3.6188623194026105e-07,
"loss": 0.085,
"step": 1045
},
{
"epoch": 1.757983193277311,
"grad_norm": 1.9441996285629553,
"learning_rate": 3.569719685681405e-07,
"loss": 0.0432,
"step": 1046
},
{
"epoch": 1.7596638655462185,
"grad_norm": 2.340946200287024,
"learning_rate": 3.5209006507404883e-07,
"loss": 0.0756,
"step": 1047
},
{
"epoch": 1.761344537815126,
"grad_norm": 1.868301185446278,
"learning_rate": 3.4724055548272406e-07,
"loss": 0.0589,
"step": 1048
},
{
"epoch": 1.7630252100840336,
"grad_norm": 2.7014525404333627,
"learning_rate": 3.4242347359312864e-07,
"loss": 0.0931,
"step": 1049
},
{
"epoch": 1.7647058823529411,
"grad_norm": 2.8475033114504877,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.0735,
"step": 1050
},
{
"epoch": 1.7663865546218487,
"grad_norm": 2.9891854484974125,
"learning_rate": 3.3288672698471804e-07,
"loss": 0.0852,
"step": 1051
},
{
"epoch": 1.7680672268907562,
"grad_norm": 2.542773456492228,
"learning_rate": 3.281671287328614e-07,
"loss": 0.0732,
"step": 1052
},
{
"epoch": 1.7697478991596638,
"grad_norm": 3.6329591813631508,
"learning_rate": 3.2348009111619227e-07,
"loss": 0.0985,
"step": 1053
},
{
"epoch": 1.7714285714285714,
"grad_norm": 2.007617360456515,
"learning_rate": 3.18825646801314e-07,
"loss": 0.0477,
"step": 1054
},
{
"epoch": 1.773109243697479,
"grad_norm": 2.8286776456999094,
"learning_rate": 3.1420382822767326e-07,
"loss": 0.0835,
"step": 1055
},
{
"epoch": 1.7747899159663865,
"grad_norm": 2.8970226798307066,
"learning_rate": 3.096146676073253e-07,
"loss": 0.0989,
"step": 1056
},
{
"epoch": 1.776470588235294,
"grad_norm": 2.2847322270579453,
"learning_rate": 3.0505819692471797e-07,
"loss": 0.0629,
"step": 1057
},
{
"epoch": 1.7781512605042016,
"grad_norm": 3.247777179885162,
"learning_rate": 3.0053444793646024e-07,
"loss": 0.1072,
"step": 1058
},
{
"epoch": 1.7798319327731091,
"grad_norm": 2.8398776729370447,
"learning_rate": 2.960434521711086e-07,
"loss": 0.0636,
"step": 1059
},
{
"epoch": 1.7815126050420167,
"grad_norm": 1.7198170726412092,
"learning_rate": 2.915852409289421e-07,
"loss": 0.0459,
"step": 1060
},
{
"epoch": 1.7831932773109243,
"grad_norm": 2.47365582662474,
"learning_rate": 2.8715984528174757e-07,
"loss": 0.0565,
"step": 1061
},
{
"epoch": 1.7848739495798318,
"grad_norm": 2.7202776621889004,
"learning_rate": 2.827672960725991e-07,
"loss": 0.0868,
"step": 1062
},
{
"epoch": 1.7865546218487394,
"grad_norm": 2.7847895905936837,
"learning_rate": 2.7840762391564634e-07,
"loss": 0.0817,
"step": 1063
},
{
"epoch": 1.788235294117647,
"grad_norm": 2.2785885981744243,
"learning_rate": 2.7408085919590265e-07,
"loss": 0.057,
"step": 1064
},
{
"epoch": 1.7899159663865545,
"grad_norm": 2.5726828881196315,
"learning_rate": 2.697870320690266e-07,
"loss": 0.0738,
"step": 1065
},
{
"epoch": 1.791596638655462,
"grad_norm": 2.4462470824133793,
"learning_rate": 2.6552617246111966e-07,
"loss": 0.067,
"step": 1066
},
{
"epoch": 1.7932773109243696,
"grad_norm": 2.1874531530519956,
"learning_rate": 2.612983100685118e-07,
"loss": 0.0572,
"step": 1067
},
{
"epoch": 1.7949579831932772,
"grad_norm": 3.148057488521974,
"learning_rate": 2.5710347435755955e-07,
"loss": 0.0768,
"step": 1068
},
{
"epoch": 1.7966386554621847,
"grad_norm": 2.2736618720698942,
"learning_rate": 2.5294169456443416e-07,
"loss": 0.0607,
"step": 1069
},
{
"epoch": 1.7983193277310925,
"grad_norm": 2.408408179557469,
"learning_rate": 2.4881299969492514e-07,
"loss": 0.0839,
"step": 1070
},
{
"epoch": 1.8,
"grad_norm": 2.7245275157909683,
"learning_rate": 2.447174185242324e-07,
"loss": 0.0963,
"step": 1071
},
{
"epoch": 1.8016806722689076,
"grad_norm": 2.546372677098468,
"learning_rate": 2.406549795967678e-07,
"loss": 0.0501,
"step": 1072
},
{
"epoch": 1.8033613445378152,
"grad_norm": 2.2774525189800703,
"learning_rate": 2.366257112259579e-07,
"loss": 0.0506,
"step": 1073
},
{
"epoch": 1.8050420168067227,
"grad_norm": 1.9454534628385145,
"learning_rate": 2.3262964149404322e-07,
"loss": 0.0698,
"step": 1074
},
{
"epoch": 1.8067226890756303,
"grad_norm": 2.7533389388257405,
"learning_rate": 2.286667982518853e-07,
"loss": 0.0828,
"step": 1075
},
{
"epoch": 1.8084033613445378,
"grad_norm": 2.3409625134750476,
"learning_rate": 2.247372091187705e-07,
"loss": 0.0657,
"step": 1076
},
{
"epoch": 1.8100840336134454,
"grad_norm": 3.118804130005249,
"learning_rate": 2.2084090148221937e-07,
"loss": 0.1111,
"step": 1077
},
{
"epoch": 1.811764705882353,
"grad_norm": 2.401067953654103,
"learning_rate": 2.1697790249779638e-07,
"loss": 0.0748,
"step": 1078
},
{
"epoch": 1.8134453781512605,
"grad_norm": 3.0291208574888193,
"learning_rate": 2.1314823908891558e-07,
"loss": 0.0889,
"step": 1079
},
{
"epoch": 1.815126050420168,
"grad_norm": 2.75949156227135,
"learning_rate": 2.093519379466602e-07,
"loss": 0.0776,
"step": 1080
},
{
"epoch": 1.8168067226890756,
"grad_norm": 2.00422109514553,
"learning_rate": 2.0558902552959058e-07,
"loss": 0.0598,
"step": 1081
},
{
"epoch": 1.8184873949579832,
"grad_norm": 3.509921644126103,
"learning_rate": 2.018595280635638e-07,
"loss": 0.077,
"step": 1082
},
{
"epoch": 1.8201680672268907,
"grad_norm": 2.395269012866003,
"learning_rate": 1.981634715415487e-07,
"loss": 0.0476,
"step": 1083
},
{
"epoch": 1.8218487394957983,
"grad_norm": 3.1367885506904933,
"learning_rate": 1.945008817234445e-07,
"loss": 0.0899,
"step": 1084
},
{
"epoch": 1.8235294117647058,
"grad_norm": 2.831809081450393,
"learning_rate": 1.908717841359048e-07,
"loss": 0.0669,
"step": 1085
},
{
"epoch": 1.8252100840336134,
"grad_norm": 2.5459432634484083,
"learning_rate": 1.8727620407215375e-07,
"loss": 0.0558,
"step": 1086
},
{
"epoch": 1.826890756302521,
"grad_norm": 3.489221037352137,
"learning_rate": 1.837141665918152e-07,
"loss": 0.0928,
"step": 1087
},
{
"epoch": 1.8285714285714287,
"grad_norm": 2.668603090786725,
"learning_rate": 1.801856965207338e-07,
"loss": 0.066,
"step": 1088
},
{
"epoch": 1.8302521008403363,
"grad_norm": 2.9412325741168868,
"learning_rate": 1.7669081845080648e-07,
"loss": 0.0781,
"step": 1089
},
{
"epoch": 1.8319327731092439,
"grad_norm": 2.5595453190415802,
"learning_rate": 1.7322955673980678e-07,
"loss": 0.0681,
"step": 1090
},
{
"epoch": 1.8336134453781514,
"grad_norm": 2.656909922356702,
"learning_rate": 1.6980193551121848e-07,
"loss": 0.0535,
"step": 1091
},
{
"epoch": 1.835294117647059,
"grad_norm": 2.7002698095093534,
"learning_rate": 1.664079786540629e-07,
"loss": 0.0642,
"step": 1092
},
{
"epoch": 1.8369747899159665,
"grad_norm": 2.9100825480724506,
"learning_rate": 1.6304770982273898e-07,
"loss": 0.0979,
"step": 1093
},
{
"epoch": 1.838655462184874,
"grad_norm": 2.5089073185708646,
"learning_rate": 1.597211524368536e-07,
"loss": 0.0695,
"step": 1094
},
{
"epoch": 1.8403361344537816,
"grad_norm": 2.9673819680310936,
"learning_rate": 1.564283296810576e-07,
"loss": 0.0826,
"step": 1095
},
{
"epoch": 1.8420168067226892,
"grad_norm": 2.4463338389394402,
"learning_rate": 1.5316926450488878e-07,
"loss": 0.0619,
"step": 1096
},
{
"epoch": 1.8436974789915967,
"grad_norm": 1.9871122803788783,
"learning_rate": 1.499439796226082e-07,
"loss": 0.0536,
"step": 1097
},
{
"epoch": 1.8453781512605043,
"grad_norm": 3.047793126703823,
"learning_rate": 1.4675249751304353e-07,
"loss": 0.07,
"step": 1098
},
{
"epoch": 1.8470588235294119,
"grad_norm": 2.573964395132922,
"learning_rate": 1.435948404194304e-07,
"loss": 0.0546,
"step": 1099
},
{
"epoch": 1.8487394957983194,
"grad_norm": 2.157568300113264,
"learning_rate": 1.404710303492618e-07,
"loss": 0.0521,
"step": 1100
},
{
"epoch": 1.850420168067227,
"grad_norm": 2.5093947332693434,
"learning_rate": 1.373810890741284e-07,
"loss": 0.0591,
"step": 1101
},
{
"epoch": 1.8521008403361345,
"grad_norm": 2.7798941234564527,
"learning_rate": 1.3432503812957242e-07,
"loss": 0.0861,
"step": 1102
},
{
"epoch": 1.853781512605042,
"grad_norm": 3.3151152408491953,
"learning_rate": 1.3130289881493452e-07,
"loss": 0.1133,
"step": 1103
},
{
"epoch": 1.8554621848739496,
"grad_norm": 2.455051483279099,
"learning_rate": 1.2831469219320603e-07,
"loss": 0.0784,
"step": 1104
},
{
"epoch": 1.8571428571428572,
"grad_norm": 3.389083633842711,
"learning_rate": 1.253604390908819e-07,
"loss": 0.1172,
"step": 1105
},
{
"epoch": 1.8588235294117648,
"grad_norm": 2.0890124129209404,
"learning_rate": 1.22440160097817e-07,
"loss": 0.0597,
"step": 1106
},
{
"epoch": 1.8605042016806723,
"grad_norm": 2.5001823487473014,
"learning_rate": 1.1955387556708e-07,
"loss": 0.0484,
"step": 1107
},
{
"epoch": 1.8621848739495799,
"grad_norm": 1.972553815528427,
"learning_rate": 1.1670160561481458e-07,
"loss": 0.057,
"step": 1108
},
{
"epoch": 1.8638655462184874,
"grad_norm": 2.247555606030712,
"learning_rate": 1.1388337012009643e-07,
"loss": 0.067,
"step": 1109
},
{
"epoch": 1.865546218487395,
"grad_norm": 3.3951419821687034,
"learning_rate": 1.1109918872479642e-07,
"loss": 0.1168,
"step": 1110
},
{
"epoch": 1.8672268907563025,
"grad_norm": 2.561792162365592,
"learning_rate": 1.0834908083344253e-07,
"loss": 0.0737,
"step": 1111
},
{
"epoch": 1.86890756302521,
"grad_norm": 2.924693392143974,
"learning_rate": 1.0563306561308773e-07,
"loss": 0.0672,
"step": 1112
},
{
"epoch": 1.8705882352941177,
"grad_norm": 3.0666803684218835,
"learning_rate": 1.0295116199317057e-07,
"loss": 0.0758,
"step": 1113
},
{
"epoch": 1.8722689075630252,
"grad_norm": 3.1132381273055203,
"learning_rate": 1.0030338866538925e-07,
"loss": 0.0912,
"step": 1114
},
{
"epoch": 1.8739495798319328,
"grad_norm": 2.193473585701426,
"learning_rate": 9.768976408356667e-08,
"loss": 0.0737,
"step": 1115
},
{
"epoch": 1.8756302521008403,
"grad_norm": 2.316436170911881,
"learning_rate": 9.511030646352615e-08,
"loss": 0.0785,
"step": 1116
},
{
"epoch": 1.877310924369748,
"grad_norm": 3.2232498371051066,
"learning_rate": 9.256503378295978e-08,
"loss": 0.1079,
"step": 1117
},
{
"epoch": 1.8789915966386554,
"grad_norm": 2.989879633737669,
"learning_rate": 9.005396378130748e-08,
"loss": 0.0683,
"step": 1118
},
{
"epoch": 1.880672268907563,
"grad_norm": 2.897217749538415,
"learning_rate": 8.757711395963097e-08,
"loss": 0.104,
"step": 1119
},
{
"epoch": 1.8823529411764706,
"grad_norm": 5.9925913526073185,
"learning_rate": 8.513450158049109e-08,
"loss": 0.0856,
"step": 1120
},
{
"epoch": 1.8840336134453781,
"grad_norm": 3.437858233576137,
"learning_rate": 8.27261436678306e-08,
"loss": 0.083,
"step": 1121
},
{
"epoch": 1.8857142857142857,
"grad_norm": 2.1904465686897754,
"learning_rate": 8.035205700685167e-08,
"loss": 0.0551,
"step": 1122
},
{
"epoch": 1.8873949579831932,
"grad_norm": 3.574848102142179,
"learning_rate": 7.801225814390245e-08,
"loss": 0.0891,
"step": 1123
},
{
"epoch": 1.8890756302521008,
"grad_norm": 2.9774864616230166,
"learning_rate": 7.570676338635896e-08,
"loss": 0.0842,
"step": 1124
},
{
"epoch": 1.8907563025210083,
"grad_norm": 2.9240546637759635,
"learning_rate": 7.343558880251289e-08,
"loss": 0.0836,
"step": 1125
},
{
"epoch": 1.892436974789916,
"grad_norm": 2.536237081972416,
"learning_rate": 7.11987502214595e-08,
"loss": 0.0611,
"step": 1126
},
{
"epoch": 1.8941176470588235,
"grad_norm": 2.945869104329601,
"learning_rate": 6.899626323298714e-08,
"loss": 0.0661,
"step": 1127
},
{
"epoch": 1.895798319327731,
"grad_norm": 2.2128300706474624,
"learning_rate": 6.682814318746844e-08,
"loss": 0.0558,
"step": 1128
},
{
"epoch": 1.8974789915966386,
"grad_norm": 2.469504977662401,
"learning_rate": 6.46944051957532e-08,
"loss": 0.0971,
"step": 1129
},
{
"epoch": 1.8991596638655461,
"grad_norm": 2.336659774072773,
"learning_rate": 6.259506412906402e-08,
"loss": 0.0639,
"step": 1130
},
{
"epoch": 1.9008403361344537,
"grad_norm": 2.916145176561295,
"learning_rate": 6.053013461889023e-08,
"loss": 0.0748,
"step": 1131
},
{
"epoch": 1.9025210084033612,
"grad_norm": 2.7261524069835796,
"learning_rate": 5.849963105689027e-08,
"loss": 0.0767,
"step": 1132
},
{
"epoch": 1.9042016806722688,
"grad_norm": 2.9983934151605105,
"learning_rate": 5.65035675947867e-08,
"loss": 0.0721,
"step": 1133
},
{
"epoch": 1.9058823529411764,
"grad_norm": 2.1467336761652343,
"learning_rate": 5.454195814427021e-08,
"loss": 0.0576,
"step": 1134
},
{
"epoch": 1.907563025210084,
"grad_norm": 2.2227378360842436,
"learning_rate": 5.261481637690247e-08,
"loss": 0.0647,
"step": 1135
},
{
"epoch": 1.9092436974789915,
"grad_norm": 2.4269755333288194,
"learning_rate": 5.072215572402006e-08,
"loss": 0.0419,
"step": 1136
},
{
"epoch": 1.910924369747899,
"grad_norm": 3.0533438647127533,
"learning_rate": 4.886398937664127e-08,
"loss": 0.0734,
"step": 1137
},
{
"epoch": 1.9126050420168066,
"grad_norm": 3.2788567900077497,
"learning_rate": 4.704033028537391e-08,
"loss": 0.0902,
"step": 1138
},
{
"epoch": 1.9142857142857141,
"grad_norm": 2.551953448087741,
"learning_rate": 4.52511911603265e-08,
"loss": 0.0827,
"step": 1139
},
{
"epoch": 1.9159663865546217,
"grad_norm": 4.78760807070443,
"learning_rate": 4.3496584471016125e-08,
"loss": 0.0627,
"step": 1140
},
{
"epoch": 1.9176470588235293,
"grad_norm": 2.721593978748799,
"learning_rate": 4.177652244628627e-08,
"loss": 0.0628,
"step": 1141
},
{
"epoch": 1.9193277310924368,
"grad_norm": 2.3037527032294656,
"learning_rate": 4.009101707421803e-08,
"loss": 0.0668,
"step": 1142
},
{
"epoch": 1.9210084033613444,
"grad_norm": 2.480126673107869,
"learning_rate": 3.8440080102047364e-08,
"loss": 0.0724,
"step": 1143
},
{
"epoch": 1.9226890756302522,
"grad_norm": 3.1491790650313236,
"learning_rate": 3.6823723036084616e-08,
"loss": 0.0667,
"step": 1144
},
{
"epoch": 1.9243697478991597,
"grad_norm": 3.297054116842271,
"learning_rate": 3.5241957141632923e-08,
"loss": 0.1002,
"step": 1145
},
{
"epoch": 1.9260504201680673,
"grad_norm": 3.5015254517965873,
"learning_rate": 3.369479344290938e-08,
"loss": 0.1012,
"step": 1146
},
{
"epoch": 1.9277310924369748,
"grad_norm": 2.2369857549858376,
"learning_rate": 3.218224272296955e-08,
"loss": 0.0449,
"step": 1147
},
{
"epoch": 1.9294117647058824,
"grad_norm": 2.2243368132738723,
"learning_rate": 3.0704315523631956e-08,
"loss": 0.0528,
"step": 1148
},
{
"epoch": 1.93109243697479,
"grad_norm": 2.104823571308308,
"learning_rate": 2.926102214540316e-08,
"loss": 0.0474,
"step": 1149
},
{
"epoch": 1.9327731092436975,
"grad_norm": 2.832894345048546,
"learning_rate": 2.7852372647407812e-08,
"loss": 0.0747,
"step": 1150
},
{
"epoch": 1.934453781512605,
"grad_norm": 4.335292022183732,
"learning_rate": 2.6478376847318687e-08,
"loss": 0.0696,
"step": 1151
},
{
"epoch": 1.9361344537815126,
"grad_norm": 2.697120477019712,
"learning_rate": 2.5139044321286223e-08,
"loss": 0.0791,
"step": 1152
},
{
"epoch": 1.9378151260504202,
"grad_norm": 2.54968758188515,
"learning_rate": 2.383438440387298e-08,
"loss": 0.0745,
"step": 1153
},
{
"epoch": 1.9394957983193277,
"grad_norm": 3.02832569342735,
"learning_rate": 2.256440618798872e-08,
"loss": 0.0975,
"step": 1154
},
{
"epoch": 1.9411764705882353,
"grad_norm": 3.387365620438066,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.0923,
"step": 1155
},
{
"epoch": 1.9428571428571428,
"grad_norm": 2.6767183811646413,
"learning_rate": 2.012853002380466e-08,
"loss": 0.0536,
"step": 1156
},
{
"epoch": 1.9445378151260504,
"grad_norm": 3.1445720922314337,
"learning_rate": 1.896264905249856e-08,
"loss": 0.1053,
"step": 1157
},
{
"epoch": 1.946218487394958,
"grad_norm": 3.2263045004178017,
"learning_rate": 1.783148373659005e-08,
"loss": 0.0798,
"step": 1158
},
{
"epoch": 1.9478991596638655,
"grad_norm": 2.176030962064042,
"learning_rate": 1.6735041959806686e-08,
"loss": 0.0512,
"step": 1159
},
{
"epoch": 1.949579831932773,
"grad_norm": 2.22081721828048,
"learning_rate": 1.567333136387017e-08,
"loss": 0.064,
"step": 1160
},
{
"epoch": 1.9512605042016806,
"grad_norm": 3.576068882581677,
"learning_rate": 1.4646359348439165e-08,
"loss": 0.1032,
"step": 1161
},
{
"epoch": 1.9529411764705882,
"grad_norm": 2.7548882247249846,
"learning_rate": 1.3654133071059894e-08,
"loss": 0.0958,
"step": 1162
},
{
"epoch": 1.954621848739496,
"grad_norm": 2.0537531997332277,
"learning_rate": 1.2696659447116732e-08,
"loss": 0.0555,
"step": 1163
},
{
"epoch": 1.9563025210084035,
"grad_norm": 2.576434155048857,
"learning_rate": 1.1773945149782805e-08,
"loss": 0.0681,
"step": 1164
},
{
"epoch": 1.957983193277311,
"grad_norm": 2.3313055510378513,
"learning_rate": 1.088599660997336e-08,
"loss": 0.0682,
"step": 1165
},
{
"epoch": 1.9596638655462186,
"grad_norm": 3.117362012224255,
"learning_rate": 1.0032820016302458e-08,
"loss": 0.0637,
"step": 1166
},
{
"epoch": 1.9613445378151262,
"grad_norm": 3.1378506905510215,
"learning_rate": 9.21442131503858e-09,
"loss": 0.0885,
"step": 1167
},
{
"epoch": 1.9630252100840337,
"grad_norm": 3.0313070424172217,
"learning_rate": 8.430806210062426e-09,
"loss": 0.0949,
"step": 1168
},
{
"epoch": 1.9647058823529413,
"grad_norm": 1.8993144925399335,
"learning_rate": 7.681980162830283e-09,
"loss": 0.0566,
"step": 1169
},
{
"epoch": 1.9663865546218489,
"grad_norm": 2.78106637596925,
"learning_rate": 6.9679483923318356e-09,
"loss": 0.0655,
"step": 1170
},
{
"epoch": 1.9680672268907564,
"grad_norm": 2.6179872514492843,
"learning_rate": 6.288715875057416e-09,
"loss": 0.0863,
"step": 1171
},
{
"epoch": 1.969747899159664,
"grad_norm": 2.85141007344782,
"learning_rate": 5.644287344960253e-09,
"loss": 0.0495,
"step": 1172
},
{
"epoch": 1.9714285714285715,
"grad_norm": 1.9037225399426012,
"learning_rate": 5.034667293427053e-09,
"loss": 0.0531,
"step": 1173
},
{
"epoch": 1.973109243697479,
"grad_norm": 2.8597867728725825,
"learning_rate": 4.45985996924192e-09,
"loss": 0.076,
"step": 1174
},
{
"epoch": 1.9747899159663866,
"grad_norm": 2.384741719443929,
"learning_rate": 3.919869378561925e-09,
"loss": 0.0781,
"step": 1175
},
{
"epoch": 1.9764705882352942,
"grad_norm": 3.3663742321659793,
"learning_rate": 3.41469928488547e-09,
"loss": 0.0829,
"step": 1176
},
{
"epoch": 1.9781512605042018,
"grad_norm": 2.5855062360633063,
"learning_rate": 2.9443532090273064e-09,
"loss": 0.068,
"step": 1177
},
{
"epoch": 1.9798319327731093,
"grad_norm": 2.349097606121586,
"learning_rate": 2.508834429094664e-09,
"loss": 0.0672,
"step": 1178
},
{
"epoch": 1.9815126050420169,
"grad_norm": 2.188284885883368,
"learning_rate": 2.108145980462828e-09,
"loss": 0.0587,
"step": 1179
},
{
"epoch": 1.9831932773109244,
"grad_norm": 2.603925500773801,
"learning_rate": 1.7422906557557074e-09,
"loss": 0.0898,
"step": 1180
},
{
"epoch": 1.984873949579832,
"grad_norm": 2.072728888868806,
"learning_rate": 1.4112710048247436e-09,
"loss": 0.0689,
"step": 1181
},
{
"epoch": 1.9865546218487395,
"grad_norm": 2.2130213181251284,
"learning_rate": 1.1150893347328107e-09,
"loss": 0.0537,
"step": 1182
},
{
"epoch": 1.988235294117647,
"grad_norm": 2.2290325180753983,
"learning_rate": 8.537477097364522e-10,
"loss": 0.0739,
"step": 1183
},
{
"epoch": 1.9899159663865547,
"grad_norm": 2.915171945856223,
"learning_rate": 6.272479512731133e-10,
"loss": 0.0693,
"step": 1184
},
{
"epoch": 1.9915966386554622,
"grad_norm": 3.387630014023736,
"learning_rate": 4.3559163794670844e-10,
"loss": 0.097,
"step": 1185
},
{
"epoch": 1.9932773109243698,
"grad_norm": 2.778376156187998,
"learning_rate": 2.787801055181838e-10,
"loss": 0.0861,
"step": 1186
},
{
"epoch": 1.9949579831932773,
"grad_norm": 2.6278202477958623,
"learning_rate": 1.568144468955257e-10,
"loss": 0.0779,
"step": 1187
},
{
"epoch": 1.9966386554621849,
"grad_norm": 2.434531776499178,
"learning_rate": 6.969551212598901e-11,
"loss": 0.0615,
"step": 1188
},
{
"epoch": 1.9983193277310924,
"grad_norm": 2.7044583414113026,
"learning_rate": 1.7423908390545862e-11,
"loss": 0.0862,
"step": 1189
},
{
"epoch": 2.0,
"grad_norm": 1.9872586148824984,
"learning_rate": 0.0,
"loss": 0.0517,
"step": 1190
},
{
"epoch": 2.0,
"step": 1190,
"total_flos": 6634871193600.0,
"train_loss": 0.17354622093819772,
"train_runtime": 2680.2801,
"train_samples_per_second": 3.549,
"train_steps_per_second": 0.444
}
],
"logging_steps": 1,
"max_steps": 1190,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6634871193600.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}