{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1856, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005387931034482759, "grad_norm": 24.06527582915772, "learning_rate": 5.376344086021506e-08, "loss": 1.3568, "step": 1 }, { "epoch": 0.0026939655172413795, "grad_norm": 23.2847675267083, "learning_rate": 2.688172043010753e-07, "loss": 1.3668, "step": 5 }, { "epoch": 0.005387931034482759, "grad_norm": 16.195930738756566, "learning_rate": 5.376344086021506e-07, "loss": 1.3204, "step": 10 }, { "epoch": 0.008081896551724138, "grad_norm": 12.068298869370592, "learning_rate": 8.064516129032258e-07, "loss": 1.153, "step": 15 }, { "epoch": 0.010775862068965518, "grad_norm": 8.564123494535863, "learning_rate": 1.0752688172043011e-06, "loss": 1.0452, "step": 20 }, { "epoch": 0.013469827586206896, "grad_norm": 3.533789309391932, "learning_rate": 1.3440860215053765e-06, "loss": 0.9515, "step": 25 }, { "epoch": 0.016163793103448277, "grad_norm": 3.24461197562523, "learning_rate": 1.6129032258064516e-06, "loss": 0.9001, "step": 30 }, { "epoch": 0.018857758620689655, "grad_norm": 2.990611660406535, "learning_rate": 1.881720430107527e-06, "loss": 0.8773, "step": 35 }, { "epoch": 0.021551724137931036, "grad_norm": 3.0063853939062346, "learning_rate": 2.1505376344086023e-06, "loss": 0.851, "step": 40 }, { "epoch": 0.024245689655172414, "grad_norm": 2.956366561006899, "learning_rate": 2.4193548387096776e-06, "loss": 0.8574, "step": 45 }, { "epoch": 0.02693965517241379, "grad_norm": 2.983398789032246, "learning_rate": 2.688172043010753e-06, "loss": 0.84, "step": 50 }, { "epoch": 0.029633620689655173, "grad_norm": 2.964731632227324, "learning_rate": 2.9569892473118283e-06, "loss": 0.824, "step": 55 }, { "epoch": 0.032327586206896554, "grad_norm": 2.9208803498660623, "learning_rate": 3.225806451612903e-06, "loss": 0.8138, "step": 60 }, { "epoch": 0.03502155172413793, "grad_norm": 3.2063303145455366, "learning_rate": 3.494623655913979e-06, "loss": 0.8009, "step": 65 }, { "epoch": 0.03771551724137931, "grad_norm": 3.242653708652505, "learning_rate": 3.763440860215054e-06, "loss": 0.792, "step": 70 }, { "epoch": 0.04040948275862069, "grad_norm": 3.1462448663803846, "learning_rate": 4.032258064516129e-06, "loss": 0.7902, "step": 75 }, { "epoch": 0.04310344827586207, "grad_norm": 3.0229975986392716, "learning_rate": 4.3010752688172045e-06, "loss": 0.7699, "step": 80 }, { "epoch": 0.045797413793103446, "grad_norm": 3.12423094671722, "learning_rate": 4.56989247311828e-06, "loss": 0.7644, "step": 85 }, { "epoch": 0.04849137931034483, "grad_norm": 3.2796596768473902, "learning_rate": 4.838709677419355e-06, "loss": 0.7712, "step": 90 }, { "epoch": 0.05118534482758621, "grad_norm": 3.0184242042359943, "learning_rate": 5.1075268817204305e-06, "loss": 0.7546, "step": 95 }, { "epoch": 0.05387931034482758, "grad_norm": 3.0881392753326447, "learning_rate": 5.376344086021506e-06, "loss": 0.7487, "step": 100 }, { "epoch": 0.056573275862068964, "grad_norm": 3.4110841994799657, "learning_rate": 5.645161290322582e-06, "loss": 0.7496, "step": 105 }, { "epoch": 0.059267241379310345, "grad_norm": 2.92733810047956, "learning_rate": 5.9139784946236566e-06, "loss": 0.7368, "step": 110 }, { "epoch": 0.06196120689655173, "grad_norm": 3.3139008810992046, "learning_rate": 6.182795698924732e-06, "loss": 0.7277, "step": 115 }, { "epoch": 0.06465517241379311, "grad_norm": 3.1747479144288455, "learning_rate": 6.451612903225806e-06, "loss": 0.7283, "step": 120 }, { "epoch": 0.06734913793103449, "grad_norm": 2.894519107469561, "learning_rate": 6.720430107526882e-06, "loss": 0.7282, "step": 125 }, { "epoch": 0.07004310344827586, "grad_norm": 2.8405180587913987, "learning_rate": 6.989247311827958e-06, "loss": 0.7123, "step": 130 }, { "epoch": 0.07273706896551724, "grad_norm": 2.7948188759602717, "learning_rate": 7.258064516129033e-06, "loss": 0.7193, "step": 135 }, { "epoch": 0.07543103448275862, "grad_norm": 3.154756842274138, "learning_rate": 7.526881720430108e-06, "loss": 0.7207, "step": 140 }, { "epoch": 0.078125, "grad_norm": 2.9457108929499207, "learning_rate": 7.795698924731183e-06, "loss": 0.7212, "step": 145 }, { "epoch": 0.08081896551724138, "grad_norm": 2.8503644648477517, "learning_rate": 8.064516129032258e-06, "loss": 0.72, "step": 150 }, { "epoch": 0.08351293103448276, "grad_norm": 2.949964251276019, "learning_rate": 8.333333333333334e-06, "loss": 0.723, "step": 155 }, { "epoch": 0.08620689655172414, "grad_norm": 2.959116036250926, "learning_rate": 8.602150537634409e-06, "loss": 0.7158, "step": 160 }, { "epoch": 0.08890086206896551, "grad_norm": 2.7803395603035517, "learning_rate": 8.870967741935484e-06, "loss": 0.7067, "step": 165 }, { "epoch": 0.09159482758620689, "grad_norm": 2.8799202670097115, "learning_rate": 9.13978494623656e-06, "loss": 0.71, "step": 170 }, { "epoch": 0.09428879310344827, "grad_norm": 2.9537594310040687, "learning_rate": 9.408602150537635e-06, "loss": 0.7152, "step": 175 }, { "epoch": 0.09698275862068965, "grad_norm": 2.8628517050727873, "learning_rate": 9.67741935483871e-06, "loss": 0.7054, "step": 180 }, { "epoch": 0.09967672413793104, "grad_norm": 2.8896943288351586, "learning_rate": 9.946236559139786e-06, "loss": 0.7235, "step": 185 }, { "epoch": 0.10237068965517242, "grad_norm": 2.938518709851193, "learning_rate": 9.999858445152838e-06, "loss": 0.7122, "step": 190 }, { "epoch": 0.1050646551724138, "grad_norm": 2.58690085015114, "learning_rate": 9.999283392323047e-06, "loss": 0.7061, "step": 195 }, { "epoch": 0.10775862068965517, "grad_norm": 2.763129396160507, "learning_rate": 9.998266045169356e-06, "loss": 0.7063, "step": 200 }, { "epoch": 0.11045258620689655, "grad_norm": 2.816275952414151, "learning_rate": 9.996806493698038e-06, "loss": 0.7087, "step": 205 }, { "epoch": 0.11314655172413793, "grad_norm": 2.73738463168911, "learning_rate": 9.994904867037867e-06, "loss": 0.6986, "step": 210 }, { "epoch": 0.11584051724137931, "grad_norm": 2.810575578616004, "learning_rate": 9.99256133342869e-06, "loss": 0.6929, "step": 215 }, { "epoch": 0.11853448275862069, "grad_norm": 2.6652685941669265, "learning_rate": 9.989776100206547e-06, "loss": 0.6898, "step": 220 }, { "epoch": 0.12122844827586207, "grad_norm": 2.7660230194471107, "learning_rate": 9.986549413785323e-06, "loss": 0.695, "step": 225 }, { "epoch": 0.12392241379310345, "grad_norm": 2.5553942202252466, "learning_rate": 9.982881559634946e-06, "loss": 0.7017, "step": 230 }, { "epoch": 0.12661637931034483, "grad_norm": 2.5245345530966192, "learning_rate": 9.978772862256145e-06, "loss": 0.6916, "step": 235 }, { "epoch": 0.12931034482758622, "grad_norm": 2.520167957976126, "learning_rate": 9.97422368515172e-06, "loss": 0.694, "step": 240 }, { "epoch": 0.1320043103448276, "grad_norm": 2.7125840301494706, "learning_rate": 9.969234430794395e-06, "loss": 0.6887, "step": 245 }, { "epoch": 0.13469827586206898, "grad_norm": 2.631424447595556, "learning_rate": 9.96380554059121e-06, "loss": 0.685, "step": 250 }, { "epoch": 0.13739224137931033, "grad_norm": 2.555021040773695, "learning_rate": 9.957937494844472e-06, "loss": 0.7004, "step": 255 }, { "epoch": 0.1400862068965517, "grad_norm": 2.539978410855113, "learning_rate": 9.951630812709245e-06, "loss": 0.6897, "step": 260 }, { "epoch": 0.1427801724137931, "grad_norm": 2.7494174109330842, "learning_rate": 9.944886052147445e-06, "loss": 0.6928, "step": 265 }, { "epoch": 0.14547413793103448, "grad_norm": 2.559956756758314, "learning_rate": 9.937703809878455e-06, "loss": 0.6813, "step": 270 }, { "epoch": 0.14816810344827586, "grad_norm": 2.525562445581053, "learning_rate": 9.930084721326342e-06, "loss": 0.6944, "step": 275 }, { "epoch": 0.15086206896551724, "grad_norm": 2.777619881263396, "learning_rate": 9.92202946056364e-06, "loss": 0.6745, "step": 280 }, { "epoch": 0.15355603448275862, "grad_norm": 2.4859789362282076, "learning_rate": 9.913538740251711e-06, "loss": 0.6527, "step": 285 }, { "epoch": 0.15625, "grad_norm": 2.4614571056065624, "learning_rate": 9.904613311577696e-06, "loss": 0.6673, "step": 290 }, { "epoch": 0.15894396551724138, "grad_norm": 2.503690727361147, "learning_rate": 9.895253964188056e-06, "loss": 0.6601, "step": 295 }, { "epoch": 0.16163793103448276, "grad_norm": 2.61491684131174, "learning_rate": 9.885461526118713e-06, "loss": 0.6629, "step": 300 }, { "epoch": 0.16433189655172414, "grad_norm": 2.563289578189323, "learning_rate": 9.875236863721788e-06, "loss": 0.6834, "step": 305 }, { "epoch": 0.16702586206896552, "grad_norm": 2.542961491155676, "learning_rate": 9.864580881588958e-06, "loss": 0.6634, "step": 310 }, { "epoch": 0.1697198275862069, "grad_norm": 2.5998608415854774, "learning_rate": 9.853494522471423e-06, "loss": 0.6564, "step": 315 }, { "epoch": 0.1724137931034483, "grad_norm": 2.580998138867243, "learning_rate": 9.841978767196495e-06, "loss": 0.6522, "step": 320 }, { "epoch": 0.17510775862068967, "grad_norm": 2.462022076166109, "learning_rate": 9.830034634580833e-06, "loss": 0.6575, "step": 325 }, { "epoch": 0.17780172413793102, "grad_norm": 2.641866987114795, "learning_rate": 9.8176631813403e-06, "loss": 0.6654, "step": 330 }, { "epoch": 0.1804956896551724, "grad_norm": 2.483224928563204, "learning_rate": 9.804865501996472e-06, "loss": 0.6687, "step": 335 }, { "epoch": 0.18318965517241378, "grad_norm": 2.6158710388060755, "learning_rate": 9.79164272877981e-06, "loss": 0.6606, "step": 340 }, { "epoch": 0.18588362068965517, "grad_norm": 2.6690109052148396, "learning_rate": 9.777996031529486e-06, "loss": 0.6587, "step": 345 }, { "epoch": 0.18857758620689655, "grad_norm": 2.5145797557443403, "learning_rate": 9.763926617589883e-06, "loss": 0.6455, "step": 350 }, { "epoch": 0.19127155172413793, "grad_norm": 2.34228188842774, "learning_rate": 9.749435731703786e-06, "loss": 0.6467, "step": 355 }, { "epoch": 0.1939655172413793, "grad_norm": 2.518236951767628, "learning_rate": 9.734524655902253e-06, "loss": 0.6651, "step": 360 }, { "epoch": 0.1966594827586207, "grad_norm": 2.3327366524820423, "learning_rate": 9.719194709391191e-06, "loss": 0.6527, "step": 365 }, { "epoch": 0.19935344827586207, "grad_norm": 2.6721928236725425, "learning_rate": 9.70344724843465e-06, "loss": 0.6471, "step": 370 }, { "epoch": 0.20204741379310345, "grad_norm": 2.512497207087126, "learning_rate": 9.687283666234823e-06, "loss": 0.6345, "step": 375 }, { "epoch": 0.20474137931034483, "grad_norm": 2.5381248307269106, "learning_rate": 9.670705392808796e-06, "loss": 0.6549, "step": 380 }, { "epoch": 0.20743534482758622, "grad_norm": 2.489609282435604, "learning_rate": 9.653713894862024e-06, "loss": 0.6287, "step": 385 }, { "epoch": 0.2101293103448276, "grad_norm": 2.4187969624820767, "learning_rate": 9.63631067565858e-06, "loss": 0.6372, "step": 390 }, { "epoch": 0.21282327586206898, "grad_norm": 2.378128543534024, "learning_rate": 9.618497274888147e-06, "loss": 0.6344, "step": 395 }, { "epoch": 0.21551724137931033, "grad_norm": 2.3554799136699383, "learning_rate": 9.600275268529809e-06, "loss": 0.632, "step": 400 }, { "epoch": 0.2182112068965517, "grad_norm": 2.9669359679831437, "learning_rate": 9.58164626871261e-06, "loss": 0.6409, "step": 405 }, { "epoch": 0.2209051724137931, "grad_norm": 2.510424077340063, "learning_rate": 9.562611923572944e-06, "loss": 0.6316, "step": 410 }, { "epoch": 0.22359913793103448, "grad_norm": 2.5067266793187843, "learning_rate": 9.543173917108725e-06, "loss": 0.6337, "step": 415 }, { "epoch": 0.22629310344827586, "grad_norm": 2.4014165442615627, "learning_rate": 9.523333969030413e-06, "loss": 0.6285, "step": 420 }, { "epoch": 0.22898706896551724, "grad_norm": 2.5503305669006266, "learning_rate": 9.503093834608856e-06, "loss": 0.6297, "step": 425 }, { "epoch": 0.23168103448275862, "grad_norm": 2.683370610867663, "learning_rate": 9.482455304520013e-06, "loss": 0.6222, "step": 430 }, { "epoch": 0.234375, "grad_norm": 2.3415254501274156, "learning_rate": 9.46142020468652e-06, "loss": 0.6181, "step": 435 }, { "epoch": 0.23706896551724138, "grad_norm": 2.4296203317167513, "learning_rate": 9.439990396116149e-06, "loss": 0.6191, "step": 440 }, { "epoch": 0.23976293103448276, "grad_norm": 2.4277540188724833, "learning_rate": 9.418167774737173e-06, "loss": 0.6218, "step": 445 }, { "epoch": 0.24245689655172414, "grad_norm": 2.594904022170311, "learning_rate": 9.395954271230606e-06, "loss": 0.622, "step": 450 }, { "epoch": 0.24515086206896552, "grad_norm": 2.347098862192039, "learning_rate": 9.373351850859417e-06, "loss": 0.6136, "step": 455 }, { "epoch": 0.2478448275862069, "grad_norm": 2.3928008650888204, "learning_rate": 9.350362513294652e-06, "loss": 0.6272, "step": 460 }, { "epoch": 0.2505387931034483, "grad_norm": 2.335542398750826, "learning_rate": 9.326988292438514e-06, "loss": 0.6245, "step": 465 }, { "epoch": 0.25323275862068967, "grad_norm": 2.3458410101982174, "learning_rate": 9.30323125624443e-06, "loss": 0.6176, "step": 470 }, { "epoch": 0.25592672413793105, "grad_norm": 2.5491037378725188, "learning_rate": 9.279093506534085e-06, "loss": 0.6039, "step": 475 }, { "epoch": 0.25862068965517243, "grad_norm": 2.35768113596503, "learning_rate": 9.254577178811482e-06, "loss": 0.6062, "step": 480 }, { "epoch": 0.2613146551724138, "grad_norm": 2.4427975704018072, "learning_rate": 9.229684442074005e-06, "loss": 0.6038, "step": 485 }, { "epoch": 0.2640086206896552, "grad_norm": 2.3518303928123183, "learning_rate": 9.204417498620521e-06, "loss": 0.6071, "step": 490 }, { "epoch": 0.2667025862068966, "grad_norm": 2.3978894249163285, "learning_rate": 9.178778583856552e-06, "loss": 0.6024, "step": 495 }, { "epoch": 0.26939655172413796, "grad_norm": 2.530047013657598, "learning_rate": 9.152769966096483e-06, "loss": 0.6028, "step": 500 }, { "epoch": 0.27209051724137934, "grad_norm": 2.4123317555719708, "learning_rate": 9.126393946362906e-06, "loss": 0.6083, "step": 505 }, { "epoch": 0.27478448275862066, "grad_norm": 2.4793056830777753, "learning_rate": 9.099652858183027e-06, "loss": 0.6051, "step": 510 }, { "epoch": 0.27747844827586204, "grad_norm": 2.372688897527012, "learning_rate": 9.072549067382225e-06, "loss": 0.6157, "step": 515 }, { "epoch": 0.2801724137931034, "grad_norm": 2.380240348074666, "learning_rate": 9.045084971874738e-06, "loss": 0.6073, "step": 520 }, { "epoch": 0.2828663793103448, "grad_norm": 2.545807161286919, "learning_rate": 9.017263001451518e-06, "loss": 0.5884, "step": 525 }, { "epoch": 0.2855603448275862, "grad_norm": 2.5935659051260824, "learning_rate": 8.989085617565261e-06, "loss": 0.5983, "step": 530 }, { "epoch": 0.28825431034482757, "grad_norm": 2.2548884783469836, "learning_rate": 8.960555313112646e-06, "loss": 0.5895, "step": 535 }, { "epoch": 0.29094827586206895, "grad_norm": 2.3534621434136533, "learning_rate": 8.93167461221378e-06, "loss": 0.5914, "step": 540 }, { "epoch": 0.29364224137931033, "grad_norm": 2.5336260688373495, "learning_rate": 8.902446069988878e-06, "loss": 0.5939, "step": 545 }, { "epoch": 0.2963362068965517, "grad_norm": 2.624683890197873, "learning_rate": 8.87287227233222e-06, "loss": 0.5836, "step": 550 }, { "epoch": 0.2990301724137931, "grad_norm": 2.3588318708883604, "learning_rate": 8.842955835683368e-06, "loss": 0.5786, "step": 555 }, { "epoch": 0.3017241379310345, "grad_norm": 2.501675897313923, "learning_rate": 8.812699406795683e-06, "loss": 0.5799, "step": 560 }, { "epoch": 0.30441810344827586, "grad_norm": 2.6078839400922424, "learning_rate": 8.78210566250216e-06, "loss": 0.5801, "step": 565 }, { "epoch": 0.30711206896551724, "grad_norm": 2.3496389383543135, "learning_rate": 8.751177309478618e-06, "loss": 0.5756, "step": 570 }, { "epoch": 0.3098060344827586, "grad_norm": 2.3002443057548727, "learning_rate": 8.71991708400422e-06, "loss": 0.5823, "step": 575 }, { "epoch": 0.3125, "grad_norm": 2.368311996486066, "learning_rate": 8.688327751719403e-06, "loss": 0.57, "step": 580 }, { "epoch": 0.3151939655172414, "grad_norm": 2.316476591326147, "learning_rate": 8.656412107381187e-06, "loss": 0.572, "step": 585 }, { "epoch": 0.31788793103448276, "grad_norm": 2.648056237571166, "learning_rate": 8.624172974615926e-06, "loss": 0.5759, "step": 590 }, { "epoch": 0.32058189655172414, "grad_norm": 2.5273275022283035, "learning_rate": 8.591613205669494e-06, "loss": 0.5751, "step": 595 }, { "epoch": 0.3232758620689655, "grad_norm": 2.3674743965920433, "learning_rate": 8.558735681154944e-06, "loss": 0.5525, "step": 600 }, { "epoch": 0.3259698275862069, "grad_norm": 2.334754085556647, "learning_rate": 8.525543309797653e-06, "loss": 0.5501, "step": 605 }, { "epoch": 0.3286637931034483, "grad_norm": 2.511690588702945, "learning_rate": 8.492039028177985e-06, "loss": 0.5703, "step": 610 }, { "epoch": 0.33135775862068967, "grad_norm": 2.41344799771138, "learning_rate": 8.458225800471492e-06, "loss": 0.5674, "step": 615 }, { "epoch": 0.33405172413793105, "grad_norm": 2.274991518802859, "learning_rate": 8.424106618186653e-06, "loss": 0.568, "step": 620 }, { "epoch": 0.33674568965517243, "grad_norm": 2.2914893865907375, "learning_rate": 8.389684499900231e-06, "loss": 0.5578, "step": 625 }, { "epoch": 0.3394396551724138, "grad_norm": 2.2271331744770175, "learning_rate": 8.354962490990202e-06, "loss": 0.554, "step": 630 }, { "epoch": 0.3421336206896552, "grad_norm": 2.346436964348071, "learning_rate": 8.319943663366325e-06, "loss": 0.5623, "step": 635 }, { "epoch": 0.3448275862068966, "grad_norm": 2.2365182629879707, "learning_rate": 8.284631115198371e-06, "loss": 0.5534, "step": 640 }, { "epoch": 0.34752155172413796, "grad_norm": 2.461241222937466, "learning_rate": 8.24902797064203e-06, "loss": 0.5564, "step": 645 }, { "epoch": 0.35021551724137934, "grad_norm": 2.442140982131872, "learning_rate": 8.213137379562486e-06, "loss": 0.5506, "step": 650 }, { "epoch": 0.35290948275862066, "grad_norm": 2.388325267487531, "learning_rate": 8.176962517255776e-06, "loss": 0.5531, "step": 655 }, { "epoch": 0.35560344827586204, "grad_norm": 2.398524248781268, "learning_rate": 8.140506584167845e-06, "loss": 0.5415, "step": 660 }, { "epoch": 0.3582974137931034, "grad_norm": 2.566763693618945, "learning_rate": 8.103772805611403e-06, "loss": 0.5616, "step": 665 }, { "epoch": 0.3609913793103448, "grad_norm": 2.3106768834034805, "learning_rate": 8.066764431480584e-06, "loss": 0.5328, "step": 670 }, { "epoch": 0.3636853448275862, "grad_norm": 2.2940366514378425, "learning_rate": 8.029484735963409e-06, "loss": 0.5452, "step": 675 }, { "epoch": 0.36637931034482757, "grad_norm": 2.4096028111246652, "learning_rate": 7.991937017252127e-06, "loss": 0.5448, "step": 680 }, { "epoch": 0.36907327586206895, "grad_norm": 2.450510234216877, "learning_rate": 7.95412459725141e-06, "loss": 0.5407, "step": 685 }, { "epoch": 0.37176724137931033, "grad_norm": 2.498635611862816, "learning_rate": 7.916050821284462e-06, "loss": 0.536, "step": 690 }, { "epoch": 0.3744612068965517, "grad_norm": 2.3384557737181306, "learning_rate": 7.877719057797055e-06, "loss": 0.5404, "step": 695 }, { "epoch": 0.3771551724137931, "grad_norm": 2.395634299723523, "learning_rate": 7.839132698059515e-06, "loss": 0.5469, "step": 700 }, { "epoch": 0.3798491379310345, "grad_norm": 2.528299315994187, "learning_rate": 7.800295155866688e-06, "loss": 0.5272, "step": 705 }, { "epoch": 0.38254310344827586, "grad_norm": 2.383516192036904, "learning_rate": 7.761209867235924e-06, "loss": 0.5495, "step": 710 }, { "epoch": 0.38523706896551724, "grad_norm": 2.3221638101603954, "learning_rate": 7.721880290103082e-06, "loss": 0.5517, "step": 715 }, { "epoch": 0.3879310344827586, "grad_norm": 2.451275702370551, "learning_rate": 7.6823099040166e-06, "loss": 0.5195, "step": 720 }, { "epoch": 0.390625, "grad_norm": 2.469988525493039, "learning_rate": 7.64250220982966e-06, "loss": 0.5151, "step": 725 }, { "epoch": 0.3933189655172414, "grad_norm": 2.4698654498618016, "learning_rate": 7.602460729390455e-06, "loss": 0.5296, "step": 730 }, { "epoch": 0.39601293103448276, "grad_norm": 2.433689149450146, "learning_rate": 7.562189005230609e-06, "loss": 0.5122, "step": 735 }, { "epoch": 0.39870689655172414, "grad_norm": 2.317764828643439, "learning_rate": 7.521690600251765e-06, "loss": 0.5389, "step": 740 }, { "epoch": 0.4014008620689655, "grad_norm": 2.3785211168925997, "learning_rate": 7.480969097410369e-06, "loss": 0.5342, "step": 745 }, { "epoch": 0.4040948275862069, "grad_norm": 2.352268614869421, "learning_rate": 7.4400280994006765e-06, "loss": 0.5222, "step": 750 }, { "epoch": 0.4067887931034483, "grad_norm": 2.3334817294609844, "learning_rate": 7.398871228336022e-06, "loss": 0.5148, "step": 755 }, { "epoch": 0.40948275862068967, "grad_norm": 2.2180745679186513, "learning_rate": 7.357502125428359e-06, "loss": 0.5269, "step": 760 }, { "epoch": 0.41217672413793105, "grad_norm": 2.4024098190438448, "learning_rate": 7.315924450666129e-06, "loss": 0.5252, "step": 765 }, { "epoch": 0.41487068965517243, "grad_norm": 2.4847050155908326, "learning_rate": 7.274141882490435e-06, "loss": 0.5215, "step": 770 }, { "epoch": 0.4175646551724138, "grad_norm": 2.3489603723016423, "learning_rate": 7.23215811746963e-06, "loss": 0.5331, "step": 775 }, { "epoch": 0.4202586206896552, "grad_norm": 2.3846378852084276, "learning_rate": 7.189976869972249e-06, "loss": 0.526, "step": 780 }, { "epoch": 0.4229525862068966, "grad_norm": 2.2721960920466087, "learning_rate": 7.147601871838419e-06, "loss": 0.5111, "step": 785 }, { "epoch": 0.42564655172413796, "grad_norm": 2.242972711736404, "learning_rate": 7.105036872049676e-06, "loss": 0.5079, "step": 790 }, { "epoch": 0.42834051724137934, "grad_norm": 2.5168627834860944, "learning_rate": 7.0622856363973e-06, "loss": 0.5037, "step": 795 }, { "epoch": 0.43103448275862066, "grad_norm": 2.3034024680284797, "learning_rate": 7.019351947149149e-06, "loss": 0.5037, "step": 800 }, { "epoch": 0.43372844827586204, "grad_norm": 2.3169182311354204, "learning_rate": 6.976239602715025e-06, "loss": 0.5244, "step": 805 }, { "epoch": 0.4364224137931034, "grad_norm": 2.342523099764779, "learning_rate": 6.932952417310634e-06, "loss": 0.4955, "step": 810 }, { "epoch": 0.4391163793103448, "grad_norm": 2.4079674615936213, "learning_rate": 6.889494220620135e-06, "loss": 0.5039, "step": 815 }, { "epoch": 0.4418103448275862, "grad_norm": 2.2705187143965704, "learning_rate": 6.8458688574573164e-06, "loss": 0.4921, "step": 820 }, { "epoch": 0.44450431034482757, "grad_norm": 2.3040634798061053, "learning_rate": 6.8020801874254425e-06, "loss": 0.4952, "step": 825 }, { "epoch": 0.44719827586206895, "grad_norm": 2.283780585980132, "learning_rate": 6.758132084575791e-06, "loss": 0.5204, "step": 830 }, { "epoch": 0.44989224137931033, "grad_norm": 2.2311658006536175, "learning_rate": 6.7140284370649015e-06, "loss": 0.5062, "step": 835 }, { "epoch": 0.4525862068965517, "grad_norm": 2.381000659447914, "learning_rate": 6.6697731468105985e-06, "loss": 0.5054, "step": 840 }, { "epoch": 0.4552801724137931, "grad_norm": 2.5645822620698295, "learning_rate": 6.625370129146771e-06, "loss": 0.4967, "step": 845 }, { "epoch": 0.4579741379310345, "grad_norm": 2.518018472550615, "learning_rate": 6.580823312476976e-06, "loss": 0.5057, "step": 850 }, { "epoch": 0.46066810344827586, "grad_norm": 2.3310109009449937, "learning_rate": 6.536136637926898e-06, "loss": 0.4923, "step": 855 }, { "epoch": 0.46336206896551724, "grad_norm": 2.4572949530360235, "learning_rate": 6.491314058995653e-06, "loss": 0.4923, "step": 860 }, { "epoch": 0.4660560344827586, "grad_norm": 2.333469399501826, "learning_rate": 6.446359541206042e-06, "loss": 0.4984, "step": 865 }, { "epoch": 0.46875, "grad_norm": 2.3170414009513287, "learning_rate": 6.401277061753689e-06, "loss": 0.4805, "step": 870 }, { "epoch": 0.4714439655172414, "grad_norm": 2.3105233267502068, "learning_rate": 6.356070609155188e-06, "loss": 0.4857, "step": 875 }, { "epoch": 0.47413793103448276, "grad_norm": 2.406900488225167, "learning_rate": 6.310744182895231e-06, "loss": 0.474, "step": 880 }, { "epoch": 0.47683189655172414, "grad_norm": 2.3233269304186246, "learning_rate": 6.265301793072762e-06, "loss": 0.4947, "step": 885 }, { "epoch": 0.4795258620689655, "grad_norm": 2.336797328678939, "learning_rate": 6.219747460046203e-06, "loss": 0.4771, "step": 890 }, { "epoch": 0.4822198275862069, "grad_norm": 2.3058756900360566, "learning_rate": 6.17408521407776e-06, "loss": 0.4791, "step": 895 }, { "epoch": 0.4849137931034483, "grad_norm": 2.467884893673803, "learning_rate": 6.128319094976869e-06, "loss": 0.492, "step": 900 }, { "epoch": 0.48760775862068967, "grad_norm": 2.3280199883273047, "learning_rate": 6.0824531517427765e-06, "loss": 0.4816, "step": 905 }, { "epoch": 0.49030172413793105, "grad_norm": 2.2642826853033053, "learning_rate": 6.03649144220633e-06, "loss": 0.4805, "step": 910 }, { "epoch": 0.49299568965517243, "grad_norm": 2.2845546468033007, "learning_rate": 5.990438032670968e-06, "loss": 0.4804, "step": 915 }, { "epoch": 0.4956896551724138, "grad_norm": 2.320099011292584, "learning_rate": 5.944296997552968e-06, "loss": 0.4807, "step": 920 }, { "epoch": 0.4983836206896552, "grad_norm": 2.4032671750639607, "learning_rate": 5.898072419020978e-06, "loss": 0.479, "step": 925 }, { "epoch": 0.5010775862068966, "grad_norm": 2.3454490179654948, "learning_rate": 5.851768386634863e-06, "loss": 0.4657, "step": 930 }, { "epoch": 0.5037715517241379, "grad_norm": 2.2272370976346707, "learning_rate": 5.805388996983891e-06, "loss": 0.4778, "step": 935 }, { "epoch": 0.5064655172413793, "grad_norm": 2.399429478516486, "learning_rate": 5.758938353324308e-06, "loss": 0.4766, "step": 940 }, { "epoch": 0.5091594827586207, "grad_norm": 2.2479225788941726, "learning_rate": 5.712420565216305e-06, "loss": 0.4689, "step": 945 }, { "epoch": 0.5118534482758621, "grad_norm": 2.333910684063406, "learning_rate": 5.66583974816045e-06, "loss": 0.4689, "step": 950 }, { "epoch": 0.5145474137931034, "grad_norm": 2.494414220923278, "learning_rate": 5.619200023233582e-06, "loss": 0.4654, "step": 955 }, { "epoch": 0.5172413793103449, "grad_norm": 2.4303474928270314, "learning_rate": 5.572505516724207e-06, "loss": 0.4841, "step": 960 }, { "epoch": 0.5199353448275862, "grad_norm": 2.3290300558522605, "learning_rate": 5.52576035976744e-06, "loss": 0.4631, "step": 965 }, { "epoch": 0.5226293103448276, "grad_norm": 2.303763077645539, "learning_rate": 5.478968687979527e-06, "loss": 0.4535, "step": 970 }, { "epoch": 0.525323275862069, "grad_norm": 2.3158015015015367, "learning_rate": 5.432134641091945e-06, "loss": 0.4653, "step": 975 }, { "epoch": 0.5280172413793104, "grad_norm": 2.412268625727716, "learning_rate": 5.3852623625851655e-06, "loss": 0.4553, "step": 980 }, { "epoch": 0.5307112068965517, "grad_norm": 2.4152646593142477, "learning_rate": 5.338355999322069e-06, "loss": 0.459, "step": 985 }, { "epoch": 0.5334051724137931, "grad_norm": 2.3009383932051186, "learning_rate": 5.291419701181069e-06, "loss": 0.4574, "step": 990 }, { "epoch": 0.5360991379310345, "grad_norm": 2.3404820672273683, "learning_rate": 5.244457620688962e-06, "loss": 0.4457, "step": 995 }, { "epoch": 0.5387931034482759, "grad_norm": 2.2918401803413277, "learning_rate": 5.197473912653549e-06, "loss": 0.4625, "step": 1000 }, { "epoch": 0.5414870689655172, "grad_norm": 2.330307145203118, "learning_rate": 5.150472733796053e-06, "loss": 0.4614, "step": 1005 }, { "epoch": 0.5441810344827587, "grad_norm": 2.317228108453964, "learning_rate": 5.103458242383371e-06, "loss": 0.4346, "step": 1010 }, { "epoch": 0.546875, "grad_norm": 2.246449210384358, "learning_rate": 5.056434597860176e-06, "loss": 0.4332, "step": 1015 }, { "epoch": 0.5495689655172413, "grad_norm": 2.2315633880832917, "learning_rate": 5.009405960480937e-06, "loss": 0.4374, "step": 1020 }, { "epoch": 0.5522629310344828, "grad_norm": 2.236917389881302, "learning_rate": 4.962376490941846e-06, "loss": 0.4443, "step": 1025 }, { "epoch": 0.5549568965517241, "grad_norm": 2.2257101057521953, "learning_rate": 4.915350350012714e-06, "loss": 0.4485, "step": 1030 }, { "epoch": 0.5576508620689655, "grad_norm": 2.2768475081245696, "learning_rate": 4.868331698168875e-06, "loss": 0.456, "step": 1035 }, { "epoch": 0.5603448275862069, "grad_norm": 2.2588873812858243, "learning_rate": 4.82132469522308e-06, "loss": 0.4531, "step": 1040 }, { "epoch": 0.5630387931034483, "grad_norm": 2.2517674521156414, "learning_rate": 4.774333499957488e-06, "loss": 0.4439, "step": 1045 }, { "epoch": 0.5657327586206896, "grad_norm": 2.3879681903493277, "learning_rate": 4.727362269755736e-06, "loss": 0.4507, "step": 1050 }, { "epoch": 0.568426724137931, "grad_norm": 2.2168932530530654, "learning_rate": 4.68041516023511e-06, "loss": 0.4436, "step": 1055 }, { "epoch": 0.5711206896551724, "grad_norm": 2.328909950607463, "learning_rate": 4.633496324878906e-06, "loss": 0.4408, "step": 1060 }, { "epoch": 0.5738146551724138, "grad_norm": 2.2564887174276183, "learning_rate": 4.586609914668963e-06, "loss": 0.4516, "step": 1065 }, { "epoch": 0.5765086206896551, "grad_norm": 2.2979177074885424, "learning_rate": 4.539760077718416e-06, "loss": 0.4389, "step": 1070 }, { "epoch": 0.5792025862068966, "grad_norm": 2.2933960847054515, "learning_rate": 4.492950958904707e-06, "loss": 0.4266, "step": 1075 }, { "epoch": 0.5818965517241379, "grad_norm": 2.2594325799250594, "learning_rate": 4.4461866995028776e-06, "loss": 0.427, "step": 1080 }, { "epoch": 0.5845905172413793, "grad_norm": 2.349659814217747, "learning_rate": 4.399471436819199e-06, "loss": 0.4346, "step": 1085 }, { "epoch": 0.5872844827586207, "grad_norm": 2.297930957947952, "learning_rate": 4.352809303825115e-06, "loss": 0.4279, "step": 1090 }, { "epoch": 0.5899784482758621, "grad_norm": 2.202712644399629, "learning_rate": 4.306204428791609e-06, "loss": 0.4291, "step": 1095 }, { "epoch": 0.5926724137931034, "grad_norm": 2.2128476870439813, "learning_rate": 4.259660934923965e-06, "loss": 0.44, "step": 1100 }, { "epoch": 0.5953663793103449, "grad_norm": 2.367627389505961, "learning_rate": 4.213182939996978e-06, "loss": 0.4379, "step": 1105 }, { "epoch": 0.5980603448275862, "grad_norm": 2.274117011259563, "learning_rate": 4.166774555990654e-06, "loss": 0.4344, "step": 1110 }, { "epoch": 0.6007543103448276, "grad_norm": 2.2261394360036983, "learning_rate": 4.120439888726407e-06, "loss": 0.4142, "step": 1115 }, { "epoch": 0.603448275862069, "grad_norm": 2.1852891937100436, "learning_rate": 4.074183037503827e-06, "loss": 0.4266, "step": 1120 }, { "epoch": 0.6061422413793104, "grad_norm": 2.3083672939605053, "learning_rate": 4.028008094737989e-06, "loss": 0.4394, "step": 1125 }, { "epoch": 0.6088362068965517, "grad_norm": 2.2610041056896963, "learning_rate": 3.981919145597404e-06, "loss": 0.4128, "step": 1130 }, { "epoch": 0.6115301724137931, "grad_norm": 2.19751146715402, "learning_rate": 3.935920267642592e-06, "loss": 0.4227, "step": 1135 }, { "epoch": 0.6142241379310345, "grad_norm": 2.3415136999781963, "learning_rate": 3.890015530465342e-06, "loss": 0.4133, "step": 1140 }, { "epoch": 0.6169181034482759, "grad_norm": 2.291673599344672, "learning_rate": 3.844208995328659e-06, "loss": 0.4192, "step": 1145 }, { "epoch": 0.6196120689655172, "grad_norm": 2.2459859353779508, "learning_rate": 3.7985047148074584e-06, "loss": 0.4257, "step": 1150 }, { "epoch": 0.6223060344827587, "grad_norm": 2.3753214874892072, "learning_rate": 3.75290673243004e-06, "loss": 0.421, "step": 1155 }, { "epoch": 0.625, "grad_norm": 2.181100394703554, "learning_rate": 3.707419082320336e-06, "loss": 0.4287, "step": 1160 }, { "epoch": 0.6276939655172413, "grad_norm": 2.242465849693457, "learning_rate": 3.6620457888410143e-06, "loss": 0.4143, "step": 1165 }, { "epoch": 0.6303879310344828, "grad_norm": 2.3646959150338813, "learning_rate": 3.616790866237433e-06, "loss": 0.4045, "step": 1170 }, { "epoch": 0.6330818965517241, "grad_norm": 2.312802724452316, "learning_rate": 3.5716583182825023e-06, "loss": 0.4248, "step": 1175 }, { "epoch": 0.6357758620689655, "grad_norm": 2.208443511882899, "learning_rate": 3.5266521379224506e-06, "loss": 0.4135, "step": 1180 }, { "epoch": 0.6384698275862069, "grad_norm": 2.2774985396607046, "learning_rate": 3.4817763069235747e-06, "loss": 0.4028, "step": 1185 }, { "epoch": 0.6411637931034483, "grad_norm": 2.3080269121559898, "learning_rate": 3.4370347955199634e-06, "loss": 0.4086, "step": 1190 }, { "epoch": 0.6438577586206896, "grad_norm": 2.3130128907712355, "learning_rate": 3.392431562062238e-06, "loss": 0.408, "step": 1195 }, { "epoch": 0.646551724137931, "grad_norm": 2.2776700595089676, "learning_rate": 3.347970552667361e-06, "loss": 0.4159, "step": 1200 }, { "epoch": 0.6492456896551724, "grad_norm": 2.1524296489308576, "learning_rate": 3.303655700869507e-06, "loss": 0.4035, "step": 1205 }, { "epoch": 0.6519396551724138, "grad_norm": 2.2146294105038185, "learning_rate": 3.259490927272071e-06, "loss": 0.4012, "step": 1210 }, { "epoch": 0.6546336206896551, "grad_norm": 2.2480654104489752, "learning_rate": 3.2154801392007883e-06, "loss": 0.4153, "step": 1215 }, { "epoch": 0.6573275862068966, "grad_norm": 2.169871400965887, "learning_rate": 3.171627230358063e-06, "loss": 0.404, "step": 1220 }, { "epoch": 0.6600215517241379, "grad_norm": 2.4015866937415056, "learning_rate": 3.1279360804784785e-06, "loss": 0.4063, "step": 1225 }, { "epoch": 0.6627155172413793, "grad_norm": 2.3038799378482557, "learning_rate": 3.084410554985553e-06, "loss": 0.3898, "step": 1230 }, { "epoch": 0.6654094827586207, "grad_norm": 2.198625588166285, "learning_rate": 3.0410545046497553e-06, "loss": 0.4035, "step": 1235 }, { "epoch": 0.6681034482758621, "grad_norm": 2.1950219963512176, "learning_rate": 2.9978717652478343e-06, "loss": 0.3902, "step": 1240 }, { "epoch": 0.6707974137931034, "grad_norm": 2.247458718435766, "learning_rate": 2.954866157223445e-06, "loss": 0.4082, "step": 1245 }, { "epoch": 0.6734913793103449, "grad_norm": 2.2241261994844588, "learning_rate": 2.9120414853491574e-06, "loss": 0.404, "step": 1250 }, { "epoch": 0.6761853448275862, "grad_norm": 2.1606540598223103, "learning_rate": 2.86940153838984e-06, "loss": 0.3948, "step": 1255 }, { "epoch": 0.6788793103448276, "grad_norm": 2.0718054651873437, "learning_rate": 2.826950088767469e-06, "loss": 0.3927, "step": 1260 }, { "epoch": 0.681573275862069, "grad_norm": 2.227847088159035, "learning_rate": 2.784690892227363e-06, "loss": 0.3903, "step": 1265 }, { "epoch": 0.6842672413793104, "grad_norm": 2.207892303296737, "learning_rate": 2.7426276875059145e-06, "loss": 0.3955, "step": 1270 }, { "epoch": 0.6869612068965517, "grad_norm": 2.1465153515114093, "learning_rate": 2.700764195999819e-06, "loss": 0.3788, "step": 1275 }, { "epoch": 0.6896551724137931, "grad_norm": 2.223157201107058, "learning_rate": 2.6591041214368383e-06, "loss": 0.4053, "step": 1280 }, { "epoch": 0.6923491379310345, "grad_norm": 2.392548147708553, "learning_rate": 2.6176511495481172e-06, "loss": 0.3834, "step": 1285 }, { "epoch": 0.6950431034482759, "grad_norm": 2.059476074487736, "learning_rate": 2.5764089477421067e-06, "loss": 0.3857, "step": 1290 }, { "epoch": 0.6977370689655172, "grad_norm": 2.157455657651667, "learning_rate": 2.5353811647801107e-06, "loss": 0.3884, "step": 1295 }, { "epoch": 0.7004310344827587, "grad_norm": 2.307643086382308, "learning_rate": 2.4945714304534584e-06, "loss": 0.3815, "step": 1300 }, { "epoch": 0.703125, "grad_norm": 2.26315069416342, "learning_rate": 2.453983355262382e-06, "loss": 0.3865, "step": 1305 }, { "epoch": 0.7058189655172413, "grad_norm": 2.332313222729813, "learning_rate": 2.413620530096592e-06, "loss": 0.391, "step": 1310 }, { "epoch": 0.7085129310344828, "grad_norm": 2.1418117590999413, "learning_rate": 2.373486525917575e-06, "loss": 0.3912, "step": 1315 }, { "epoch": 0.7112068965517241, "grad_norm": 2.178180423311831, "learning_rate": 2.333584893442675e-06, "loss": 0.3854, "step": 1320 }, { "epoch": 0.7139008620689655, "grad_norm": 2.151591142836586, "learning_rate": 2.2939191628309482e-06, "loss": 0.3815, "step": 1325 }, { "epoch": 0.7165948275862069, "grad_norm": 2.1488408048158916, "learning_rate": 2.254492843370857e-06, "loss": 0.3741, "step": 1330 }, { "epoch": 0.7192887931034483, "grad_norm": 2.3225770656541624, "learning_rate": 2.2153094231697807e-06, "loss": 0.3865, "step": 1335 }, { "epoch": 0.7219827586206896, "grad_norm": 2.225461569667121, "learning_rate": 2.1763723688454297e-06, "loss": 0.389, "step": 1340 }, { "epoch": 0.724676724137931, "grad_norm": 2.310688191216032, "learning_rate": 2.1376851252191465e-06, "loss": 0.3905, "step": 1345 }, { "epoch": 0.7273706896551724, "grad_norm": 2.206817710811153, "learning_rate": 2.09925111501113e-06, "loss": 0.3705, "step": 1350 }, { "epoch": 0.7300646551724138, "grad_norm": 2.194541840528301, "learning_rate": 2.061073738537635e-06, "loss": 0.38, "step": 1355 }, { "epoch": 0.7327586206896551, "grad_norm": 2.1363777762782568, "learning_rate": 2.0231563734101245e-06, "loss": 0.3826, "step": 1360 }, { "epoch": 0.7354525862068966, "grad_norm": 2.043722143372559, "learning_rate": 1.9855023742364647e-06, "loss": 0.3722, "step": 1365 }, { "epoch": 0.7381465517241379, "grad_norm": 2.296022903294665, "learning_rate": 1.9481150723241236e-06, "loss": 0.3836, "step": 1370 }, { "epoch": 0.7408405172413793, "grad_norm": 2.1320085273295333, "learning_rate": 1.9109977753854496e-06, "loss": 0.367, "step": 1375 }, { "epoch": 0.7435344827586207, "grad_norm": 2.126131429150438, "learning_rate": 1.8741537672450406e-06, "loss": 0.3756, "step": 1380 }, { "epoch": 0.7462284482758621, "grad_norm": 2.3054341669665708, "learning_rate": 1.8375863075492062e-06, "loss": 0.3737, "step": 1385 }, { "epoch": 0.7489224137931034, "grad_norm": 2.3340813640902867, "learning_rate": 1.8012986314775888e-06, "loss": 0.3694, "step": 1390 }, { "epoch": 0.7516163793103449, "grad_norm": 2.1335614766566544, "learning_rate": 1.7652939494569428e-06, "loss": 0.3706, "step": 1395 }, { "epoch": 0.7543103448275862, "grad_norm": 2.135867482259856, "learning_rate": 1.7295754468771026e-06, "loss": 0.3826, "step": 1400 }, { "epoch": 0.7570043103448276, "grad_norm": 2.253239028561062, "learning_rate": 1.6941462838091643e-06, "loss": 0.3879, "step": 1405 }, { "epoch": 0.759698275862069, "grad_norm": 2.1899554008641613, "learning_rate": 1.6590095947259083e-06, "loss": 0.3657, "step": 1410 }, { "epoch": 0.7623922413793104, "grad_norm": 1.9335639886365577, "learning_rate": 1.6241684882244952e-06, "loss": 0.3647, "step": 1415 }, { "epoch": 0.7650862068965517, "grad_norm": 2.158271364922754, "learning_rate": 1.5896260467514335e-06, "loss": 0.3613, "step": 1420 }, { "epoch": 0.7677801724137931, "grad_norm": 2.283426548356461, "learning_rate": 1.5553853263298741e-06, "loss": 0.3804, "step": 1425 }, { "epoch": 0.7704741379310345, "grad_norm": 1.973245710047114, "learning_rate": 1.521449356289245e-06, "loss": 0.3616, "step": 1430 }, { "epoch": 0.7731681034482759, "grad_norm": 2.176003470736959, "learning_rate": 1.4878211389972369e-06, "loss": 0.3594, "step": 1435 }, { "epoch": 0.7758620689655172, "grad_norm": 2.350333157030792, "learning_rate": 1.454503649594176e-06, "loss": 0.3745, "step": 1440 }, { "epoch": 0.7785560344827587, "grad_norm": 2.1046600168472254, "learning_rate": 1.421499835729812e-06, "loss": 0.3614, "step": 1445 }, { "epoch": 0.78125, "grad_norm": 2.2403959550973376, "learning_rate": 1.3888126173025412e-06, "loss": 0.3667, "step": 1450 }, { "epoch": 0.7839439655172413, "grad_norm": 2.2036204076799244, "learning_rate": 1.3564448862010653e-06, "loss": 0.3719, "step": 1455 }, { "epoch": 0.7866379310344828, "grad_norm": 2.1004023468667223, "learning_rate": 1.3243995060485537e-06, "loss": 0.3609, "step": 1460 }, { "epoch": 0.7893318965517241, "grad_norm": 2.049485866619644, "learning_rate": 1.2926793119492848e-06, "loss": 0.3562, "step": 1465 }, { "epoch": 0.7920258620689655, "grad_norm": 2.2562907662057015, "learning_rate": 1.2612871102378305e-06, "loss": 0.3638, "step": 1470 }, { "epoch": 0.7947198275862069, "grad_norm": 2.0015131375954045, "learning_rate": 1.230225678230766e-06, "loss": 0.3523, "step": 1475 }, { "epoch": 0.7974137931034483, "grad_norm": 1.9761111123797053, "learning_rate": 1.1994977639809575e-06, "loss": 0.3605, "step": 1480 }, { "epoch": 0.8001077586206896, "grad_norm": 2.1818297029398916, "learning_rate": 1.169106086034446e-06, "loss": 0.369, "step": 1485 }, { "epoch": 0.802801724137931, "grad_norm": 2.2176123875649782, "learning_rate": 1.1390533331899235e-06, "loss": 0.359, "step": 1490 }, { "epoch": 0.8054956896551724, "grad_norm": 2.1415950875401952, "learning_rate": 1.109342164260853e-06, "loss": 0.365, "step": 1495 }, { "epoch": 0.8081896551724138, "grad_norm": 1.9579230862394106, "learning_rate": 1.079975207840247e-06, "loss": 0.3475, "step": 1500 }, { "epoch": 0.8108836206896551, "grad_norm": 1.9891326864430916, "learning_rate": 1.050955062068098e-06, "loss": 0.3636, "step": 1505 }, { "epoch": 0.8135775862068966, "grad_norm": 2.1589113372475826, "learning_rate": 1.0222842944015327e-06, "loss": 0.3637, "step": 1510 }, { "epoch": 0.8162715517241379, "grad_norm": 2.2093770653678817, "learning_rate": 9.939654413876493e-07, "loss": 0.3704, "step": 1515 }, { "epoch": 0.8189655172413793, "grad_norm": 2.117779906161616, "learning_rate": 9.660010084391197e-07, "loss": 0.3549, "step": 1520 }, { "epoch": 0.8216594827586207, "grad_norm": 2.2081164429406623, "learning_rate": 9.383934696125213e-07, "loss": 0.3637, "step": 1525 }, { "epoch": 0.8243534482758621, "grad_norm": 2.0797066327192915, "learning_rate": 9.111452673894589e-07, "loss": 0.355, "step": 1530 }, { "epoch": 0.8270474137931034, "grad_norm": 1.9884207565802496, "learning_rate": 8.842588124604695e-07, "loss": 0.3598, "step": 1535 }, { "epoch": 0.8297413793103449, "grad_norm": 1.9966503677289194, "learning_rate": 8.577364835117552e-07, "loss": 0.3503, "step": 1540 }, { "epoch": 0.8324353448275862, "grad_norm": 2.0974426601893006, "learning_rate": 8.315806270147237e-07, "loss": 0.3513, "step": 1545 }, { "epoch": 0.8351293103448276, "grad_norm": 2.0409953572157264, "learning_rate": 8.057935570184e-07, "loss": 0.353, "step": 1550 }, { "epoch": 0.837823275862069, "grad_norm": 2.05994767546201, "learning_rate": 7.803775549447017e-07, "loss": 0.3612, "step": 1555 }, { "epoch": 0.8405172413793104, "grad_norm": 1.9798689534701572, "learning_rate": 7.553348693865897e-07, "loss": 0.3433, "step": 1560 }, { "epoch": 0.8432112068965517, "grad_norm": 2.0314728151818557, "learning_rate": 7.306677159091385e-07, "loss": 0.3554, "step": 1565 }, { "epoch": 0.8459051724137931, "grad_norm": 2.1770521072409665, "learning_rate": 7.06378276853516e-07, "loss": 0.3434, "step": 1570 }, { "epoch": 0.8485991379310345, "grad_norm": 3.199094357987707, "learning_rate": 6.824687011439168e-07, "loss": 0.3555, "step": 1575 }, { "epoch": 0.8512931034482759, "grad_norm": 2.0350410942770267, "learning_rate": 6.589411040974369e-07, "loss": 0.3455, "step": 1580 }, { "epoch": 0.8539870689655172, "grad_norm": 2.0106939788979994, "learning_rate": 6.35797567236926e-07, "loss": 0.342, "step": 1585 }, { "epoch": 0.8566810344827587, "grad_norm": 2.0462922997663333, "learning_rate": 6.130401381068424e-07, "loss": 0.3484, "step": 1590 }, { "epoch": 0.859375, "grad_norm": 1.9989302742973973, "learning_rate": 5.906708300920916e-07, "loss": 0.358, "step": 1595 }, { "epoch": 0.8620689655172413, "grad_norm": 2.1421705464248997, "learning_rate": 5.686916222399069e-07, "loss": 0.3479, "step": 1600 }, { "epoch": 0.8647629310344828, "grad_norm": 1.8665911668349293, "learning_rate": 5.471044590847569e-07, "loss": 0.3485, "step": 1605 }, { "epoch": 0.8674568965517241, "grad_norm": 2.252328311927183, "learning_rate": 5.259112504763115e-07, "loss": 0.3537, "step": 1610 }, { "epoch": 0.8701508620689655, "grad_norm": 2.242291713625665, "learning_rate": 5.051138714104726e-07, "loss": 0.3493, "step": 1615 }, { "epoch": 0.8728448275862069, "grad_norm": 1.9256177965601142, "learning_rate": 4.847141618634899e-07, "loss": 0.346, "step": 1620 }, { "epoch": 0.8755387931034483, "grad_norm": 2.0978920858884806, "learning_rate": 4.647139266291789e-07, "loss": 0.3447, "step": 1625 }, { "epoch": 0.8782327586206896, "grad_norm": 2.1438665447656424, "learning_rate": 4.4511493515924373e-07, "loss": 0.3467, "step": 1630 }, { "epoch": 0.880926724137931, "grad_norm": 1.943275187391926, "learning_rate": 4.2591892140673383e-07, "loss": 0.359, "step": 1635 }, { "epoch": 0.8836206896551724, "grad_norm": 1.9691693184683765, "learning_rate": 4.0712758367263573e-07, "loss": 0.3453, "step": 1640 }, { "epoch": 0.8863146551724138, "grad_norm": 2.2550989234096432, "learning_rate": 3.8874258445562694e-07, "loss": 0.354, "step": 1645 }, { "epoch": 0.8890086206896551, "grad_norm": 1.9743645882114702, "learning_rate": 3.7076555030498505e-07, "loss": 0.3545, "step": 1650 }, { "epoch": 0.8917025862068966, "grad_norm": 2.069394313953148, "learning_rate": 3.531980716766914e-07, "loss": 0.3465, "step": 1655 }, { "epoch": 0.8943965517241379, "grad_norm": 2.084992853821571, "learning_rate": 3.3604170279271375e-07, "loss": 0.347, "step": 1660 }, { "epoch": 0.8970905172413793, "grad_norm": 2.028486932834069, "learning_rate": 3.1929796150351076e-07, "loss": 0.3385, "step": 1665 }, { "epoch": 0.8997844827586207, "grad_norm": 1.9042104552013777, "learning_rate": 3.02968329153735e-07, "loss": 0.3456, "step": 1670 }, { "epoch": 0.9024784482758621, "grad_norm": 2.138202184025318, "learning_rate": 2.870542504511864e-07, "loss": 0.3524, "step": 1675 }, { "epoch": 0.9051724137931034, "grad_norm": 2.0791032572613615, "learning_rate": 2.7155713333898826e-07, "loss": 0.3557, "step": 1680 }, { "epoch": 0.9078663793103449, "grad_norm": 2.032552582124559, "learning_rate": 2.564783488710293e-07, "loss": 0.3472, "step": 1685 }, { "epoch": 0.9105603448275862, "grad_norm": 2.0702198858374063, "learning_rate": 2.4181923109066254e-07, "loss": 0.3423, "step": 1690 }, { "epoch": 0.9132543103448276, "grad_norm": 2.223955152789369, "learning_rate": 2.2758107691268294e-07, "loss": 0.353, "step": 1695 }, { "epoch": 0.915948275862069, "grad_norm": 2.151000423198189, "learning_rate": 2.1376514600858212e-07, "loss": 0.3446, "step": 1700 }, { "epoch": 0.9186422413793104, "grad_norm": 1.9722858881802758, "learning_rate": 2.003726606951084e-07, "loss": 0.3423, "step": 1705 }, { "epoch": 0.9213362068965517, "grad_norm": 2.152676598806774, "learning_rate": 1.874048058261252e-07, "loss": 0.3566, "step": 1710 }, { "epoch": 0.9240301724137931, "grad_norm": 2.14241065854355, "learning_rate": 1.7486272868778299e-07, "loss": 0.3451, "step": 1715 }, { "epoch": 0.9267241379310345, "grad_norm": 1.9240645550272026, "learning_rate": 1.62747538897019e-07, "loss": 0.3526, "step": 1720 }, { "epoch": 0.9294181034482759, "grad_norm": 1.9864527165081682, "learning_rate": 1.5106030830338791e-07, "loss": 0.3414, "step": 1725 }, { "epoch": 0.9321120689655172, "grad_norm": 1.891840587890648, "learning_rate": 1.3980207089423326e-07, "loss": 0.3507, "step": 1730 }, { "epoch": 0.9348060344827587, "grad_norm": 2.197241548310695, "learning_rate": 1.2897382270320947e-07, "loss": 0.3415, "step": 1735 }, { "epoch": 0.9375, "grad_norm": 2.1206142876832708, "learning_rate": 1.1857652172215905e-07, "loss": 0.3453, "step": 1740 }, { "epoch": 0.9401939655172413, "grad_norm": 2.0575425778092375, "learning_rate": 1.0861108781636099e-07, "loss": 0.3414, "step": 1745 }, { "epoch": 0.9428879310344828, "grad_norm": 2.067217232750268, "learning_rate": 9.907840264314572e-08, "loss": 0.3429, "step": 1750 }, { "epoch": 0.9455818965517241, "grad_norm": 2.08954775323305, "learning_rate": 8.997930957389433e-08, "loss": 0.3406, "step": 1755 }, { "epoch": 0.9482758620689655, "grad_norm": 2.0413104358527865, "learning_rate": 8.13146136194265e-08, "loss": 0.3544, "step": 1760 }, { "epoch": 0.9509698275862069, "grad_norm": 1.9504574949587095, "learning_rate": 7.308508135877745e-08, "loss": 0.3515, "step": 1765 }, { "epoch": 0.9536637931034483, "grad_norm": 2.0325177039467266, "learning_rate": 6.52914408713784e-08, "loss": 0.3422, "step": 1770 }, { "epoch": 0.9563577586206896, "grad_norm": 2.080402951454278, "learning_rate": 5.7934381672640206e-08, "loss": 0.3302, "step": 1775 }, { "epoch": 0.959051724137931, "grad_norm": 1.9103094146698458, "learning_rate": 5.101455465295557e-08, "loss": 0.3388, "step": 1780 }, { "epoch": 0.9617456896551724, "grad_norm": 2.0461617336274665, "learning_rate": 4.453257202011008e-08, "loss": 0.3437, "step": 1785 }, { "epoch": 0.9644396551724138, "grad_norm": 1.8955751541723638, "learning_rate": 3.848900724511828e-08, "loss": 0.3448, "step": 1790 }, { "epoch": 0.9671336206896551, "grad_norm": 1.8502858059698502, "learning_rate": 3.28843950114921e-08, "loss": 0.3318, "step": 1795 }, { "epoch": 0.9698275862068966, "grad_norm": 1.9634830726403167, "learning_rate": 2.771923116793307e-08, "loss": 0.3506, "step": 1800 }, { "epoch": 0.9725215517241379, "grad_norm": 2.12551984941854, "learning_rate": 2.299397268446413e-08, "loss": 0.3425, "step": 1805 }, { "epoch": 0.9752155172413793, "grad_norm": 2.4278727464472136, "learning_rate": 1.8709037612003044e-08, "loss": 0.3471, "step": 1810 }, { "epoch": 0.9779094827586207, "grad_norm": 2.191866602098634, "learning_rate": 1.4864805045373687e-08, "loss": 0.3384, "step": 1815 }, { "epoch": 0.9806034482758621, "grad_norm": 2.128906961450063, "learning_rate": 1.1461615089770062e-08, "loss": 0.349, "step": 1820 }, { "epoch": 0.9832974137931034, "grad_norm": 2.0846719469136916, "learning_rate": 8.499768830663723e-09, "loss": 0.3357, "step": 1825 }, { "epoch": 0.9859913793103449, "grad_norm": 2.319036063763146, "learning_rate": 5.979528307168414e-09, "loss": 0.3402, "step": 1830 }, { "epoch": 0.9886853448275862, "grad_norm": 2.0237346794749858, "learning_rate": 3.901116488855827e-09, "loss": 0.3554, "step": 1835 }, { "epoch": 0.9913793103448276, "grad_norm": 2.007135214089839, "learning_rate": 2.264717256030835e-09, "loss": 0.3462, "step": 1840 }, { "epoch": 0.994073275862069, "grad_norm": 1.994084875393067, "learning_rate": 1.0704753834600567e-09, "loss": 0.3455, "step": 1845 }, { "epoch": 0.9967672413793104, "grad_norm": 2.1211709513233856, "learning_rate": 3.184965275676577e-10, "loss": 0.3438, "step": 1850 }, { "epoch": 0.9994612068965517, "grad_norm": 2.0397937800653443, "learning_rate": 8.847217084495541e-12, "loss": 0.3482, "step": 1855 }, { "epoch": 1.0, "eval_runtime": 3.3988, "eval_samples_per_second": 2.942, "eval_steps_per_second": 0.883, "step": 1856 }, { "epoch": 1.0, "step": 1856, "total_flos": 194304320471040.0, "train_loss": 0.50882549257949, "train_runtime": 16510.7518, "train_samples_per_second": 1.799, "train_steps_per_second": 0.112 } ], "logging_steps": 5, "max_steps": 1856, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 194304320471040.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }