{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22408335900955156, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011204167950477578, "grad_norm": 0.829902708530426, "learning_rate": 2e-05, "loss": 1.9407, "step": 1 }, { "epoch": 0.00022408335900955155, "grad_norm": 0.9879770874977112, "learning_rate": 4e-05, "loss": 2.003, "step": 2 }, { "epoch": 0.0003361250385143273, "grad_norm": 0.9323781728744507, "learning_rate": 6e-05, "loss": 1.4437, "step": 3 }, { "epoch": 0.0004481667180191031, "grad_norm": 1.076143741607666, "learning_rate": 8e-05, "loss": 2.0588, "step": 4 }, { "epoch": 0.0005602083975238789, "grad_norm": 2.0179669857025146, "learning_rate": 0.0001, "loss": 1.8546, "step": 5 }, { "epoch": 0.0006722500770286546, "grad_norm": 1.2414021492004395, "learning_rate": 0.00012, "loss": 1.144, "step": 6 }, { "epoch": 0.0007842917565334304, "grad_norm": 0.7555748224258423, "learning_rate": 0.00014, "loss": 1.8284, "step": 7 }, { "epoch": 0.0008963334360382062, "grad_norm": 1.0806628465652466, "learning_rate": 0.00016, "loss": 1.6396, "step": 8 }, { "epoch": 0.001008375115542982, "grad_norm": 0.806289792060852, "learning_rate": 0.00018, "loss": 1.9042, "step": 9 }, { "epoch": 0.0011204167950477578, "grad_norm": 0.9136477708816528, "learning_rate": 0.0002, "loss": 1.3231, "step": 10 }, { "epoch": 0.0012324584745525336, "grad_norm": 2.1844406127929688, "learning_rate": 0.00019998878923766818, "loss": 1.5027, "step": 11 }, { "epoch": 0.0013445001540573092, "grad_norm": 1.3407416343688965, "learning_rate": 0.00019997757847533634, "loss": 1.5676, "step": 12 }, { "epoch": 0.001456541833562085, "grad_norm": 0.8225234746932983, "learning_rate": 0.0001999663677130045, "loss": 1.7255, "step": 13 }, { "epoch": 0.0015685835130668608, "grad_norm": 1.4570527076721191, "learning_rate": 0.00019995515695067265, "loss": 1.543, "step": 14 }, { "epoch": 0.0016806251925716366, "grad_norm": 1.167662262916565, "learning_rate": 0.00019994394618834082, "loss": 1.4586, "step": 15 }, { "epoch": 0.0017926668720764124, "grad_norm": 0.8484973907470703, "learning_rate": 0.000199932735426009, "loss": 1.3582, "step": 16 }, { "epoch": 0.0019047085515811882, "grad_norm": 1.175027847290039, "learning_rate": 0.00019992152466367713, "loss": 1.0393, "step": 17 }, { "epoch": 0.002016750231085964, "grad_norm": 1.4026998281478882, "learning_rate": 0.0001999103139013453, "loss": 1.2062, "step": 18 }, { "epoch": 0.0021287919105907396, "grad_norm": 0.7838619947433472, "learning_rate": 0.00019989910313901346, "loss": 1.8304, "step": 19 }, { "epoch": 0.0022408335900955157, "grad_norm": 1.5558677911758423, "learning_rate": 0.00019988789237668163, "loss": 1.5472, "step": 20 }, { "epoch": 0.0023528752696002912, "grad_norm": 0.8925554752349854, "learning_rate": 0.00019987668161434977, "loss": 1.7708, "step": 21 }, { "epoch": 0.0024649169491050673, "grad_norm": 0.9016994833946228, "learning_rate": 0.00019986547085201794, "loss": 0.9849, "step": 22 }, { "epoch": 0.002576958628609843, "grad_norm": 0.9562382698059082, "learning_rate": 0.0001998542600896861, "loss": 1.6214, "step": 23 }, { "epoch": 0.0026890003081146184, "grad_norm": 0.9302222728729248, "learning_rate": 0.00019984304932735427, "loss": 1.4912, "step": 24 }, { "epoch": 0.0028010419876193945, "grad_norm": 1.1686227321624756, "learning_rate": 0.00019983183856502244, "loss": 1.2566, "step": 25 }, { "epoch": 0.00291308366712417, "grad_norm": 1.5995142459869385, "learning_rate": 0.0001998206278026906, "loss": 1.3027, "step": 26 }, { "epoch": 0.003025125346628946, "grad_norm": 0.9424421787261963, "learning_rate": 0.00019980941704035875, "loss": 1.3376, "step": 27 }, { "epoch": 0.0031371670261337217, "grad_norm": 1.0297436714172363, "learning_rate": 0.00019979820627802691, "loss": 1.8166, "step": 28 }, { "epoch": 0.0032492087056384977, "grad_norm": 0.5636870861053467, "learning_rate": 0.00019978699551569508, "loss": 1.8255, "step": 29 }, { "epoch": 0.0033612503851432733, "grad_norm": 0.848228931427002, "learning_rate": 0.00019977578475336325, "loss": 1.6907, "step": 30 }, { "epoch": 0.0034732920646480493, "grad_norm": 1.219604730606079, "learning_rate": 0.0001997645739910314, "loss": 1.274, "step": 31 }, { "epoch": 0.003585333744152825, "grad_norm": 1.4606797695159912, "learning_rate": 0.00019975336322869956, "loss": 1.5849, "step": 32 }, { "epoch": 0.0036973754236576005, "grad_norm": 1.3997535705566406, "learning_rate": 0.00019974215246636772, "loss": 1.368, "step": 33 }, { "epoch": 0.0038094171031623765, "grad_norm": 0.983539879322052, "learning_rate": 0.0001997309417040359, "loss": 1.2379, "step": 34 }, { "epoch": 0.0039214587826671525, "grad_norm": 1.8045989274978638, "learning_rate": 0.00019971973094170403, "loss": 1.3885, "step": 35 }, { "epoch": 0.004033500462171928, "grad_norm": 1.1149908304214478, "learning_rate": 0.0001997085201793722, "loss": 1.1442, "step": 36 }, { "epoch": 0.004145542141676704, "grad_norm": 1.0518440008163452, "learning_rate": 0.00019969730941704037, "loss": 1.5883, "step": 37 }, { "epoch": 0.004257583821181479, "grad_norm": 1.5053592920303345, "learning_rate": 0.00019968609865470853, "loss": 1.1635, "step": 38 }, { "epoch": 0.004369625500686255, "grad_norm": 1.869209885597229, "learning_rate": 0.0001996748878923767, "loss": 0.7799, "step": 39 }, { "epoch": 0.004481667180191031, "grad_norm": 1.4690619707107544, "learning_rate": 0.00019966367713004487, "loss": 1.3447, "step": 40 }, { "epoch": 0.004593708859695807, "grad_norm": 0.8668084740638733, "learning_rate": 0.000199652466367713, "loss": 1.8564, "step": 41 }, { "epoch": 0.0047057505392005825, "grad_norm": 1.7591629028320312, "learning_rate": 0.00019964125560538118, "loss": 1.2139, "step": 42 }, { "epoch": 0.004817792218705358, "grad_norm": 1.468502402305603, "learning_rate": 0.00019963004484304934, "loss": 1.6201, "step": 43 }, { "epoch": 0.0049298338982101345, "grad_norm": 0.8053362369537354, "learning_rate": 0.00019961883408071748, "loss": 1.6325, "step": 44 }, { "epoch": 0.00504187557771491, "grad_norm": 1.114880919456482, "learning_rate": 0.00019960762331838565, "loss": 1.6708, "step": 45 }, { "epoch": 0.005153917257219686, "grad_norm": 1.2228628396987915, "learning_rate": 0.00019959641255605382, "loss": 1.0447, "step": 46 }, { "epoch": 0.005265958936724461, "grad_norm": 0.9293903112411499, "learning_rate": 0.00019958520179372199, "loss": 1.075, "step": 47 }, { "epoch": 0.005378000616229237, "grad_norm": 1.108929991722107, "learning_rate": 0.00019957399103139015, "loss": 1.2237, "step": 48 }, { "epoch": 0.005490042295734013, "grad_norm": 0.9987677931785583, "learning_rate": 0.0001995627802690583, "loss": 0.6119, "step": 49 }, { "epoch": 0.005602083975238789, "grad_norm": 1.0769113302230835, "learning_rate": 0.00019955156950672646, "loss": 1.2302, "step": 50 }, { "epoch": 0.0057141256547435645, "grad_norm": 1.1827917098999023, "learning_rate": 0.00019954035874439463, "loss": 1.409, "step": 51 }, { "epoch": 0.00582616733424834, "grad_norm": 1.0538841485977173, "learning_rate": 0.0001995291479820628, "loss": 1.6391, "step": 52 }, { "epoch": 0.0059382090137531165, "grad_norm": 1.313279151916504, "learning_rate": 0.00019951793721973096, "loss": 1.0135, "step": 53 }, { "epoch": 0.006050250693257892, "grad_norm": 1.702959418296814, "learning_rate": 0.00019950672645739913, "loss": 1.2147, "step": 54 }, { "epoch": 0.006162292372762668, "grad_norm": 2.4783010482788086, "learning_rate": 0.00019949551569506727, "loss": 0.9157, "step": 55 }, { "epoch": 0.006274334052267443, "grad_norm": 1.9060266017913818, "learning_rate": 0.00019948430493273544, "loss": 1.5957, "step": 56 }, { "epoch": 0.006386375731772219, "grad_norm": 1.8357959985733032, "learning_rate": 0.0001994730941704036, "loss": 1.0354, "step": 57 }, { "epoch": 0.006498417411276995, "grad_norm": 2.8807880878448486, "learning_rate": 0.00019946188340807174, "loss": 1.2594, "step": 58 }, { "epoch": 0.006610459090781771, "grad_norm": 2.3406355381011963, "learning_rate": 0.0001994506726457399, "loss": 1.5229, "step": 59 }, { "epoch": 0.0067225007702865465, "grad_norm": 1.3177297115325928, "learning_rate": 0.00019943946188340808, "loss": 1.2784, "step": 60 }, { "epoch": 0.006834542449791322, "grad_norm": 0.9821874499320984, "learning_rate": 0.00019942825112107625, "loss": 1.5853, "step": 61 }, { "epoch": 0.006946584129296099, "grad_norm": 2.13622784614563, "learning_rate": 0.0001994170403587444, "loss": 1.1567, "step": 62 }, { "epoch": 0.007058625808800874, "grad_norm": 1.944874882698059, "learning_rate": 0.00019940582959641255, "loss": 0.6764, "step": 63 }, { "epoch": 0.00717066748830565, "grad_norm": 1.0578577518463135, "learning_rate": 0.00019939461883408072, "loss": 1.4714, "step": 64 }, { "epoch": 0.007282709167810425, "grad_norm": 2.2720305919647217, "learning_rate": 0.0001993834080717489, "loss": 1.3643, "step": 65 }, { "epoch": 0.007394750847315201, "grad_norm": 2.0353288650512695, "learning_rate": 0.00019937219730941706, "loss": 0.9008, "step": 66 }, { "epoch": 0.007506792526819977, "grad_norm": 1.2870949506759644, "learning_rate": 0.00019936098654708522, "loss": 1.6618, "step": 67 }, { "epoch": 0.007618834206324753, "grad_norm": 0.9148669242858887, "learning_rate": 0.0001993497757847534, "loss": 1.0548, "step": 68 }, { "epoch": 0.0077308758858295286, "grad_norm": 0.8967483043670654, "learning_rate": 0.00019933856502242153, "loss": 1.3272, "step": 69 }, { "epoch": 0.007842917565334305, "grad_norm": 0.953134298324585, "learning_rate": 0.0001993273542600897, "loss": 1.439, "step": 70 }, { "epoch": 0.00795495924483908, "grad_norm": 1.0446513891220093, "learning_rate": 0.00019931614349775787, "loss": 1.2221, "step": 71 }, { "epoch": 0.008067000924343856, "grad_norm": 1.0762940645217896, "learning_rate": 0.000199304932735426, "loss": 1.4904, "step": 72 }, { "epoch": 0.008179042603848632, "grad_norm": 1.2400437593460083, "learning_rate": 0.00019929372197309417, "loss": 1.612, "step": 73 }, { "epoch": 0.008291084283353407, "grad_norm": 1.1380577087402344, "learning_rate": 0.00019928251121076234, "loss": 0.9864, "step": 74 }, { "epoch": 0.008403125962858183, "grad_norm": 1.0389713048934937, "learning_rate": 0.0001992713004484305, "loss": 1.2869, "step": 75 }, { "epoch": 0.008515167642362959, "grad_norm": 0.7196476459503174, "learning_rate": 0.00019926008968609865, "loss": 1.913, "step": 76 }, { "epoch": 0.008627209321867734, "grad_norm": 1.0790194272994995, "learning_rate": 0.00019924887892376682, "loss": 1.2677, "step": 77 }, { "epoch": 0.00873925100137251, "grad_norm": 1.3770583868026733, "learning_rate": 0.00019923766816143498, "loss": 1.2396, "step": 78 }, { "epoch": 0.008851292680877287, "grad_norm": 2.2702391147613525, "learning_rate": 0.00019922645739910315, "loss": 1.0927, "step": 79 }, { "epoch": 0.008963334360382063, "grad_norm": 0.8956285715103149, "learning_rate": 0.00019921524663677132, "loss": 2.166, "step": 80 }, { "epoch": 0.009075376039886838, "grad_norm": 2.4474527835845947, "learning_rate": 0.00019920403587443949, "loss": 1.043, "step": 81 }, { "epoch": 0.009187417719391614, "grad_norm": 0.8798457384109497, "learning_rate": 0.00019919282511210763, "loss": 1.9765, "step": 82 }, { "epoch": 0.00929945939889639, "grad_norm": 1.791527271270752, "learning_rate": 0.0001991816143497758, "loss": 0.8807, "step": 83 }, { "epoch": 0.009411501078401165, "grad_norm": 1.5848817825317383, "learning_rate": 0.00019917040358744396, "loss": 1.2697, "step": 84 }, { "epoch": 0.00952354275790594, "grad_norm": 1.4345341920852661, "learning_rate": 0.0001991591928251121, "loss": 1.0316, "step": 85 }, { "epoch": 0.009635584437410716, "grad_norm": 1.7331970930099487, "learning_rate": 0.00019914798206278027, "loss": 1.0742, "step": 86 }, { "epoch": 0.009747626116915492, "grad_norm": 1.54176664352417, "learning_rate": 0.00019913677130044844, "loss": 1.1366, "step": 87 }, { "epoch": 0.009859667796420269, "grad_norm": 1.1096471548080444, "learning_rate": 0.0001991255605381166, "loss": 1.2173, "step": 88 }, { "epoch": 0.009971709475925045, "grad_norm": 1.1697500944137573, "learning_rate": 0.00019911434977578477, "loss": 1.4514, "step": 89 }, { "epoch": 0.01008375115542982, "grad_norm": 1.2933481931686401, "learning_rate": 0.0001991031390134529, "loss": 0.9804, "step": 90 }, { "epoch": 0.010195792834934596, "grad_norm": 1.6500056982040405, "learning_rate": 0.00019909192825112108, "loss": 1.1384, "step": 91 }, { "epoch": 0.010307834514439371, "grad_norm": 1.5209136009216309, "learning_rate": 0.00019908071748878925, "loss": 0.7814, "step": 92 }, { "epoch": 0.010419876193944147, "grad_norm": 1.274984359741211, "learning_rate": 0.0001990695067264574, "loss": 1.1465, "step": 93 }, { "epoch": 0.010531917873448923, "grad_norm": 1.1176897287368774, "learning_rate": 0.00019905829596412558, "loss": 1.4201, "step": 94 }, { "epoch": 0.010643959552953698, "grad_norm": 1.0417767763137817, "learning_rate": 0.00019904708520179375, "loss": 0.8469, "step": 95 }, { "epoch": 0.010756001232458474, "grad_norm": 0.8218156099319458, "learning_rate": 0.0001990358744394619, "loss": 1.7877, "step": 96 }, { "epoch": 0.010868042911963251, "grad_norm": 1.4339174032211304, "learning_rate": 0.00019902466367713006, "loss": 0.8023, "step": 97 }, { "epoch": 0.010980084591468027, "grad_norm": 1.5321481227874756, "learning_rate": 0.00019901345291479822, "loss": 1.1544, "step": 98 }, { "epoch": 0.011092126270972802, "grad_norm": 1.1068462133407593, "learning_rate": 0.00019900224215246636, "loss": 1.4202, "step": 99 }, { "epoch": 0.011204167950477578, "grad_norm": 1.9069092273712158, "learning_rate": 0.00019899103139013453, "loss": 0.9763, "step": 100 }, { "epoch": 0.011316209629982353, "grad_norm": 1.9597985744476318, "learning_rate": 0.0001989798206278027, "loss": 0.8515, "step": 101 }, { "epoch": 0.011428251309487129, "grad_norm": 1.573714017868042, "learning_rate": 0.00019896860986547086, "loss": 0.778, "step": 102 }, { "epoch": 0.011540292988991905, "grad_norm": 1.463826298713684, "learning_rate": 0.00019895739910313903, "loss": 1.5568, "step": 103 }, { "epoch": 0.01165233466849668, "grad_norm": 1.218384027481079, "learning_rate": 0.00019894618834080717, "loss": 0.9793, "step": 104 }, { "epoch": 0.011764376348001456, "grad_norm": 1.1051996946334839, "learning_rate": 0.00019893497757847534, "loss": 1.7922, "step": 105 }, { "epoch": 0.011876418027506233, "grad_norm": 0.881253719329834, "learning_rate": 0.0001989237668161435, "loss": 1.144, "step": 106 }, { "epoch": 0.011988459707011009, "grad_norm": 1.484553575515747, "learning_rate": 0.00019891255605381167, "loss": 0.7707, "step": 107 }, { "epoch": 0.012100501386515784, "grad_norm": 1.278754711151123, "learning_rate": 0.00019890134529147984, "loss": 1.004, "step": 108 }, { "epoch": 0.01221254306602056, "grad_norm": 0.9995393753051758, "learning_rate": 0.000198890134529148, "loss": 1.0402, "step": 109 }, { "epoch": 0.012324584745525335, "grad_norm": 1.3858439922332764, "learning_rate": 0.00019887892376681615, "loss": 0.8014, "step": 110 }, { "epoch": 0.012436626425030111, "grad_norm": 1.240240454673767, "learning_rate": 0.00019886771300448432, "loss": 1.7407, "step": 111 }, { "epoch": 0.012548668104534887, "grad_norm": 1.2800484895706177, "learning_rate": 0.00019885650224215246, "loss": 1.2117, "step": 112 }, { "epoch": 0.012660709784039662, "grad_norm": 1.1304497718811035, "learning_rate": 0.00019884529147982062, "loss": 0.8914, "step": 113 }, { "epoch": 0.012772751463544438, "grad_norm": 1.0187777280807495, "learning_rate": 0.0001988340807174888, "loss": 1.7153, "step": 114 }, { "epoch": 0.012884793143049215, "grad_norm": 1.1366180181503296, "learning_rate": 0.00019882286995515696, "loss": 1.4152, "step": 115 }, { "epoch": 0.01299683482255399, "grad_norm": 1.3564180135726929, "learning_rate": 0.00019881165919282513, "loss": 1.6297, "step": 116 }, { "epoch": 0.013108876502058766, "grad_norm": 0.8457950949668884, "learning_rate": 0.00019880044843049327, "loss": 1.5215, "step": 117 }, { "epoch": 0.013220918181563542, "grad_norm": 0.9168684482574463, "learning_rate": 0.00019878923766816143, "loss": 1.2233, "step": 118 }, { "epoch": 0.013332959861068317, "grad_norm": 1.060271143913269, "learning_rate": 0.0001987780269058296, "loss": 0.9783, "step": 119 }, { "epoch": 0.013445001540573093, "grad_norm": 1.2341206073760986, "learning_rate": 0.00019876681614349777, "loss": 1.6914, "step": 120 }, { "epoch": 0.013557043220077869, "grad_norm": 1.1339999437332153, "learning_rate": 0.00019875560538116594, "loss": 1.241, "step": 121 }, { "epoch": 0.013669084899582644, "grad_norm": 1.095458984375, "learning_rate": 0.0001987443946188341, "loss": 1.4184, "step": 122 }, { "epoch": 0.01378112657908742, "grad_norm": 1.584643840789795, "learning_rate": 0.00019873318385650227, "loss": 0.8354, "step": 123 }, { "epoch": 0.013893168258592197, "grad_norm": 1.1102590560913086, "learning_rate": 0.0001987219730941704, "loss": 1.2697, "step": 124 }, { "epoch": 0.014005209938096973, "grad_norm": 1.0487998723983765, "learning_rate": 0.00019871076233183858, "loss": 1.2313, "step": 125 }, { "epoch": 0.014117251617601748, "grad_norm": 0.9507973790168762, "learning_rate": 0.00019869955156950672, "loss": 1.3212, "step": 126 }, { "epoch": 0.014229293297106524, "grad_norm": 1.3752379417419434, "learning_rate": 0.00019868834080717489, "loss": 1.1818, "step": 127 }, { "epoch": 0.0143413349766113, "grad_norm": 0.9604128003120422, "learning_rate": 0.00019867713004484305, "loss": 1.1595, "step": 128 }, { "epoch": 0.014453376656116075, "grad_norm": 1.3550018072128296, "learning_rate": 0.00019866591928251122, "loss": 1.2516, "step": 129 }, { "epoch": 0.01456541833562085, "grad_norm": 1.2497659921646118, "learning_rate": 0.0001986547085201794, "loss": 1.4646, "step": 130 }, { "epoch": 0.014677460015125626, "grad_norm": 2.0970983505249023, "learning_rate": 0.00019864349775784753, "loss": 1.3156, "step": 131 }, { "epoch": 0.014789501694630402, "grad_norm": 1.0172722339630127, "learning_rate": 0.0001986322869955157, "loss": 1.1949, "step": 132 }, { "epoch": 0.014901543374135177, "grad_norm": 1.7070896625518799, "learning_rate": 0.00019862107623318386, "loss": 0.7891, "step": 133 }, { "epoch": 0.015013585053639955, "grad_norm": 1.4564061164855957, "learning_rate": 0.00019860986547085203, "loss": 1.4517, "step": 134 }, { "epoch": 0.01512562673314473, "grad_norm": 1.8836793899536133, "learning_rate": 0.0001985986547085202, "loss": 1.0214, "step": 135 }, { "epoch": 0.015237668412649506, "grad_norm": 1.602691888809204, "learning_rate": 0.00019858744394618837, "loss": 1.1399, "step": 136 }, { "epoch": 0.015349710092154282, "grad_norm": 1.2563743591308594, "learning_rate": 0.00019857623318385653, "loss": 0.9531, "step": 137 }, { "epoch": 0.015461751771659057, "grad_norm": 1.1459704637527466, "learning_rate": 0.00019856502242152467, "loss": 1.2181, "step": 138 }, { "epoch": 0.015573793451163833, "grad_norm": 1.004351019859314, "learning_rate": 0.00019855381165919284, "loss": 1.7192, "step": 139 }, { "epoch": 0.01568583513066861, "grad_norm": 1.921199917793274, "learning_rate": 0.00019854260089686098, "loss": 1.3169, "step": 140 }, { "epoch": 0.015797876810173386, "grad_norm": 1.9246957302093506, "learning_rate": 0.00019853139013452915, "loss": 1.6034, "step": 141 }, { "epoch": 0.01590991848967816, "grad_norm": 2.1363983154296875, "learning_rate": 0.00019852017937219732, "loss": 1.4535, "step": 142 }, { "epoch": 0.016021960169182937, "grad_norm": 1.364855170249939, "learning_rate": 0.00019850896860986548, "loss": 1.2927, "step": 143 }, { "epoch": 0.016134001848687712, "grad_norm": 1.1059762239456177, "learning_rate": 0.00019849775784753365, "loss": 1.0424, "step": 144 }, { "epoch": 0.016246043528192488, "grad_norm": 1.621781587600708, "learning_rate": 0.0001984865470852018, "loss": 1.182, "step": 145 }, { "epoch": 0.016358085207697264, "grad_norm": 1.4909039735794067, "learning_rate": 0.00019847533632286996, "loss": 1.1124, "step": 146 }, { "epoch": 0.01647012688720204, "grad_norm": 2.2853002548217773, "learning_rate": 0.00019846412556053812, "loss": 1.2721, "step": 147 }, { "epoch": 0.016582168566706815, "grad_norm": 0.9868189096450806, "learning_rate": 0.0001984529147982063, "loss": 1.4779, "step": 148 }, { "epoch": 0.01669421024621159, "grad_norm": 1.069211483001709, "learning_rate": 0.00019844170403587446, "loss": 0.8171, "step": 149 }, { "epoch": 0.016806251925716366, "grad_norm": 1.0235058069229126, "learning_rate": 0.00019843049327354263, "loss": 1.2956, "step": 150 }, { "epoch": 0.01691829360522114, "grad_norm": 3.6095454692840576, "learning_rate": 0.00019841928251121077, "loss": 1.3404, "step": 151 }, { "epoch": 0.017030335284725917, "grad_norm": 1.4739564657211304, "learning_rate": 0.00019840807174887893, "loss": 1.1881, "step": 152 }, { "epoch": 0.017142376964230693, "grad_norm": 1.050796389579773, "learning_rate": 0.00019839686098654707, "loss": 1.4581, "step": 153 }, { "epoch": 0.017254418643735468, "grad_norm": 1.3453677892684937, "learning_rate": 0.00019838565022421524, "loss": 1.7532, "step": 154 }, { "epoch": 0.017366460323240244, "grad_norm": 0.944364070892334, "learning_rate": 0.0001983744394618834, "loss": 1.2347, "step": 155 }, { "epoch": 0.01747850200274502, "grad_norm": 2.5314056873321533, "learning_rate": 0.00019836322869955158, "loss": 1.7477, "step": 156 }, { "epoch": 0.0175905436822498, "grad_norm": 0.8721354603767395, "learning_rate": 0.00019835201793721974, "loss": 1.6952, "step": 157 }, { "epoch": 0.017702585361754574, "grad_norm": 1.2394968271255493, "learning_rate": 0.0001983408071748879, "loss": 1.3067, "step": 158 }, { "epoch": 0.01781462704125935, "grad_norm": 2.0493600368499756, "learning_rate": 0.00019832959641255605, "loss": 1.0501, "step": 159 }, { "epoch": 0.017926668720764125, "grad_norm": 1.284934401512146, "learning_rate": 0.00019831838565022422, "loss": 1.0806, "step": 160 }, { "epoch": 0.0180387104002689, "grad_norm": 3.6838881969451904, "learning_rate": 0.0001983071748878924, "loss": 1.6638, "step": 161 }, { "epoch": 0.018150752079773676, "grad_norm": 1.3662748336791992, "learning_rate": 0.00019829596412556055, "loss": 1.3383, "step": 162 }, { "epoch": 0.018262793759278452, "grad_norm": 1.752532720565796, "learning_rate": 0.00019828475336322872, "loss": 1.0822, "step": 163 }, { "epoch": 0.018374835438783228, "grad_norm": 1.3427776098251343, "learning_rate": 0.0001982735426008969, "loss": 1.5999, "step": 164 }, { "epoch": 0.018486877118288003, "grad_norm": 1.5192070007324219, "learning_rate": 0.00019826233183856503, "loss": 1.1088, "step": 165 }, { "epoch": 0.01859891879779278, "grad_norm": 1.3683332204818726, "learning_rate": 0.0001982511210762332, "loss": 1.1331, "step": 166 }, { "epoch": 0.018710960477297554, "grad_norm": 1.148078441619873, "learning_rate": 0.00019823991031390134, "loss": 0.9603, "step": 167 }, { "epoch": 0.01882300215680233, "grad_norm": 0.9604099988937378, "learning_rate": 0.0001982286995515695, "loss": 1.5976, "step": 168 }, { "epoch": 0.018935043836307106, "grad_norm": 1.064560890197754, "learning_rate": 0.00019821748878923767, "loss": 1.6299, "step": 169 }, { "epoch": 0.01904708551581188, "grad_norm": 1.705771803855896, "learning_rate": 0.00019820627802690584, "loss": 1.2095, "step": 170 }, { "epoch": 0.019159127195316657, "grad_norm": 1.0940591096878052, "learning_rate": 0.000198195067264574, "loss": 1.3196, "step": 171 }, { "epoch": 0.019271168874821432, "grad_norm": 1.1382821798324585, "learning_rate": 0.00019818385650224217, "loss": 1.3011, "step": 172 }, { "epoch": 0.019383210554326208, "grad_norm": 1.8101707696914673, "learning_rate": 0.00019817264573991031, "loss": 1.8803, "step": 173 }, { "epoch": 0.019495252233830983, "grad_norm": 1.801864504814148, "learning_rate": 0.00019816143497757848, "loss": 2.0976, "step": 174 }, { "epoch": 0.019607293913335763, "grad_norm": 1.5090821981430054, "learning_rate": 0.00019815022421524665, "loss": 2.0772, "step": 175 }, { "epoch": 0.019719335592840538, "grad_norm": 1.421566128730774, "learning_rate": 0.00019813901345291482, "loss": 1.9478, "step": 176 }, { "epoch": 0.019831377272345314, "grad_norm": 1.62209951877594, "learning_rate": 0.00019812780269058298, "loss": 1.8785, "step": 177 }, { "epoch": 0.01994341895185009, "grad_norm": 1.3505034446716309, "learning_rate": 0.00019811659192825115, "loss": 1.6081, "step": 178 }, { "epoch": 0.020055460631354865, "grad_norm": 1.4076950550079346, "learning_rate": 0.0001981053811659193, "loss": 2.2446, "step": 179 }, { "epoch": 0.02016750231085964, "grad_norm": 0.7761825323104858, "learning_rate": 0.00019809417040358743, "loss": 1.8346, "step": 180 }, { "epoch": 0.020279543990364416, "grad_norm": 1.8436955213546753, "learning_rate": 0.0001980829596412556, "loss": 1.4254, "step": 181 }, { "epoch": 0.02039158566986919, "grad_norm": 1.710219383239746, "learning_rate": 0.00019807174887892377, "loss": 1.9046, "step": 182 }, { "epoch": 0.020503627349373967, "grad_norm": 1.0336873531341553, "learning_rate": 0.00019806053811659193, "loss": 1.7183, "step": 183 }, { "epoch": 0.020615669028878743, "grad_norm": 1.5093308687210083, "learning_rate": 0.0001980493273542601, "loss": 1.9141, "step": 184 }, { "epoch": 0.02072771070838352, "grad_norm": 1.330002784729004, "learning_rate": 0.00019803811659192827, "loss": 1.71, "step": 185 }, { "epoch": 0.020839752387888294, "grad_norm": 0.8079724907875061, "learning_rate": 0.0001980269058295964, "loss": 1.6287, "step": 186 }, { "epoch": 0.02095179406739307, "grad_norm": 1.6826435327529907, "learning_rate": 0.00019801569506726458, "loss": 1.6756, "step": 187 }, { "epoch": 0.021063835746897845, "grad_norm": 0.9353287816047668, "learning_rate": 0.00019800448430493274, "loss": 1.7122, "step": 188 }, { "epoch": 0.02117587742640262, "grad_norm": 0.9601882696151733, "learning_rate": 0.0001979932735426009, "loss": 1.4844, "step": 189 }, { "epoch": 0.021287919105907396, "grad_norm": 1.6065773963928223, "learning_rate": 0.00019798206278026908, "loss": 1.112, "step": 190 }, { "epoch": 0.021399960785412172, "grad_norm": 0.79659104347229, "learning_rate": 0.00019797085201793724, "loss": 1.6287, "step": 191 }, { "epoch": 0.021512002464916948, "grad_norm": 1.3850687742233276, "learning_rate": 0.0001979596412556054, "loss": 1.7034, "step": 192 }, { "epoch": 0.021624044144421727, "grad_norm": 0.932006299495697, "learning_rate": 0.00019794843049327355, "loss": 2.0286, "step": 193 }, { "epoch": 0.021736085823926502, "grad_norm": 1.2963751554489136, "learning_rate": 0.0001979372197309417, "loss": 1.7384, "step": 194 }, { "epoch": 0.021848127503431278, "grad_norm": 1.2883082628250122, "learning_rate": 0.00019792600896860986, "loss": 1.5571, "step": 195 }, { "epoch": 0.021960169182936053, "grad_norm": 0.9729629158973694, "learning_rate": 0.00019791479820627803, "loss": 1.5155, "step": 196 }, { "epoch": 0.02207221086244083, "grad_norm": 1.2336026430130005, "learning_rate": 0.0001979035874439462, "loss": 1.7199, "step": 197 }, { "epoch": 0.022184252541945605, "grad_norm": 0.9740604758262634, "learning_rate": 0.00019789237668161436, "loss": 2.0912, "step": 198 }, { "epoch": 0.02229629422145038, "grad_norm": 1.230739712715149, "learning_rate": 0.00019788116591928253, "loss": 1.9899, "step": 199 }, { "epoch": 0.022408335900955156, "grad_norm": 0.8832396268844604, "learning_rate": 0.00019786995515695067, "loss": 1.5053, "step": 200 }, { "epoch": 0.02252037758045993, "grad_norm": 1.1344594955444336, "learning_rate": 0.00019785874439461884, "loss": 1.2298, "step": 201 }, { "epoch": 0.022632419259964707, "grad_norm": 0.7094107270240784, "learning_rate": 0.000197847533632287, "loss": 1.9798, "step": 202 }, { "epoch": 0.022744460939469482, "grad_norm": 1.0926110744476318, "learning_rate": 0.00019783632286995517, "loss": 2.0406, "step": 203 }, { "epoch": 0.022856502618974258, "grad_norm": 1.033949851989746, "learning_rate": 0.00019782511210762334, "loss": 1.4292, "step": 204 }, { "epoch": 0.022968544298479034, "grad_norm": 1.2810473442077637, "learning_rate": 0.0001978139013452915, "loss": 1.8416, "step": 205 }, { "epoch": 0.02308058597798381, "grad_norm": 2.013779401779175, "learning_rate": 0.00019780269058295965, "loss": 2.0196, "step": 206 }, { "epoch": 0.023192627657488585, "grad_norm": 0.8969178795814514, "learning_rate": 0.00019779147982062781, "loss": 1.7194, "step": 207 }, { "epoch": 0.02330466933699336, "grad_norm": 1.4899756908416748, "learning_rate": 0.00019778026905829595, "loss": 1.3023, "step": 208 }, { "epoch": 0.023416711016498136, "grad_norm": 0.9315899610519409, "learning_rate": 0.00019776905829596412, "loss": 1.5099, "step": 209 }, { "epoch": 0.02352875269600291, "grad_norm": 1.5726507902145386, "learning_rate": 0.0001977578475336323, "loss": 1.4659, "step": 210 }, { "epoch": 0.023640794375507687, "grad_norm": 1.7480225563049316, "learning_rate": 0.00019774663677130046, "loss": 1.7883, "step": 211 }, { "epoch": 0.023752836055012466, "grad_norm": 1.5200700759887695, "learning_rate": 0.00019773542600896862, "loss": 0.9978, "step": 212 }, { "epoch": 0.023864877734517242, "grad_norm": 1.8687169551849365, "learning_rate": 0.0001977242152466368, "loss": 2.0377, "step": 213 }, { "epoch": 0.023976919414022017, "grad_norm": 1.9143239259719849, "learning_rate": 0.00019771300448430493, "loss": 1.8692, "step": 214 }, { "epoch": 0.024088961093526793, "grad_norm": 2.197420358657837, "learning_rate": 0.0001977017937219731, "loss": 1.1437, "step": 215 }, { "epoch": 0.02420100277303157, "grad_norm": 1.799605131149292, "learning_rate": 0.00019769058295964127, "loss": 1.5003, "step": 216 }, { "epoch": 0.024313044452536344, "grad_norm": 1.621794581413269, "learning_rate": 0.00019767937219730943, "loss": 2.3832, "step": 217 }, { "epoch": 0.02442508613204112, "grad_norm": 1.4794801473617554, "learning_rate": 0.0001976681614349776, "loss": 1.4597, "step": 218 }, { "epoch": 0.024537127811545895, "grad_norm": 1.2842360734939575, "learning_rate": 0.00019765695067264577, "loss": 2.1606, "step": 219 }, { "epoch": 0.02464916949105067, "grad_norm": 1.224863886833191, "learning_rate": 0.0001976457399103139, "loss": 1.3842, "step": 220 }, { "epoch": 0.024761211170555446, "grad_norm": 1.805149793624878, "learning_rate": 0.00019763452914798205, "loss": 2.7351, "step": 221 }, { "epoch": 0.024873252850060222, "grad_norm": 2.2805233001708984, "learning_rate": 0.00019762331838565022, "loss": 1.9161, "step": 222 }, { "epoch": 0.024985294529564998, "grad_norm": 1.3603965044021606, "learning_rate": 0.00019761210762331838, "loss": 1.6164, "step": 223 }, { "epoch": 0.025097336209069773, "grad_norm": 1.4993096590042114, "learning_rate": 0.00019760089686098655, "loss": 1.7522, "step": 224 }, { "epoch": 0.02520937788857455, "grad_norm": 0.908557116985321, "learning_rate": 0.00019758968609865472, "loss": 1.5222, "step": 225 }, { "epoch": 0.025321419568079324, "grad_norm": 1.2751511335372925, "learning_rate": 0.00019757847533632289, "loss": 1.3236, "step": 226 }, { "epoch": 0.0254334612475841, "grad_norm": 1.6853814125061035, "learning_rate": 0.00019756726457399105, "loss": 2.1715, "step": 227 }, { "epoch": 0.025545502927088876, "grad_norm": 0.6410899758338928, "learning_rate": 0.0001975560538116592, "loss": 1.8816, "step": 228 }, { "epoch": 0.02565754460659365, "grad_norm": 1.2574126720428467, "learning_rate": 0.00019754484304932736, "loss": 1.8984, "step": 229 }, { "epoch": 0.02576958628609843, "grad_norm": 1.2478420734405518, "learning_rate": 0.00019753363228699553, "loss": 1.3103, "step": 230 }, { "epoch": 0.025881627965603206, "grad_norm": 1.0752112865447998, "learning_rate": 0.0001975224215246637, "loss": 1.2958, "step": 231 }, { "epoch": 0.02599366964510798, "grad_norm": 0.9785799384117126, "learning_rate": 0.00019751121076233186, "loss": 1.3963, "step": 232 }, { "epoch": 0.026105711324612757, "grad_norm": 4.135465621948242, "learning_rate": 0.00019750000000000003, "loss": 2.1451, "step": 233 }, { "epoch": 0.026217753004117533, "grad_norm": 0.9218178987503052, "learning_rate": 0.00019748878923766817, "loss": 2.012, "step": 234 }, { "epoch": 0.026329794683622308, "grad_norm": 2.135587692260742, "learning_rate": 0.0001974775784753363, "loss": 1.4194, "step": 235 }, { "epoch": 0.026441836363127084, "grad_norm": 1.272314190864563, "learning_rate": 0.00019746636771300448, "loss": 1.9521, "step": 236 }, { "epoch": 0.02655387804263186, "grad_norm": 1.5357069969177246, "learning_rate": 0.00019745515695067265, "loss": 1.8754, "step": 237 }, { "epoch": 0.026665919722136635, "grad_norm": 1.6885690689086914, "learning_rate": 0.0001974439461883408, "loss": 1.4649, "step": 238 }, { "epoch": 0.02677796140164141, "grad_norm": 1.081275463104248, "learning_rate": 0.00019743273542600898, "loss": 1.7169, "step": 239 }, { "epoch": 0.026890003081146186, "grad_norm": 1.9999622106552124, "learning_rate": 0.00019742152466367715, "loss": 2.1255, "step": 240 }, { "epoch": 0.02700204476065096, "grad_norm": 1.0931596755981445, "learning_rate": 0.0001974103139013453, "loss": 1.4128, "step": 241 }, { "epoch": 0.027114086440155737, "grad_norm": 1.1437228918075562, "learning_rate": 0.00019739910313901346, "loss": 1.8549, "step": 242 }, { "epoch": 0.027226128119660513, "grad_norm": 1.3704149723052979, "learning_rate": 0.00019738789237668162, "loss": 1.6556, "step": 243 }, { "epoch": 0.02733816979916529, "grad_norm": 1.0847150087356567, "learning_rate": 0.0001973766816143498, "loss": 1.2594, "step": 244 }, { "epoch": 0.027450211478670064, "grad_norm": 1.2711559534072876, "learning_rate": 0.00019736547085201796, "loss": 1.8414, "step": 245 }, { "epoch": 0.02756225315817484, "grad_norm": 1.730326533317566, "learning_rate": 0.00019735426008968612, "loss": 1.7949, "step": 246 }, { "epoch": 0.027674294837679615, "grad_norm": 1.082261085510254, "learning_rate": 0.0001973430493273543, "loss": 2.151, "step": 247 }, { "epoch": 0.027786336517184394, "grad_norm": 0.8588718175888062, "learning_rate": 0.00019733183856502243, "loss": 1.8003, "step": 248 }, { "epoch": 0.02789837819668917, "grad_norm": 0.8918629884719849, "learning_rate": 0.00019732062780269057, "loss": 1.5776, "step": 249 }, { "epoch": 0.028010419876193945, "grad_norm": 1.1382129192352295, "learning_rate": 0.00019730941704035874, "loss": 1.9343, "step": 250 }, { "epoch": 0.02812246155569872, "grad_norm": 1.3021328449249268, "learning_rate": 0.0001972982062780269, "loss": 2.0429, "step": 251 }, { "epoch": 0.028234503235203497, "grad_norm": 1.5220530033111572, "learning_rate": 0.00019728699551569507, "loss": 1.5295, "step": 252 }, { "epoch": 0.028346544914708272, "grad_norm": 0.8310325145721436, "learning_rate": 0.00019727578475336324, "loss": 1.4177, "step": 253 }, { "epoch": 0.028458586594213048, "grad_norm": 2.0889320373535156, "learning_rate": 0.0001972645739910314, "loss": 1.6004, "step": 254 }, { "epoch": 0.028570628273717823, "grad_norm": 0.8648313283920288, "learning_rate": 0.00019725336322869955, "loss": 1.7022, "step": 255 }, { "epoch": 0.0286826699532226, "grad_norm": 1.2006266117095947, "learning_rate": 0.00019724215246636772, "loss": 1.5645, "step": 256 }, { "epoch": 0.028794711632727375, "grad_norm": 1.2655441761016846, "learning_rate": 0.00019723094170403588, "loss": 1.4247, "step": 257 }, { "epoch": 0.02890675331223215, "grad_norm": 0.8098933696746826, "learning_rate": 0.00019721973094170405, "loss": 1.2999, "step": 258 }, { "epoch": 0.029018794991736926, "grad_norm": 2.2496917247772217, "learning_rate": 0.00019720852017937222, "loss": 1.9521, "step": 259 }, { "epoch": 0.0291308366712417, "grad_norm": 2.451916217803955, "learning_rate": 0.00019719730941704039, "loss": 1.8154, "step": 260 }, { "epoch": 0.029242878350746477, "grad_norm": 0.8755319714546204, "learning_rate": 0.00019718609865470853, "loss": 1.601, "step": 261 }, { "epoch": 0.029354920030251253, "grad_norm": 2.2917873859405518, "learning_rate": 0.0001971748878923767, "loss": 1.6173, "step": 262 }, { "epoch": 0.029466961709756028, "grad_norm": 1.2772296667099, "learning_rate": 0.00019716367713004483, "loss": 1.9988, "step": 263 }, { "epoch": 0.029579003389260804, "grad_norm": 1.4274053573608398, "learning_rate": 0.000197152466367713, "loss": 1.4551, "step": 264 }, { "epoch": 0.02969104506876558, "grad_norm": 1.1452568769454956, "learning_rate": 0.00019714125560538117, "loss": 1.4996, "step": 265 }, { "epoch": 0.029803086748270355, "grad_norm": 0.9970101714134216, "learning_rate": 0.00019713004484304934, "loss": 1.479, "step": 266 }, { "epoch": 0.029915128427775134, "grad_norm": 0.9015843272209167, "learning_rate": 0.0001971188340807175, "loss": 1.7147, "step": 267 }, { "epoch": 0.03002717010727991, "grad_norm": 1.0109182596206665, "learning_rate": 0.00019710762331838567, "loss": 1.6371, "step": 268 }, { "epoch": 0.030139211786784685, "grad_norm": 1.1231122016906738, "learning_rate": 0.0001970964125560538, "loss": 2.1375, "step": 269 }, { "epoch": 0.03025125346628946, "grad_norm": 1.2176053524017334, "learning_rate": 0.00019708520179372198, "loss": 1.7811, "step": 270 }, { "epoch": 0.030363295145794236, "grad_norm": 1.87118399143219, "learning_rate": 0.00019707399103139015, "loss": 2.1257, "step": 271 }, { "epoch": 0.030475336825299012, "grad_norm": 1.2000921964645386, "learning_rate": 0.0001970627802690583, "loss": 1.3879, "step": 272 }, { "epoch": 0.030587378504803787, "grad_norm": 1.4533759355545044, "learning_rate": 0.00019705156950672648, "loss": 1.7463, "step": 273 }, { "epoch": 0.030699420184308563, "grad_norm": 3.009805917739868, "learning_rate": 0.00019704035874439465, "loss": 1.3139, "step": 274 }, { "epoch": 0.03081146186381334, "grad_norm": 1.049795389175415, "learning_rate": 0.0001970291479820628, "loss": 1.339, "step": 275 }, { "epoch": 0.030923503543318114, "grad_norm": 2.122925043106079, "learning_rate": 0.00019701793721973093, "loss": 1.631, "step": 276 }, { "epoch": 0.03103554522282289, "grad_norm": 0.8711285591125488, "learning_rate": 0.0001970067264573991, "loss": 1.5981, "step": 277 }, { "epoch": 0.031147586902327665, "grad_norm": 1.6317955255508423, "learning_rate": 0.00019699551569506726, "loss": 1.7886, "step": 278 }, { "epoch": 0.03125962858183244, "grad_norm": 1.8461803197860718, "learning_rate": 0.00019698430493273543, "loss": 1.7766, "step": 279 }, { "epoch": 0.03137167026133722, "grad_norm": 1.2124357223510742, "learning_rate": 0.0001969730941704036, "loss": 1.6553, "step": 280 }, { "epoch": 0.03148371194084199, "grad_norm": 1.5174574851989746, "learning_rate": 0.00019696188340807177, "loss": 1.7814, "step": 281 }, { "epoch": 0.03159575362034677, "grad_norm": 1.3527004718780518, "learning_rate": 0.00019695067264573993, "loss": 1.7768, "step": 282 }, { "epoch": 0.03170779529985154, "grad_norm": 1.013122797012329, "learning_rate": 0.00019693946188340807, "loss": 1.4298, "step": 283 }, { "epoch": 0.03181983697935632, "grad_norm": 1.4962410926818848, "learning_rate": 0.00019692825112107624, "loss": 1.6015, "step": 284 }, { "epoch": 0.031931878658861094, "grad_norm": 0.9314521551132202, "learning_rate": 0.0001969170403587444, "loss": 1.7104, "step": 285 }, { "epoch": 0.032043920338365874, "grad_norm": 2.6918692588806152, "learning_rate": 0.00019690582959641258, "loss": 2.1252, "step": 286 }, { "epoch": 0.032155962017870646, "grad_norm": 1.280490517616272, "learning_rate": 0.00019689461883408074, "loss": 1.8901, "step": 287 }, { "epoch": 0.032268003697375425, "grad_norm": 1.486735224723816, "learning_rate": 0.0001968834080717489, "loss": 1.6796, "step": 288 }, { "epoch": 0.0323800453768802, "grad_norm": 1.3431689739227295, "learning_rate": 0.00019687219730941705, "loss": 1.7045, "step": 289 }, { "epoch": 0.032492087056384976, "grad_norm": 1.7347776889801025, "learning_rate": 0.0001968609865470852, "loss": 1.727, "step": 290 }, { "epoch": 0.03260412873588975, "grad_norm": 0.930458664894104, "learning_rate": 0.00019684977578475336, "loss": 1.9726, "step": 291 }, { "epoch": 0.03271617041539453, "grad_norm": 1.110557198524475, "learning_rate": 0.00019683856502242152, "loss": 1.6199, "step": 292 }, { "epoch": 0.0328282120948993, "grad_norm": 1.7467089891433716, "learning_rate": 0.0001968273542600897, "loss": 1.7755, "step": 293 }, { "epoch": 0.03294025377440408, "grad_norm": 1.788590908050537, "learning_rate": 0.00019681614349775786, "loss": 2.1834, "step": 294 }, { "epoch": 0.03305229545390886, "grad_norm": 0.905660092830658, "learning_rate": 0.00019680493273542603, "loss": 1.2032, "step": 295 }, { "epoch": 0.03316433713341363, "grad_norm": 1.477889060974121, "learning_rate": 0.00019679372197309417, "loss": 1.8476, "step": 296 }, { "epoch": 0.03327637881291841, "grad_norm": 0.6667436957359314, "learning_rate": 0.00019678251121076233, "loss": 1.7266, "step": 297 }, { "epoch": 0.03338842049242318, "grad_norm": 1.647091031074524, "learning_rate": 0.0001967713004484305, "loss": 2.1291, "step": 298 }, { "epoch": 0.03350046217192796, "grad_norm": 1.1983885765075684, "learning_rate": 0.00019676008968609867, "loss": 1.999, "step": 299 }, { "epoch": 0.03361250385143273, "grad_norm": 1.436549186706543, "learning_rate": 0.00019674887892376684, "loss": 1.905, "step": 300 }, { "epoch": 0.03372454553093751, "grad_norm": 1.6125506162643433, "learning_rate": 0.000196737668161435, "loss": 2.0037, "step": 301 }, { "epoch": 0.03383658721044228, "grad_norm": 1.4619356393814087, "learning_rate": 0.00019672645739910317, "loss": 1.8324, "step": 302 }, { "epoch": 0.03394862888994706, "grad_norm": 1.3705886602401733, "learning_rate": 0.0001967152466367713, "loss": 2.0903, "step": 303 }, { "epoch": 0.034060670569451834, "grad_norm": 1.2136726379394531, "learning_rate": 0.00019670403587443945, "loss": 1.3843, "step": 304 }, { "epoch": 0.03417271224895661, "grad_norm": 1.1028761863708496, "learning_rate": 0.00019669282511210762, "loss": 2.1832, "step": 305 }, { "epoch": 0.034284753928461385, "grad_norm": 1.3862169981002808, "learning_rate": 0.0001966816143497758, "loss": 1.9416, "step": 306 }, { "epoch": 0.034396795607966164, "grad_norm": 0.7595400810241699, "learning_rate": 0.00019667040358744395, "loss": 1.8054, "step": 307 }, { "epoch": 0.034508837287470936, "grad_norm": 1.1747914552688599, "learning_rate": 0.00019665919282511212, "loss": 1.7478, "step": 308 }, { "epoch": 0.034620878966975716, "grad_norm": 0.7822675704956055, "learning_rate": 0.0001966479820627803, "loss": 1.7089, "step": 309 }, { "epoch": 0.03473292064648049, "grad_norm": 1.1017740964889526, "learning_rate": 0.00019663677130044843, "loss": 1.4109, "step": 310 }, { "epoch": 0.03484496232598527, "grad_norm": 0.9330350160598755, "learning_rate": 0.0001966255605381166, "loss": 1.8018, "step": 311 }, { "epoch": 0.03495700400549004, "grad_norm": 1.2532985210418701, "learning_rate": 0.00019661434977578476, "loss": 1.865, "step": 312 }, { "epoch": 0.03506904568499482, "grad_norm": 1.2453970909118652, "learning_rate": 0.00019660313901345293, "loss": 1.8382, "step": 313 }, { "epoch": 0.0351810873644996, "grad_norm": 1.4777342081069946, "learning_rate": 0.0001965919282511211, "loss": 1.6031, "step": 314 }, { "epoch": 0.03529312904400437, "grad_norm": 1.061545729637146, "learning_rate": 0.00019658071748878927, "loss": 1.1069, "step": 315 }, { "epoch": 0.03540517072350915, "grad_norm": 1.1150285005569458, "learning_rate": 0.0001965695067264574, "loss": 2.1224, "step": 316 }, { "epoch": 0.03551721240301392, "grad_norm": 0.8883174061775208, "learning_rate": 0.00019655829596412557, "loss": 1.6756, "step": 317 }, { "epoch": 0.0356292540825187, "grad_norm": 1.2782224416732788, "learning_rate": 0.00019654708520179371, "loss": 1.5956, "step": 318 }, { "epoch": 0.03574129576202347, "grad_norm": 1.3573459386825562, "learning_rate": 0.00019653587443946188, "loss": 2.0242, "step": 319 }, { "epoch": 0.03585333744152825, "grad_norm": 1.4004002809524536, "learning_rate": 0.00019652466367713005, "loss": 0.9949, "step": 320 }, { "epoch": 0.03596537912103302, "grad_norm": 1.0463887453079224, "learning_rate": 0.00019651345291479822, "loss": 1.97, "step": 321 }, { "epoch": 0.0360774208005378, "grad_norm": 1.7141753435134888, "learning_rate": 0.00019650224215246638, "loss": 1.8385, "step": 322 }, { "epoch": 0.036189462480042574, "grad_norm": 0.9977089762687683, "learning_rate": 0.00019649103139013455, "loss": 2.0595, "step": 323 }, { "epoch": 0.03630150415954735, "grad_norm": 1.2370812892913818, "learning_rate": 0.0001964798206278027, "loss": 1.8646, "step": 324 }, { "epoch": 0.036413545839052125, "grad_norm": 2.370875120162964, "learning_rate": 0.00019646860986547086, "loss": 1.5712, "step": 325 }, { "epoch": 0.036525587518556904, "grad_norm": 1.1011273860931396, "learning_rate": 0.00019645739910313903, "loss": 1.3211, "step": 326 }, { "epoch": 0.036637629198061676, "grad_norm": 1.0241345167160034, "learning_rate": 0.0001964461883408072, "loss": 1.5538, "step": 327 }, { "epoch": 0.036749670877566455, "grad_norm": 0.8308309316635132, "learning_rate": 0.00019643497757847536, "loss": 1.6208, "step": 328 }, { "epoch": 0.03686171255707123, "grad_norm": 1.0683560371398926, "learning_rate": 0.00019642376681614353, "loss": 1.6646, "step": 329 }, { "epoch": 0.036973754236576006, "grad_norm": 1.338710069656372, "learning_rate": 0.00019641255605381167, "loss": 1.6107, "step": 330 }, { "epoch": 0.037085795916080785, "grad_norm": 1.3803410530090332, "learning_rate": 0.0001964013452914798, "loss": 1.526, "step": 331 }, { "epoch": 0.03719783759558556, "grad_norm": 1.3309446573257446, "learning_rate": 0.00019639013452914798, "loss": 1.5938, "step": 332 }, { "epoch": 0.03730987927509034, "grad_norm": 1.2669615745544434, "learning_rate": 0.00019637892376681614, "loss": 1.6408, "step": 333 }, { "epoch": 0.03742192095459511, "grad_norm": 0.8464401960372925, "learning_rate": 0.0001963677130044843, "loss": 1.5998, "step": 334 }, { "epoch": 0.03753396263409989, "grad_norm": 1.2851347923278809, "learning_rate": 0.00019635650224215248, "loss": 1.8217, "step": 335 }, { "epoch": 0.03764600431360466, "grad_norm": 1.2095690965652466, "learning_rate": 0.00019634529147982064, "loss": 2.3131, "step": 336 }, { "epoch": 0.03775804599310944, "grad_norm": 1.5301982164382935, "learning_rate": 0.0001963340807174888, "loss": 1.6189, "step": 337 }, { "epoch": 0.03787008767261421, "grad_norm": 1.4262126684188843, "learning_rate": 0.00019632286995515695, "loss": 1.6474, "step": 338 }, { "epoch": 0.03798212935211899, "grad_norm": 0.9846980571746826, "learning_rate": 0.00019631165919282512, "loss": 2.1263, "step": 339 }, { "epoch": 0.03809417103162376, "grad_norm": 1.2180737257003784, "learning_rate": 0.0001963004484304933, "loss": 1.3994, "step": 340 }, { "epoch": 0.03820621271112854, "grad_norm": 1.1930640935897827, "learning_rate": 0.00019628923766816145, "loss": 1.8619, "step": 341 }, { "epoch": 0.03831825439063331, "grad_norm": 1.9225149154663086, "learning_rate": 0.00019627802690582962, "loss": 1.9804, "step": 342 }, { "epoch": 0.03843029607013809, "grad_norm": 0.9720186591148376, "learning_rate": 0.0001962668161434978, "loss": 1.5603, "step": 343 }, { "epoch": 0.038542337749642865, "grad_norm": 0.8619855642318726, "learning_rate": 0.00019625560538116593, "loss": 1.5737, "step": 344 }, { "epoch": 0.038654379429147644, "grad_norm": 1.7882968187332153, "learning_rate": 0.00019624439461883407, "loss": 1.7441, "step": 345 }, { "epoch": 0.038766421108652416, "grad_norm": 1.0049161911010742, "learning_rate": 0.00019623318385650224, "loss": 1.6536, "step": 346 }, { "epoch": 0.038878462788157195, "grad_norm": 1.528041958808899, "learning_rate": 0.0001962219730941704, "loss": 2.0649, "step": 347 }, { "epoch": 0.03899050446766197, "grad_norm": 1.329078197479248, "learning_rate": 0.00019621076233183857, "loss": 1.9582, "step": 348 }, { "epoch": 0.039102546147166746, "grad_norm": 0.8762238621711731, "learning_rate": 0.00019619955156950674, "loss": 1.3453, "step": 349 }, { "epoch": 0.039214587826671525, "grad_norm": 0.9069124460220337, "learning_rate": 0.0001961883408071749, "loss": 1.5871, "step": 350 }, { "epoch": 0.0393266295061763, "grad_norm": 1.2021558284759521, "learning_rate": 0.00019617713004484305, "loss": 1.4283, "step": 351 }, { "epoch": 0.039438671185681076, "grad_norm": 1.1997159719467163, "learning_rate": 0.00019616591928251121, "loss": 1.3346, "step": 352 }, { "epoch": 0.03955071286518585, "grad_norm": 1.017584204673767, "learning_rate": 0.00019615470852017938, "loss": 1.8129, "step": 353 }, { "epoch": 0.03966275454469063, "grad_norm": 1.6694780588150024, "learning_rate": 0.00019614349775784755, "loss": 1.7362, "step": 354 }, { "epoch": 0.0397747962241954, "grad_norm": 1.4448509216308594, "learning_rate": 0.00019613228699551572, "loss": 1.2625, "step": 355 }, { "epoch": 0.03988683790370018, "grad_norm": 1.0976581573486328, "learning_rate": 0.00019612107623318388, "loss": 2.4105, "step": 356 }, { "epoch": 0.03999887958320495, "grad_norm": 0.8776283860206604, "learning_rate": 0.00019610986547085202, "loss": 1.3558, "step": 357 }, { "epoch": 0.04011092126270973, "grad_norm": 1.4431874752044678, "learning_rate": 0.0001960986547085202, "loss": 1.6871, "step": 358 }, { "epoch": 0.0402229629422145, "grad_norm": 0.9691896438598633, "learning_rate": 0.00019608744394618833, "loss": 1.9369, "step": 359 }, { "epoch": 0.04033500462171928, "grad_norm": 1.2434937953948975, "learning_rate": 0.0001960762331838565, "loss": 2.1344, "step": 360 }, { "epoch": 0.04044704630122405, "grad_norm": 1.5863988399505615, "learning_rate": 0.00019606502242152467, "loss": 2.0058, "step": 361 }, { "epoch": 0.04055908798072883, "grad_norm": 1.3439626693725586, "learning_rate": 0.00019605381165919283, "loss": 1.4449, "step": 362 }, { "epoch": 0.040671129660233604, "grad_norm": 1.1116595268249512, "learning_rate": 0.000196042600896861, "loss": 1.735, "step": 363 }, { "epoch": 0.04078317133973838, "grad_norm": 2.0043749809265137, "learning_rate": 0.00019603139013452917, "loss": 2.1156, "step": 364 }, { "epoch": 0.040895213019243155, "grad_norm": 1.6407873630523682, "learning_rate": 0.0001960201793721973, "loss": 1.9848, "step": 365 }, { "epoch": 0.041007254698747934, "grad_norm": 1.0164940357208252, "learning_rate": 0.00019600896860986548, "loss": 1.8227, "step": 366 }, { "epoch": 0.04111929637825271, "grad_norm": 1.5119026899337769, "learning_rate": 0.00019599775784753364, "loss": 1.7301, "step": 367 }, { "epoch": 0.041231338057757486, "grad_norm": 1.3717575073242188, "learning_rate": 0.0001959865470852018, "loss": 1.4965, "step": 368 }, { "epoch": 0.041343379737262265, "grad_norm": 1.2949564456939697, "learning_rate": 0.00019597533632286998, "loss": 1.8325, "step": 369 }, { "epoch": 0.04145542141676704, "grad_norm": 0.8135946393013, "learning_rate": 0.00019596412556053815, "loss": 1.4313, "step": 370 }, { "epoch": 0.041567463096271816, "grad_norm": 0.7401415109634399, "learning_rate": 0.00019595291479820629, "loss": 1.5707, "step": 371 }, { "epoch": 0.04167950477577659, "grad_norm": 1.7585980892181396, "learning_rate": 0.00019594170403587445, "loss": 1.5784, "step": 372 }, { "epoch": 0.04179154645528137, "grad_norm": 1.3463921546936035, "learning_rate": 0.0001959304932735426, "loss": 0.9824, "step": 373 }, { "epoch": 0.04190358813478614, "grad_norm": 1.2857451438903809, "learning_rate": 0.00019591928251121076, "loss": 1.7876, "step": 374 }, { "epoch": 0.04201562981429092, "grad_norm": 1.6063556671142578, "learning_rate": 0.00019590807174887893, "loss": 1.7668, "step": 375 }, { "epoch": 0.04212767149379569, "grad_norm": 0.9151869416236877, "learning_rate": 0.0001958968609865471, "loss": 1.5426, "step": 376 }, { "epoch": 0.04223971317330047, "grad_norm": 1.2657008171081543, "learning_rate": 0.00019588565022421526, "loss": 1.4953, "step": 377 }, { "epoch": 0.04235175485280524, "grad_norm": 1.253432273864746, "learning_rate": 0.00019587443946188343, "loss": 1.6112, "step": 378 }, { "epoch": 0.04246379653231002, "grad_norm": 1.0362651348114014, "learning_rate": 0.00019586322869955157, "loss": 1.0107, "step": 379 }, { "epoch": 0.04257583821181479, "grad_norm": 1.1112746000289917, "learning_rate": 0.00019585201793721974, "loss": 1.522, "step": 380 }, { "epoch": 0.04268787989131957, "grad_norm": 1.555154800415039, "learning_rate": 0.0001958408071748879, "loss": 1.7784, "step": 381 }, { "epoch": 0.042799921570824344, "grad_norm": 1.1595889329910278, "learning_rate": 0.00019582959641255607, "loss": 1.5826, "step": 382 }, { "epoch": 0.04291196325032912, "grad_norm": 1.9532643556594849, "learning_rate": 0.00019581838565022424, "loss": 1.8004, "step": 383 }, { "epoch": 0.043024004929833895, "grad_norm": 1.1293835639953613, "learning_rate": 0.00019580717488789238, "loss": 1.5174, "step": 384 }, { "epoch": 0.043136046609338674, "grad_norm": 1.4853321313858032, "learning_rate": 0.00019579596412556055, "loss": 1.6396, "step": 385 }, { "epoch": 0.04324808828884345, "grad_norm": 2.342992067337036, "learning_rate": 0.0001957847533632287, "loss": 1.6274, "step": 386 }, { "epoch": 0.043360129968348225, "grad_norm": 2.0252702236175537, "learning_rate": 0.00019577354260089686, "loss": 1.416, "step": 387 }, { "epoch": 0.043472171647853004, "grad_norm": 1.7428449392318726, "learning_rate": 0.00019576233183856502, "loss": 1.5184, "step": 388 }, { "epoch": 0.043584213327357776, "grad_norm": 1.0605696439743042, "learning_rate": 0.0001957511210762332, "loss": 1.3066, "step": 389 }, { "epoch": 0.043696255006862555, "grad_norm": 3.658942937850952, "learning_rate": 0.00019573991031390136, "loss": 1.3568, "step": 390 }, { "epoch": 0.04380829668636733, "grad_norm": 1.1926772594451904, "learning_rate": 0.00019572869955156952, "loss": 1.6639, "step": 391 }, { "epoch": 0.04392033836587211, "grad_norm": 1.6239262819290161, "learning_rate": 0.0001957174887892377, "loss": 1.8316, "step": 392 }, { "epoch": 0.04403238004537688, "grad_norm": 1.5452934503555298, "learning_rate": 0.00019570627802690583, "loss": 1.9652, "step": 393 }, { "epoch": 0.04414442172488166, "grad_norm": 1.6719918251037598, "learning_rate": 0.000195695067264574, "loss": 1.8504, "step": 394 }, { "epoch": 0.04425646340438643, "grad_norm": 1.3847441673278809, "learning_rate": 0.00019568385650224217, "loss": 2.2518, "step": 395 }, { "epoch": 0.04436850508389121, "grad_norm": 1.0616874694824219, "learning_rate": 0.00019567264573991033, "loss": 1.8741, "step": 396 }, { "epoch": 0.04448054676339598, "grad_norm": 1.1261235475540161, "learning_rate": 0.0001956614349775785, "loss": 1.4618, "step": 397 }, { "epoch": 0.04459258844290076, "grad_norm": 2.4279415607452393, "learning_rate": 0.00019565022421524664, "loss": 1.7089, "step": 398 }, { "epoch": 0.04470463012240553, "grad_norm": 0.9406999945640564, "learning_rate": 0.0001956390134529148, "loss": 1.656, "step": 399 }, { "epoch": 0.04481667180191031, "grad_norm": 0.9876976013183594, "learning_rate": 0.00019562780269058295, "loss": 1.9191, "step": 400 }, { "epoch": 0.044928713481415083, "grad_norm": 1.2022384405136108, "learning_rate": 0.00019561659192825112, "loss": 1.4566, "step": 401 }, { "epoch": 0.04504075516091986, "grad_norm": 1.5037908554077148, "learning_rate": 0.00019560538116591928, "loss": 1.6817, "step": 402 }, { "epoch": 0.045152796840424635, "grad_norm": 0.8299053311347961, "learning_rate": 0.00019559417040358745, "loss": 1.6755, "step": 403 }, { "epoch": 0.045264838519929414, "grad_norm": 1.238200306892395, "learning_rate": 0.00019558295964125562, "loss": 1.1448, "step": 404 }, { "epoch": 0.04537688019943419, "grad_norm": 1.0826996564865112, "learning_rate": 0.00019557174887892379, "loss": 1.8699, "step": 405 }, { "epoch": 0.045488921878938965, "grad_norm": 1.012237548828125, "learning_rate": 0.00019556053811659195, "loss": 1.3251, "step": 406 }, { "epoch": 0.045600963558443744, "grad_norm": 0.9314955472946167, "learning_rate": 0.0001955493273542601, "loss": 1.8235, "step": 407 }, { "epoch": 0.045713005237948516, "grad_norm": 1.8796555995941162, "learning_rate": 0.00019553811659192826, "loss": 1.7569, "step": 408 }, { "epoch": 0.045825046917453295, "grad_norm": 1.3469252586364746, "learning_rate": 0.00019552690582959643, "loss": 1.7263, "step": 409 }, { "epoch": 0.04593708859695807, "grad_norm": 1.0205450057983398, "learning_rate": 0.0001955156950672646, "loss": 1.4553, "step": 410 }, { "epoch": 0.046049130276462846, "grad_norm": 0.8612064719200134, "learning_rate": 0.00019550448430493276, "loss": 1.4138, "step": 411 }, { "epoch": 0.04616117195596762, "grad_norm": 1.3465920686721802, "learning_rate": 0.0001954932735426009, "loss": 2.0413, "step": 412 }, { "epoch": 0.0462732136354724, "grad_norm": 1.3121436834335327, "learning_rate": 0.00019548206278026907, "loss": 1.2605, "step": 413 }, { "epoch": 0.04638525531497717, "grad_norm": 1.4270296096801758, "learning_rate": 0.0001954708520179372, "loss": 1.997, "step": 414 }, { "epoch": 0.04649729699448195, "grad_norm": 0.8366150259971619, "learning_rate": 0.00019545964125560538, "loss": 1.5546, "step": 415 }, { "epoch": 0.04660933867398672, "grad_norm": 2.7186005115509033, "learning_rate": 0.00019544843049327355, "loss": 1.1515, "step": 416 }, { "epoch": 0.0467213803534915, "grad_norm": 1.0428087711334229, "learning_rate": 0.0001954372197309417, "loss": 1.8941, "step": 417 }, { "epoch": 0.04683342203299627, "grad_norm": 1.7345372438430786, "learning_rate": 0.00019542600896860988, "loss": 1.5902, "step": 418 }, { "epoch": 0.04694546371250105, "grad_norm": 0.7889722585678101, "learning_rate": 0.00019541479820627805, "loss": 1.5483, "step": 419 }, { "epoch": 0.04705750539200582, "grad_norm": 0.8405618071556091, "learning_rate": 0.0001954035874439462, "loss": 1.6097, "step": 420 }, { "epoch": 0.0471695470715106, "grad_norm": 1.5713777542114258, "learning_rate": 0.00019539237668161436, "loss": 1.7015, "step": 421 }, { "epoch": 0.047281588751015374, "grad_norm": 1.4724451303482056, "learning_rate": 0.00019538116591928252, "loss": 1.6705, "step": 422 }, { "epoch": 0.04739363043052015, "grad_norm": 1.2453540563583374, "learning_rate": 0.0001953699551569507, "loss": 1.7336, "step": 423 }, { "epoch": 0.04750567211002493, "grad_norm": 1.6392441987991333, "learning_rate": 0.00019535874439461886, "loss": 1.6482, "step": 424 }, { "epoch": 0.047617713789529705, "grad_norm": 1.2431135177612305, "learning_rate": 0.000195347533632287, "loss": 1.8787, "step": 425 }, { "epoch": 0.047729755469034484, "grad_norm": 1.403626799583435, "learning_rate": 0.00019533632286995517, "loss": 1.8271, "step": 426 }, { "epoch": 0.047841797148539256, "grad_norm": 0.9284049272537231, "learning_rate": 0.00019532511210762333, "loss": 1.7346, "step": 427 }, { "epoch": 0.047953838828044035, "grad_norm": 0.7753445506095886, "learning_rate": 0.00019531390134529147, "loss": 1.6563, "step": 428 }, { "epoch": 0.04806588050754881, "grad_norm": 2.99448299407959, "learning_rate": 0.00019530269058295964, "loss": 1.864, "step": 429 }, { "epoch": 0.048177922187053586, "grad_norm": 2.515038013458252, "learning_rate": 0.0001952914798206278, "loss": 2.1239, "step": 430 }, { "epoch": 0.04828996386655836, "grad_norm": 1.0251903533935547, "learning_rate": 0.00019528026905829598, "loss": 1.2749, "step": 431 }, { "epoch": 0.04840200554606314, "grad_norm": 0.7387140393257141, "learning_rate": 0.00019526905829596414, "loss": 1.5976, "step": 432 }, { "epoch": 0.04851404722556791, "grad_norm": 1.4521501064300537, "learning_rate": 0.0001952578475336323, "loss": 1.3855, "step": 433 }, { "epoch": 0.04862608890507269, "grad_norm": 1.8010774850845337, "learning_rate": 0.00019524663677130045, "loss": 1.8613, "step": 434 }, { "epoch": 0.04873813058457746, "grad_norm": 1.1915080547332764, "learning_rate": 0.00019523542600896862, "loss": 2.2927, "step": 435 }, { "epoch": 0.04885017226408224, "grad_norm": 1.3017874956130981, "learning_rate": 0.00019522421524663678, "loss": 1.6742, "step": 436 }, { "epoch": 0.04896221394358701, "grad_norm": 1.4115819931030273, "learning_rate": 0.00019521300448430495, "loss": 1.2849, "step": 437 }, { "epoch": 0.04907425562309179, "grad_norm": 0.7979238629341125, "learning_rate": 0.00019520179372197312, "loss": 1.8925, "step": 438 }, { "epoch": 0.04918629730259656, "grad_norm": 1.8239283561706543, "learning_rate": 0.00019519058295964126, "loss": 1.3358, "step": 439 }, { "epoch": 0.04929833898210134, "grad_norm": 1.5216443538665771, "learning_rate": 0.00019517937219730943, "loss": 2.0686, "step": 440 }, { "epoch": 0.04941038066160612, "grad_norm": 1.3584249019622803, "learning_rate": 0.0001951681614349776, "loss": 2.0896, "step": 441 }, { "epoch": 0.04952242234111089, "grad_norm": 0.9599785208702087, "learning_rate": 0.00019515695067264573, "loss": 1.5626, "step": 442 }, { "epoch": 0.04963446402061567, "grad_norm": 0.9808817505836487, "learning_rate": 0.0001951457399103139, "loss": 1.8718, "step": 443 }, { "epoch": 0.049746505700120444, "grad_norm": 0.7888431549072266, "learning_rate": 0.00019513452914798207, "loss": 1.4428, "step": 444 }, { "epoch": 0.04985854737962522, "grad_norm": 2.3148183822631836, "learning_rate": 0.00019512331838565024, "loss": 1.8029, "step": 445 }, { "epoch": 0.049970589059129995, "grad_norm": 1.2516789436340332, "learning_rate": 0.0001951121076233184, "loss": 1.9047, "step": 446 }, { "epoch": 0.050082630738634774, "grad_norm": 1.9923287630081177, "learning_rate": 0.00019510089686098657, "loss": 1.537, "step": 447 }, { "epoch": 0.050194672418139546, "grad_norm": 0.9905103445053101, "learning_rate": 0.0001950896860986547, "loss": 1.4308, "step": 448 }, { "epoch": 0.050306714097644326, "grad_norm": 1.121449589729309, "learning_rate": 0.00019507847533632288, "loss": 1.6683, "step": 449 }, { "epoch": 0.0504187557771491, "grad_norm": 1.0734014511108398, "learning_rate": 0.00019506726457399105, "loss": 1.6593, "step": 450 }, { "epoch": 0.05053079745665388, "grad_norm": 1.0387274026870728, "learning_rate": 0.00019505605381165921, "loss": 1.5196, "step": 451 }, { "epoch": 0.05064283913615865, "grad_norm": 1.393731713294983, "learning_rate": 0.00019504484304932735, "loss": 1.751, "step": 452 }, { "epoch": 0.05075488081566343, "grad_norm": 0.8763155341148376, "learning_rate": 0.00019503363228699552, "loss": 1.8221, "step": 453 }, { "epoch": 0.0508669224951682, "grad_norm": 1.5712419748306274, "learning_rate": 0.0001950224215246637, "loss": 0.999, "step": 454 }, { "epoch": 0.05097896417467298, "grad_norm": 0.9974522590637207, "learning_rate": 0.00019501121076233183, "loss": 1.2821, "step": 455 }, { "epoch": 0.05109100585417775, "grad_norm": 0.8900431394577026, "learning_rate": 0.000195, "loss": 2.107, "step": 456 }, { "epoch": 0.05120304753368253, "grad_norm": 1.3398561477661133, "learning_rate": 0.00019498878923766816, "loss": 1.8211, "step": 457 }, { "epoch": 0.0513150892131873, "grad_norm": 1.1634259223937988, "learning_rate": 0.00019497757847533633, "loss": 1.146, "step": 458 }, { "epoch": 0.05142713089269208, "grad_norm": 1.3084980249404907, "learning_rate": 0.0001949663677130045, "loss": 1.5242, "step": 459 }, { "epoch": 0.05153917257219686, "grad_norm": 1.2305419445037842, "learning_rate": 0.00019495515695067267, "loss": 1.8817, "step": 460 }, { "epoch": 0.05165121425170163, "grad_norm": 1.3854093551635742, "learning_rate": 0.00019494394618834083, "loss": 1.8562, "step": 461 }, { "epoch": 0.05176325593120641, "grad_norm": 0.8933766484260559, "learning_rate": 0.00019493273542600897, "loss": 1.1939, "step": 462 }, { "epoch": 0.051875297610711184, "grad_norm": 0.9334263801574707, "learning_rate": 0.00019492152466367714, "loss": 1.2558, "step": 463 }, { "epoch": 0.05198733929021596, "grad_norm": 1.2120236158370972, "learning_rate": 0.0001949103139013453, "loss": 1.8497, "step": 464 }, { "epoch": 0.052099380969720735, "grad_norm": 1.6347684860229492, "learning_rate": 0.00019489910313901348, "loss": 1.9409, "step": 465 }, { "epoch": 0.052211422649225514, "grad_norm": 2.5327394008636475, "learning_rate": 0.00019488789237668162, "loss": 2.2259, "step": 466 }, { "epoch": 0.052323464328730286, "grad_norm": 1.2266803979873657, "learning_rate": 0.00019487668161434978, "loss": 1.5234, "step": 467 }, { "epoch": 0.052435506008235065, "grad_norm": 1.2157096862792969, "learning_rate": 0.00019486547085201795, "loss": 1.8274, "step": 468 }, { "epoch": 0.05254754768773984, "grad_norm": 1.2895034551620483, "learning_rate": 0.0001948542600896861, "loss": 1.5464, "step": 469 }, { "epoch": 0.052659589367244616, "grad_norm": 1.2867181301116943, "learning_rate": 0.00019484304932735426, "loss": 1.7527, "step": 470 }, { "epoch": 0.05277163104674939, "grad_norm": 1.1708359718322754, "learning_rate": 0.00019483183856502243, "loss": 2.1753, "step": 471 }, { "epoch": 0.05288367272625417, "grad_norm": 1.0046950578689575, "learning_rate": 0.0001948206278026906, "loss": 2.0765, "step": 472 }, { "epoch": 0.05299571440575894, "grad_norm": 1.1552709341049194, "learning_rate": 0.00019480941704035876, "loss": 1.4875, "step": 473 }, { "epoch": 0.05310775608526372, "grad_norm": 3.8343429565429688, "learning_rate": 0.00019479820627802693, "loss": 1.6043, "step": 474 }, { "epoch": 0.05321979776476849, "grad_norm": 1.2765480279922485, "learning_rate": 0.00019478699551569507, "loss": 1.8429, "step": 475 }, { "epoch": 0.05333183944427327, "grad_norm": 1.891106367111206, "learning_rate": 0.00019477578475336324, "loss": 1.604, "step": 476 }, { "epoch": 0.05344388112377804, "grad_norm": 1.648125171661377, "learning_rate": 0.0001947645739910314, "loss": 1.9258, "step": 477 }, { "epoch": 0.05355592280328282, "grad_norm": 0.8202097415924072, "learning_rate": 0.00019475336322869957, "loss": 1.5917, "step": 478 }, { "epoch": 0.0536679644827876, "grad_norm": 1.2674000263214111, "learning_rate": 0.00019474215246636774, "loss": 2.125, "step": 479 }, { "epoch": 0.05378000616229237, "grad_norm": 1.1370609998703003, "learning_rate": 0.00019473094170403588, "loss": 1.0458, "step": 480 }, { "epoch": 0.05389204784179715, "grad_norm": 1.2864603996276855, "learning_rate": 0.00019471973094170404, "loss": 1.4418, "step": 481 }, { "epoch": 0.05400408952130192, "grad_norm": 1.4680211544036865, "learning_rate": 0.0001947085201793722, "loss": 1.8607, "step": 482 }, { "epoch": 0.0541161312008067, "grad_norm": 1.7599356174468994, "learning_rate": 0.00019469730941704035, "loss": 1.7562, "step": 483 }, { "epoch": 0.054228172880311475, "grad_norm": 0.9292686581611633, "learning_rate": 0.00019468609865470852, "loss": 1.6821, "step": 484 }, { "epoch": 0.054340214559816254, "grad_norm": 0.7321863770484924, "learning_rate": 0.0001946748878923767, "loss": 1.815, "step": 485 }, { "epoch": 0.054452256239321026, "grad_norm": 1.0326498746871948, "learning_rate": 0.00019466367713004485, "loss": 1.5709, "step": 486 }, { "epoch": 0.054564297918825805, "grad_norm": 0.9008824825286865, "learning_rate": 0.00019465246636771302, "loss": 1.8069, "step": 487 }, { "epoch": 0.05467633959833058, "grad_norm": 1.5894330739974976, "learning_rate": 0.0001946412556053812, "loss": 1.8591, "step": 488 }, { "epoch": 0.054788381277835356, "grad_norm": 1.2621482610702515, "learning_rate": 0.00019463004484304933, "loss": 2.325, "step": 489 }, { "epoch": 0.05490042295734013, "grad_norm": 1.1861852407455444, "learning_rate": 0.0001946188340807175, "loss": 1.7992, "step": 490 }, { "epoch": 0.05501246463684491, "grad_norm": 1.1596291065216064, "learning_rate": 0.00019460762331838566, "loss": 1.8402, "step": 491 }, { "epoch": 0.05512450631634968, "grad_norm": 1.3016706705093384, "learning_rate": 0.00019459641255605383, "loss": 2.2526, "step": 492 }, { "epoch": 0.05523654799585446, "grad_norm": 1.4892476797103882, "learning_rate": 0.00019458520179372197, "loss": 1.3269, "step": 493 }, { "epoch": 0.05534858967535923, "grad_norm": 1.6941190958023071, "learning_rate": 0.00019457399103139014, "loss": 1.2917, "step": 494 }, { "epoch": 0.05546063135486401, "grad_norm": 0.7974201440811157, "learning_rate": 0.0001945627802690583, "loss": 1.5719, "step": 495 }, { "epoch": 0.05557267303436879, "grad_norm": 1.457253336906433, "learning_rate": 0.00019455156950672647, "loss": 1.3188, "step": 496 }, { "epoch": 0.05568471471387356, "grad_norm": 1.2274360656738281, "learning_rate": 0.00019454035874439461, "loss": 1.7864, "step": 497 }, { "epoch": 0.05579675639337834, "grad_norm": 2.237525463104248, "learning_rate": 0.00019452914798206278, "loss": 1.8495, "step": 498 }, { "epoch": 0.05590879807288311, "grad_norm": 1.3327926397323608, "learning_rate": 0.00019451793721973095, "loss": 0.9039, "step": 499 }, { "epoch": 0.05602083975238789, "grad_norm": 0.7876049876213074, "learning_rate": 0.00019450672645739912, "loss": 1.3663, "step": 500 }, { "epoch": 0.05613288143189266, "grad_norm": 1.9813367128372192, "learning_rate": 0.00019449551569506728, "loss": 1.4414, "step": 501 }, { "epoch": 0.05624492311139744, "grad_norm": 1.329017996788025, "learning_rate": 0.00019448430493273545, "loss": 1.6578, "step": 502 }, { "epoch": 0.056356964790902214, "grad_norm": 1.3830513954162598, "learning_rate": 0.0001944730941704036, "loss": 1.3302, "step": 503 }, { "epoch": 0.05646900647040699, "grad_norm": 1.1096640825271606, "learning_rate": 0.00019446188340807176, "loss": 1.5354, "step": 504 }, { "epoch": 0.056581048149911765, "grad_norm": 1.455906629562378, "learning_rate": 0.00019445067264573993, "loss": 1.6559, "step": 505 }, { "epoch": 0.056693089829416544, "grad_norm": 0.9843322038650513, "learning_rate": 0.0001944394618834081, "loss": 1.4746, "step": 506 }, { "epoch": 0.05680513150892132, "grad_norm": 1.704386591911316, "learning_rate": 0.00019442825112107623, "loss": 1.7783, "step": 507 }, { "epoch": 0.056917173188426096, "grad_norm": 1.5055381059646606, "learning_rate": 0.0001944170403587444, "loss": 1.3677, "step": 508 }, { "epoch": 0.05702921486793087, "grad_norm": 1.5124728679656982, "learning_rate": 0.00019440582959641257, "loss": 1.6121, "step": 509 }, { "epoch": 0.05714125654743565, "grad_norm": 3.4074649810791016, "learning_rate": 0.0001943946188340807, "loss": 1.6547, "step": 510 }, { "epoch": 0.05725329822694042, "grad_norm": 1.635071873664856, "learning_rate": 0.00019438340807174888, "loss": 1.6014, "step": 511 }, { "epoch": 0.0573653399064452, "grad_norm": 1.5960386991500854, "learning_rate": 0.00019437219730941704, "loss": 1.6351, "step": 512 }, { "epoch": 0.05747738158594997, "grad_norm": 1.1542471647262573, "learning_rate": 0.0001943609865470852, "loss": 1.8033, "step": 513 }, { "epoch": 0.05758942326545475, "grad_norm": 2.3391950130462646, "learning_rate": 0.00019434977578475338, "loss": 1.472, "step": 514 }, { "epoch": 0.05770146494495953, "grad_norm": 1.0469648838043213, "learning_rate": 0.00019433856502242155, "loss": 1.5707, "step": 515 }, { "epoch": 0.0578135066244643, "grad_norm": 1.7558293342590332, "learning_rate": 0.0001943273542600897, "loss": 1.4467, "step": 516 }, { "epoch": 0.05792554830396908, "grad_norm": 0.7670688033103943, "learning_rate": 0.00019431614349775785, "loss": 1.4948, "step": 517 }, { "epoch": 0.05803758998347385, "grad_norm": 1.607663869857788, "learning_rate": 0.00019430493273542602, "loss": 0.8478, "step": 518 }, { "epoch": 0.05814963166297863, "grad_norm": 1.3174241781234741, "learning_rate": 0.0001942937219730942, "loss": 1.7015, "step": 519 }, { "epoch": 0.0582616733424834, "grad_norm": 1.257320761680603, "learning_rate": 0.00019428251121076233, "loss": 1.5568, "step": 520 }, { "epoch": 0.05837371502198818, "grad_norm": 1.334360122680664, "learning_rate": 0.0001942713004484305, "loss": 2.0381, "step": 521 }, { "epoch": 0.058485756701492954, "grad_norm": 1.567186951637268, "learning_rate": 0.00019426008968609866, "loss": 1.3422, "step": 522 }, { "epoch": 0.05859779838099773, "grad_norm": 1.9177517890930176, "learning_rate": 0.00019424887892376683, "loss": 1.3268, "step": 523 }, { "epoch": 0.058709840060502505, "grad_norm": 1.280199646949768, "learning_rate": 0.00019423766816143497, "loss": 1.6393, "step": 524 }, { "epoch": 0.058821881740007284, "grad_norm": 1.3562207221984863, "learning_rate": 0.00019422645739910314, "loss": 1.8326, "step": 525 }, { "epoch": 0.058933923419512056, "grad_norm": 1.2781028747558594, "learning_rate": 0.0001942152466367713, "loss": 1.5, "step": 526 }, { "epoch": 0.059045965099016835, "grad_norm": 1.1996397972106934, "learning_rate": 0.00019420403587443947, "loss": 1.6628, "step": 527 }, { "epoch": 0.05915800677852161, "grad_norm": 1.3632756471633911, "learning_rate": 0.00019419282511210764, "loss": 1.5914, "step": 528 }, { "epoch": 0.059270048458026386, "grad_norm": 1.2196897268295288, "learning_rate": 0.0001941816143497758, "loss": 2.0197, "step": 529 }, { "epoch": 0.05938209013753116, "grad_norm": 0.8799840211868286, "learning_rate": 0.00019417040358744395, "loss": 2.2983, "step": 530 }, { "epoch": 0.05949413181703594, "grad_norm": 1.0671155452728271, "learning_rate": 0.00019415919282511211, "loss": 1.6146, "step": 531 }, { "epoch": 0.05960617349654071, "grad_norm": 1.0285813808441162, "learning_rate": 0.00019414798206278028, "loss": 1.5848, "step": 532 }, { "epoch": 0.05971821517604549, "grad_norm": 1.1296168565750122, "learning_rate": 0.00019413677130044845, "loss": 1.1899, "step": 533 }, { "epoch": 0.05983025685555027, "grad_norm": 1.3228754997253418, "learning_rate": 0.0001941255605381166, "loss": 1.7298, "step": 534 }, { "epoch": 0.05994229853505504, "grad_norm": 1.3209829330444336, "learning_rate": 0.00019411434977578476, "loss": 1.7438, "step": 535 }, { "epoch": 0.06005434021455982, "grad_norm": 1.6977521181106567, "learning_rate": 0.00019410313901345292, "loss": 1.5587, "step": 536 }, { "epoch": 0.06016638189406459, "grad_norm": 1.986832857131958, "learning_rate": 0.0001940919282511211, "loss": 2.1739, "step": 537 }, { "epoch": 0.06027842357356937, "grad_norm": 0.9943732023239136, "learning_rate": 0.00019408071748878923, "loss": 1.7715, "step": 538 }, { "epoch": 0.06039046525307414, "grad_norm": 1.8003402948379517, "learning_rate": 0.0001940695067264574, "loss": 1.7535, "step": 539 }, { "epoch": 0.06050250693257892, "grad_norm": 1.2386789321899414, "learning_rate": 0.00019405829596412557, "loss": 1.3448, "step": 540 }, { "epoch": 0.060614548612083693, "grad_norm": 1.526781678199768, "learning_rate": 0.00019404708520179373, "loss": 1.3277, "step": 541 }, { "epoch": 0.06072659029158847, "grad_norm": 1.0718719959259033, "learning_rate": 0.0001940358744394619, "loss": 1.7342, "step": 542 }, { "epoch": 0.060838631971093245, "grad_norm": 1.589309811592102, "learning_rate": 0.00019402466367713007, "loss": 1.6161, "step": 543 }, { "epoch": 0.060950673650598024, "grad_norm": 0.9211404919624329, "learning_rate": 0.0001940134529147982, "loss": 1.1559, "step": 544 }, { "epoch": 0.061062715330102796, "grad_norm": 1.2579773664474487, "learning_rate": 0.00019400224215246638, "loss": 1.4888, "step": 545 }, { "epoch": 0.061174757009607575, "grad_norm": 1.1922067403793335, "learning_rate": 0.00019399103139013454, "loss": 1.2771, "step": 546 }, { "epoch": 0.06128679868911235, "grad_norm": 1.1067430973052979, "learning_rate": 0.0001939798206278027, "loss": 1.0728, "step": 547 }, { "epoch": 0.061398840368617126, "grad_norm": 1.2491462230682373, "learning_rate": 0.00019396860986547085, "loss": 1.9878, "step": 548 }, { "epoch": 0.0615108820481219, "grad_norm": 1.8326877355575562, "learning_rate": 0.00019395739910313902, "loss": 1.3451, "step": 549 }, { "epoch": 0.06162292372762668, "grad_norm": 1.2168675661087036, "learning_rate": 0.00019394618834080719, "loss": 1.3351, "step": 550 }, { "epoch": 0.061734965407131456, "grad_norm": 1.781981348991394, "learning_rate": 0.00019393497757847535, "loss": 1.256, "step": 551 }, { "epoch": 0.06184700708663623, "grad_norm": 0.8874163031578064, "learning_rate": 0.0001939237668161435, "loss": 1.5759, "step": 552 }, { "epoch": 0.06195904876614101, "grad_norm": 1.7908767461776733, "learning_rate": 0.00019391255605381166, "loss": 1.8538, "step": 553 }, { "epoch": 0.06207109044564578, "grad_norm": 1.9541566371917725, "learning_rate": 0.00019390134529147983, "loss": 2.1793, "step": 554 }, { "epoch": 0.06218313212515056, "grad_norm": 1.1118524074554443, "learning_rate": 0.000193890134529148, "loss": 2.1606, "step": 555 }, { "epoch": 0.06229517380465533, "grad_norm": 5.547523021697998, "learning_rate": 0.00019387892376681616, "loss": 1.9808, "step": 556 }, { "epoch": 0.06240721548416011, "grad_norm": 0.864743709564209, "learning_rate": 0.00019386771300448433, "loss": 1.5757, "step": 557 }, { "epoch": 0.06251925716366488, "grad_norm": 2.2572951316833496, "learning_rate": 0.00019385650224215247, "loss": 1.9432, "step": 558 }, { "epoch": 0.06263129884316966, "grad_norm": 2.5195441246032715, "learning_rate": 0.00019384529147982064, "loss": 1.7142, "step": 559 }, { "epoch": 0.06274334052267444, "grad_norm": 1.440269112586975, "learning_rate": 0.0001938340807174888, "loss": 1.5233, "step": 560 }, { "epoch": 0.0628553822021792, "grad_norm": 2.0224246978759766, "learning_rate": 0.00019382286995515695, "loss": 2.2311, "step": 561 }, { "epoch": 0.06296742388168398, "grad_norm": 1.1698909997940063, "learning_rate": 0.0001938116591928251, "loss": 1.6611, "step": 562 }, { "epoch": 0.06307946556118876, "grad_norm": 1.1620304584503174, "learning_rate": 0.00019380044843049328, "loss": 1.9531, "step": 563 }, { "epoch": 0.06319150724069354, "grad_norm": 1.2336853742599487, "learning_rate": 0.00019378923766816145, "loss": 1.6202, "step": 564 }, { "epoch": 0.06330354892019831, "grad_norm": 1.0554990768432617, "learning_rate": 0.0001937780269058296, "loss": 1.5115, "step": 565 }, { "epoch": 0.06341559059970309, "grad_norm": 1.457017183303833, "learning_rate": 0.00019376681614349776, "loss": 1.5595, "step": 566 }, { "epoch": 0.06352763227920787, "grad_norm": 1.8213491439819336, "learning_rate": 0.00019375560538116592, "loss": 2.0033, "step": 567 }, { "epoch": 0.06363967395871264, "grad_norm": 1.5032572746276855, "learning_rate": 0.0001937443946188341, "loss": 1.5336, "step": 568 }, { "epoch": 0.06375171563821741, "grad_norm": 1.5448825359344482, "learning_rate": 0.00019373318385650226, "loss": 1.3652, "step": 569 }, { "epoch": 0.06386375731772219, "grad_norm": 1.9192923307418823, "learning_rate": 0.00019372197309417043, "loss": 1.4356, "step": 570 }, { "epoch": 0.06397579899722697, "grad_norm": 1.1655329465866089, "learning_rate": 0.0001937107623318386, "loss": 2.1782, "step": 571 }, { "epoch": 0.06408784067673175, "grad_norm": 1.3928945064544678, "learning_rate": 0.00019369955156950673, "loss": 2.169, "step": 572 }, { "epoch": 0.06419988235623653, "grad_norm": 1.5251027345657349, "learning_rate": 0.0001936883408071749, "loss": 1.372, "step": 573 }, { "epoch": 0.06431192403574129, "grad_norm": 1.3985291719436646, "learning_rate": 0.00019367713004484307, "loss": 2.4333, "step": 574 }, { "epoch": 0.06442396571524607, "grad_norm": 0.8858767747879028, "learning_rate": 0.0001936659192825112, "loss": 1.3738, "step": 575 }, { "epoch": 0.06453600739475085, "grad_norm": 0.9515557289123535, "learning_rate": 0.00019365470852017938, "loss": 1.5385, "step": 576 }, { "epoch": 0.06464804907425563, "grad_norm": 1.5776488780975342, "learning_rate": 0.00019364349775784754, "loss": 1.9943, "step": 577 }, { "epoch": 0.0647600907537604, "grad_norm": 0.975053608417511, "learning_rate": 0.0001936322869955157, "loss": 1.6046, "step": 578 }, { "epoch": 0.06487213243326517, "grad_norm": 1.2328945398330688, "learning_rate": 0.00019362107623318385, "loss": 1.7605, "step": 579 }, { "epoch": 0.06498417411276995, "grad_norm": 0.8134734034538269, "learning_rate": 0.00019360986547085202, "loss": 1.6539, "step": 580 }, { "epoch": 0.06509621579227473, "grad_norm": 1.0618723630905151, "learning_rate": 0.00019359865470852018, "loss": 1.4436, "step": 581 }, { "epoch": 0.0652082574717795, "grad_norm": 1.86225163936615, "learning_rate": 0.00019358744394618835, "loss": 1.4718, "step": 582 }, { "epoch": 0.06532029915128428, "grad_norm": 1.5602246522903442, "learning_rate": 0.00019357623318385652, "loss": 1.442, "step": 583 }, { "epoch": 0.06543234083078905, "grad_norm": 1.4900997877120972, "learning_rate": 0.0001935650224215247, "loss": 1.8355, "step": 584 }, { "epoch": 0.06554438251029383, "grad_norm": 2.4491498470306396, "learning_rate": 0.00019355381165919285, "loss": 1.7099, "step": 585 }, { "epoch": 0.0656564241897986, "grad_norm": 1.132898211479187, "learning_rate": 0.000193542600896861, "loss": 1.6208, "step": 586 }, { "epoch": 0.06576846586930338, "grad_norm": 3.2781739234924316, "learning_rate": 0.00019353139013452916, "loss": 1.9114, "step": 587 }, { "epoch": 0.06588050754880816, "grad_norm": 1.025285243988037, "learning_rate": 0.0001935201793721973, "loss": 1.3943, "step": 588 }, { "epoch": 0.06599254922831294, "grad_norm": 1.939928650856018, "learning_rate": 0.00019350896860986547, "loss": 1.7018, "step": 589 }, { "epoch": 0.06610459090781771, "grad_norm": 0.8566080927848816, "learning_rate": 0.00019349775784753364, "loss": 2.1022, "step": 590 }, { "epoch": 0.06621663258732248, "grad_norm": 1.2879219055175781, "learning_rate": 0.0001934865470852018, "loss": 1.7952, "step": 591 }, { "epoch": 0.06632867426682726, "grad_norm": 0.9299782514572144, "learning_rate": 0.00019347533632286997, "loss": 1.8905, "step": 592 }, { "epoch": 0.06644071594633204, "grad_norm": 1.6826132535934448, "learning_rate": 0.0001934641255605381, "loss": 1.8091, "step": 593 }, { "epoch": 0.06655275762583682, "grad_norm": 0.9929999709129333, "learning_rate": 0.00019345291479820628, "loss": 1.974, "step": 594 }, { "epoch": 0.06666479930534158, "grad_norm": 0.9665341973304749, "learning_rate": 0.00019344170403587445, "loss": 1.906, "step": 595 }, { "epoch": 0.06677684098484636, "grad_norm": 1.2471909523010254, "learning_rate": 0.00019343049327354261, "loss": 1.845, "step": 596 }, { "epoch": 0.06688888266435114, "grad_norm": 1.9563641548156738, "learning_rate": 0.00019341928251121078, "loss": 1.7998, "step": 597 }, { "epoch": 0.06700092434385592, "grad_norm": 1.5464650392532349, "learning_rate": 0.00019340807174887895, "loss": 2.2046, "step": 598 }, { "epoch": 0.06711296602336068, "grad_norm": 1.3337452411651611, "learning_rate": 0.0001933968609865471, "loss": 1.4272, "step": 599 }, { "epoch": 0.06722500770286546, "grad_norm": 2.5334770679473877, "learning_rate": 0.00019338565022421526, "loss": 2.0757, "step": 600 }, { "epoch": 0.06733704938237024, "grad_norm": 1.867078423500061, "learning_rate": 0.00019337443946188342, "loss": 1.8649, "step": 601 }, { "epoch": 0.06744909106187502, "grad_norm": 1.5922883749008179, "learning_rate": 0.00019336322869955156, "loss": 0.8325, "step": 602 }, { "epoch": 0.06756113274137979, "grad_norm": 1.1922354698181152, "learning_rate": 0.00019335201793721973, "loss": 1.9678, "step": 603 }, { "epoch": 0.06767317442088457, "grad_norm": 1.2455699443817139, "learning_rate": 0.0001933408071748879, "loss": 1.1069, "step": 604 }, { "epoch": 0.06778521610038934, "grad_norm": 1.3597272634506226, "learning_rate": 0.00019332959641255607, "loss": 1.1807, "step": 605 }, { "epoch": 0.06789725777989412, "grad_norm": 1.8468396663665771, "learning_rate": 0.00019331838565022423, "loss": 1.5257, "step": 606 }, { "epoch": 0.0680092994593989, "grad_norm": 0.7068287134170532, "learning_rate": 0.00019330717488789237, "loss": 1.9937, "step": 607 }, { "epoch": 0.06812134113890367, "grad_norm": 1.4060081243515015, "learning_rate": 0.00019329596412556054, "loss": 1.27, "step": 608 }, { "epoch": 0.06823338281840845, "grad_norm": 0.7737904787063599, "learning_rate": 0.0001932847533632287, "loss": 1.507, "step": 609 }, { "epoch": 0.06834542449791323, "grad_norm": 1.593989610671997, "learning_rate": 0.00019327354260089688, "loss": 1.6939, "step": 610 }, { "epoch": 0.068457466177418, "grad_norm": 1.7589298486709595, "learning_rate": 0.00019326233183856504, "loss": 1.8039, "step": 611 }, { "epoch": 0.06856950785692277, "grad_norm": 1.6184428930282593, "learning_rate": 0.0001932511210762332, "loss": 1.5757, "step": 612 }, { "epoch": 0.06868154953642755, "grad_norm": 1.857283592224121, "learning_rate": 0.00019323991031390135, "loss": 1.7324, "step": 613 }, { "epoch": 0.06879359121593233, "grad_norm": 1.2704938650131226, "learning_rate": 0.00019322869955156952, "loss": 1.378, "step": 614 }, { "epoch": 0.06890563289543711, "grad_norm": 1.8776118755340576, "learning_rate": 0.00019321748878923769, "loss": 2.1221, "step": 615 }, { "epoch": 0.06901767457494187, "grad_norm": 1.158272385597229, "learning_rate": 0.00019320627802690583, "loss": 1.1972, "step": 616 }, { "epoch": 0.06912971625444665, "grad_norm": 1.3880807161331177, "learning_rate": 0.000193195067264574, "loss": 1.7701, "step": 617 }, { "epoch": 0.06924175793395143, "grad_norm": 1.00984787940979, "learning_rate": 0.00019318385650224216, "loss": 1.2773, "step": 618 }, { "epoch": 0.06935379961345621, "grad_norm": 1.2057770490646362, "learning_rate": 0.00019317264573991033, "loss": 1.1083, "step": 619 }, { "epoch": 0.06946584129296098, "grad_norm": 1.2074064016342163, "learning_rate": 0.00019316143497757847, "loss": 1.1421, "step": 620 }, { "epoch": 0.06957788297246575, "grad_norm": 1.715919017791748, "learning_rate": 0.00019315022421524664, "loss": 1.8369, "step": 621 }, { "epoch": 0.06968992465197053, "grad_norm": 1.6098237037658691, "learning_rate": 0.0001931390134529148, "loss": 1.7134, "step": 622 }, { "epoch": 0.06980196633147531, "grad_norm": 1.2525181770324707, "learning_rate": 0.00019312780269058297, "loss": 1.4347, "step": 623 }, { "epoch": 0.06991400801098008, "grad_norm": 1.8823970556259155, "learning_rate": 0.00019311659192825114, "loss": 2.198, "step": 624 }, { "epoch": 0.07002604969048486, "grad_norm": 1.219985008239746, "learning_rate": 0.0001931053811659193, "loss": 1.4229, "step": 625 }, { "epoch": 0.07013809136998964, "grad_norm": 1.146316409111023, "learning_rate": 0.00019309417040358747, "loss": 1.1816, "step": 626 }, { "epoch": 0.07025013304949441, "grad_norm": 1.0068341493606567, "learning_rate": 0.0001930829596412556, "loss": 1.6907, "step": 627 }, { "epoch": 0.0703621747289992, "grad_norm": 1.6991386413574219, "learning_rate": 0.00019307174887892378, "loss": 1.3607, "step": 628 }, { "epoch": 0.07047421640850396, "grad_norm": 1.4927194118499756, "learning_rate": 0.00019306053811659192, "loss": 1.5724, "step": 629 }, { "epoch": 0.07058625808800874, "grad_norm": 1.7052232027053833, "learning_rate": 0.0001930493273542601, "loss": 1.5109, "step": 630 }, { "epoch": 0.07069829976751352, "grad_norm": 1.6351121664047241, "learning_rate": 0.00019303811659192825, "loss": 1.9138, "step": 631 }, { "epoch": 0.0708103414470183, "grad_norm": 1.0644959211349487, "learning_rate": 0.00019302690582959642, "loss": 1.5888, "step": 632 }, { "epoch": 0.07092238312652306, "grad_norm": 1.6471359729766846, "learning_rate": 0.0001930156950672646, "loss": 1.4784, "step": 633 }, { "epoch": 0.07103442480602784, "grad_norm": 1.648800253868103, "learning_rate": 0.00019300448430493273, "loss": 1.7406, "step": 634 }, { "epoch": 0.07114646648553262, "grad_norm": 1.3840559720993042, "learning_rate": 0.0001929932735426009, "loss": 1.5102, "step": 635 }, { "epoch": 0.0712585081650374, "grad_norm": 1.3428007364273071, "learning_rate": 0.00019298206278026906, "loss": 1.7919, "step": 636 }, { "epoch": 0.07137054984454216, "grad_norm": 0.9965330958366394, "learning_rate": 0.00019297085201793723, "loss": 1.4908, "step": 637 }, { "epoch": 0.07148259152404694, "grad_norm": 1.4655885696411133, "learning_rate": 0.0001929596412556054, "loss": 1.4924, "step": 638 }, { "epoch": 0.07159463320355172, "grad_norm": 1.157935619354248, "learning_rate": 0.00019294843049327357, "loss": 1.2802, "step": 639 }, { "epoch": 0.0717066748830565, "grad_norm": 1.8351969718933105, "learning_rate": 0.00019293721973094173, "loss": 1.5484, "step": 640 }, { "epoch": 0.07181871656256127, "grad_norm": 1.5923106670379639, "learning_rate": 0.00019292600896860987, "loss": 1.8617, "step": 641 }, { "epoch": 0.07193075824206605, "grad_norm": 0.8235540390014648, "learning_rate": 0.00019291479820627804, "loss": 1.5726, "step": 642 }, { "epoch": 0.07204279992157082, "grad_norm": 2.3871333599090576, "learning_rate": 0.00019290358744394618, "loss": 1.7369, "step": 643 }, { "epoch": 0.0721548416010756, "grad_norm": 1.1768988370895386, "learning_rate": 0.00019289237668161435, "loss": 1.2023, "step": 644 }, { "epoch": 0.07226688328058038, "grad_norm": 1.9633204936981201, "learning_rate": 0.00019288116591928252, "loss": 2.0607, "step": 645 }, { "epoch": 0.07237892496008515, "grad_norm": 1.1605974435806274, "learning_rate": 0.00019286995515695068, "loss": 1.5268, "step": 646 }, { "epoch": 0.07249096663958993, "grad_norm": 1.2688724994659424, "learning_rate": 0.00019285874439461885, "loss": 1.7027, "step": 647 }, { "epoch": 0.0726030083190947, "grad_norm": 0.9373829364776611, "learning_rate": 0.000192847533632287, "loss": 1.5658, "step": 648 }, { "epoch": 0.07271504999859948, "grad_norm": 1.6827270984649658, "learning_rate": 0.00019283632286995516, "loss": 1.3273, "step": 649 }, { "epoch": 0.07282709167810425, "grad_norm": 1.392282485961914, "learning_rate": 0.00019282511210762333, "loss": 1.9313, "step": 650 }, { "epoch": 0.07293913335760903, "grad_norm": 1.2647862434387207, "learning_rate": 0.0001928139013452915, "loss": 1.2022, "step": 651 }, { "epoch": 0.07305117503711381, "grad_norm": 1.8673008680343628, "learning_rate": 0.00019280269058295966, "loss": 1.6429, "step": 652 }, { "epoch": 0.07316321671661859, "grad_norm": 1.0790807008743286, "learning_rate": 0.00019279147982062783, "loss": 1.0707, "step": 653 }, { "epoch": 0.07327525839612335, "grad_norm": 1.4229792356491089, "learning_rate": 0.00019278026905829597, "loss": 2.5421, "step": 654 }, { "epoch": 0.07338730007562813, "grad_norm": 1.3801692724227905, "learning_rate": 0.00019276905829596414, "loss": 1.4029, "step": 655 }, { "epoch": 0.07349934175513291, "grad_norm": 4.27157735824585, "learning_rate": 0.00019275784753363228, "loss": 2.0188, "step": 656 }, { "epoch": 0.07361138343463769, "grad_norm": 1.2200791835784912, "learning_rate": 0.00019274663677130044, "loss": 1.7894, "step": 657 }, { "epoch": 0.07372342511414245, "grad_norm": 1.2106549739837646, "learning_rate": 0.0001927354260089686, "loss": 1.702, "step": 658 }, { "epoch": 0.07383546679364723, "grad_norm": 1.0735986232757568, "learning_rate": 0.00019272421524663678, "loss": 1.9229, "step": 659 }, { "epoch": 0.07394750847315201, "grad_norm": 1.818118691444397, "learning_rate": 0.00019271300448430495, "loss": 1.3984, "step": 660 }, { "epoch": 0.07405955015265679, "grad_norm": 0.9554075002670288, "learning_rate": 0.0001927017937219731, "loss": 1.2977, "step": 661 }, { "epoch": 0.07417159183216157, "grad_norm": 2.798494815826416, "learning_rate": 0.00019269058295964125, "loss": 1.6091, "step": 662 }, { "epoch": 0.07428363351166634, "grad_norm": 1.741694450378418, "learning_rate": 0.00019267937219730942, "loss": 1.7432, "step": 663 }, { "epoch": 0.07439567519117112, "grad_norm": 1.6263402700424194, "learning_rate": 0.0001926681614349776, "loss": 1.7858, "step": 664 }, { "epoch": 0.0745077168706759, "grad_norm": 1.250962257385254, "learning_rate": 0.00019265695067264576, "loss": 1.7349, "step": 665 }, { "epoch": 0.07461975855018067, "grad_norm": 1.0143482685089111, "learning_rate": 0.00019264573991031392, "loss": 1.4479, "step": 666 }, { "epoch": 0.07473180022968544, "grad_norm": 2.1976938247680664, "learning_rate": 0.0001926345291479821, "loss": 1.9715, "step": 667 }, { "epoch": 0.07484384190919022, "grad_norm": 1.5041617155075073, "learning_rate": 0.00019262331838565023, "loss": 1.6615, "step": 668 }, { "epoch": 0.074955883588695, "grad_norm": 1.379031777381897, "learning_rate": 0.0001926121076233184, "loss": 1.302, "step": 669 }, { "epoch": 0.07506792526819978, "grad_norm": 0.9097609519958496, "learning_rate": 0.00019260089686098654, "loss": 1.8427, "step": 670 }, { "epoch": 0.07517996694770454, "grad_norm": 1.6485527753829956, "learning_rate": 0.0001925896860986547, "loss": 1.8756, "step": 671 }, { "epoch": 0.07529200862720932, "grad_norm": 1.0029425621032715, "learning_rate": 0.00019257847533632287, "loss": 1.3802, "step": 672 }, { "epoch": 0.0754040503067141, "grad_norm": 1.4712623357772827, "learning_rate": 0.00019256726457399104, "loss": 1.7709, "step": 673 }, { "epoch": 0.07551609198621888, "grad_norm": 1.0914727449417114, "learning_rate": 0.0001925560538116592, "loss": 1.5071, "step": 674 }, { "epoch": 0.07562813366572364, "grad_norm": 1.524736762046814, "learning_rate": 0.00019254484304932737, "loss": 1.7861, "step": 675 }, { "epoch": 0.07574017534522842, "grad_norm": 1.3950635194778442, "learning_rate": 0.00019253363228699551, "loss": 1.6668, "step": 676 }, { "epoch": 0.0758522170247332, "grad_norm": 1.3355886936187744, "learning_rate": 0.00019252242152466368, "loss": 1.1069, "step": 677 }, { "epoch": 0.07596425870423798, "grad_norm": 1.4459270238876343, "learning_rate": 0.00019251121076233185, "loss": 1.297, "step": 678 }, { "epoch": 0.07607630038374275, "grad_norm": 2.76487135887146, "learning_rate": 0.00019250000000000002, "loss": 1.7925, "step": 679 }, { "epoch": 0.07618834206324752, "grad_norm": 1.0359196662902832, "learning_rate": 0.00019248878923766818, "loss": 1.6316, "step": 680 }, { "epoch": 0.0763003837427523, "grad_norm": 0.8216026425361633, "learning_rate": 0.00019247757847533635, "loss": 2.105, "step": 681 }, { "epoch": 0.07641242542225708, "grad_norm": 1.3211109638214111, "learning_rate": 0.0001924663677130045, "loss": 1.455, "step": 682 }, { "epoch": 0.07652446710176186, "grad_norm": 2.0110697746276855, "learning_rate": 0.00019245515695067266, "loss": 1.4375, "step": 683 }, { "epoch": 0.07663650878126663, "grad_norm": 1.8433934450149536, "learning_rate": 0.0001924439461883408, "loss": 1.4944, "step": 684 }, { "epoch": 0.0767485504607714, "grad_norm": 2.063809394836426, "learning_rate": 0.00019243273542600897, "loss": 2.0859, "step": 685 }, { "epoch": 0.07686059214027618, "grad_norm": 1.5021167993545532, "learning_rate": 0.00019242152466367713, "loss": 1.6151, "step": 686 }, { "epoch": 0.07697263381978096, "grad_norm": 3.8020730018615723, "learning_rate": 0.0001924103139013453, "loss": 1.6036, "step": 687 }, { "epoch": 0.07708467549928573, "grad_norm": 0.9469996690750122, "learning_rate": 0.00019239910313901347, "loss": 1.6454, "step": 688 }, { "epoch": 0.07719671717879051, "grad_norm": 1.1572556495666504, "learning_rate": 0.0001923878923766816, "loss": 1.9934, "step": 689 }, { "epoch": 0.07730875885829529, "grad_norm": 1.3121399879455566, "learning_rate": 0.00019237668161434978, "loss": 1.7716, "step": 690 }, { "epoch": 0.07742080053780007, "grad_norm": 3.092367172241211, "learning_rate": 0.00019236547085201794, "loss": 1.4252, "step": 691 }, { "epoch": 0.07753284221730483, "grad_norm": 1.2301249504089355, "learning_rate": 0.0001923542600896861, "loss": 1.6341, "step": 692 }, { "epoch": 0.07764488389680961, "grad_norm": 2.5081045627593994, "learning_rate": 0.00019234304932735428, "loss": 2.1749, "step": 693 }, { "epoch": 0.07775692557631439, "grad_norm": 1.88653564453125, "learning_rate": 0.00019233183856502245, "loss": 2.0223, "step": 694 }, { "epoch": 0.07786896725581917, "grad_norm": 0.9954336881637573, "learning_rate": 0.0001923206278026906, "loss": 1.6613, "step": 695 }, { "epoch": 0.07798100893532393, "grad_norm": 1.1692800521850586, "learning_rate": 0.00019230941704035875, "loss": 1.8653, "step": 696 }, { "epoch": 0.07809305061482871, "grad_norm": 1.79278564453125, "learning_rate": 0.0001922982062780269, "loss": 1.4322, "step": 697 }, { "epoch": 0.07820509229433349, "grad_norm": 2.6901299953460693, "learning_rate": 0.00019228699551569506, "loss": 1.9447, "step": 698 }, { "epoch": 0.07831713397383827, "grad_norm": 3.5576841831207275, "learning_rate": 0.00019227578475336323, "loss": 1.2763, "step": 699 }, { "epoch": 0.07842917565334305, "grad_norm": 1.9653180837631226, "learning_rate": 0.0001922645739910314, "loss": 1.7316, "step": 700 }, { "epoch": 0.07854121733284782, "grad_norm": 2.226872444152832, "learning_rate": 0.00019225336322869956, "loss": 1.2575, "step": 701 }, { "epoch": 0.0786532590123526, "grad_norm": 2.245013475418091, "learning_rate": 0.00019224215246636773, "loss": 2.1608, "step": 702 }, { "epoch": 0.07876530069185737, "grad_norm": 2.794711112976074, "learning_rate": 0.00019223094170403587, "loss": 1.6245, "step": 703 }, { "epoch": 0.07887734237136215, "grad_norm": 1.2057268619537354, "learning_rate": 0.00019221973094170404, "loss": 1.6231, "step": 704 }, { "epoch": 0.07898938405086692, "grad_norm": 1.2526170015335083, "learning_rate": 0.0001922085201793722, "loss": 1.1319, "step": 705 }, { "epoch": 0.0791014257303717, "grad_norm": 1.0712053775787354, "learning_rate": 0.00019219730941704037, "loss": 1.2923, "step": 706 }, { "epoch": 0.07921346740987648, "grad_norm": 1.4878727197647095, "learning_rate": 0.00019218609865470854, "loss": 1.4077, "step": 707 }, { "epoch": 0.07932550908938125, "grad_norm": 0.9522237777709961, "learning_rate": 0.0001921748878923767, "loss": 1.5971, "step": 708 }, { "epoch": 0.07943755076888602, "grad_norm": 2.0140554904937744, "learning_rate": 0.00019216367713004485, "loss": 1.4808, "step": 709 }, { "epoch": 0.0795495924483908, "grad_norm": 1.4914402961730957, "learning_rate": 0.00019215246636771302, "loss": 1.6418, "step": 710 }, { "epoch": 0.07966163412789558, "grad_norm": 1.6894279718399048, "learning_rate": 0.00019214125560538116, "loss": 1.3458, "step": 711 }, { "epoch": 0.07977367580740036, "grad_norm": 1.349097728729248, "learning_rate": 0.00019213004484304932, "loss": 1.7794, "step": 712 }, { "epoch": 0.07988571748690512, "grad_norm": 0.82957524061203, "learning_rate": 0.0001921188340807175, "loss": 1.7709, "step": 713 }, { "epoch": 0.0799977591664099, "grad_norm": 1.1667425632476807, "learning_rate": 0.00019210762331838566, "loss": 1.973, "step": 714 }, { "epoch": 0.08010980084591468, "grad_norm": 1.3632237911224365, "learning_rate": 0.00019209641255605383, "loss": 1.6385, "step": 715 }, { "epoch": 0.08022184252541946, "grad_norm": 0.9819737076759338, "learning_rate": 0.000192085201793722, "loss": 1.5238, "step": 716 }, { "epoch": 0.08033388420492424, "grad_norm": 1.420215129852295, "learning_rate": 0.00019207399103139013, "loss": 1.9979, "step": 717 }, { "epoch": 0.080445925884429, "grad_norm": 1.40873384475708, "learning_rate": 0.0001920627802690583, "loss": 1.3569, "step": 718 }, { "epoch": 0.08055796756393378, "grad_norm": 1.2131725549697876, "learning_rate": 0.00019205156950672647, "loss": 1.7278, "step": 719 }, { "epoch": 0.08067000924343856, "grad_norm": 1.748119592666626, "learning_rate": 0.00019204035874439463, "loss": 1.5855, "step": 720 }, { "epoch": 0.08078205092294334, "grad_norm": 1.2138866186141968, "learning_rate": 0.0001920291479820628, "loss": 1.6122, "step": 721 }, { "epoch": 0.0808940926024481, "grad_norm": 1.7375926971435547, "learning_rate": 0.00019201793721973097, "loss": 2.0149, "step": 722 }, { "epoch": 0.08100613428195289, "grad_norm": 1.781054139137268, "learning_rate": 0.0001920067264573991, "loss": 2.1021, "step": 723 }, { "epoch": 0.08111817596145766, "grad_norm": 0.9861398935317993, "learning_rate": 0.00019199551569506725, "loss": 1.2904, "step": 724 }, { "epoch": 0.08123021764096244, "grad_norm": 1.4197885990142822, "learning_rate": 0.00019198430493273542, "loss": 1.6323, "step": 725 }, { "epoch": 0.08134225932046721, "grad_norm": 1.4126633405685425, "learning_rate": 0.00019197309417040358, "loss": 1.2524, "step": 726 }, { "epoch": 0.08145430099997199, "grad_norm": 1.3889906406402588, "learning_rate": 0.00019196188340807175, "loss": 1.3404, "step": 727 }, { "epoch": 0.08156634267947677, "grad_norm": 1.1135979890823364, "learning_rate": 0.00019195067264573992, "loss": 1.4475, "step": 728 }, { "epoch": 0.08167838435898155, "grad_norm": 1.7179895639419556, "learning_rate": 0.0001919394618834081, "loss": 1.4179, "step": 729 }, { "epoch": 0.08179042603848631, "grad_norm": 2.3190109729766846, "learning_rate": 0.00019192825112107625, "loss": 1.9751, "step": 730 }, { "epoch": 0.08190246771799109, "grad_norm": 1.83403480052948, "learning_rate": 0.0001919170403587444, "loss": 2.1773, "step": 731 }, { "epoch": 0.08201450939749587, "grad_norm": 1.0584089756011963, "learning_rate": 0.00019190582959641256, "loss": 1.7815, "step": 732 }, { "epoch": 0.08212655107700065, "grad_norm": 1.2346776723861694, "learning_rate": 0.00019189461883408073, "loss": 1.8737, "step": 733 }, { "epoch": 0.08223859275650541, "grad_norm": 2.419379234313965, "learning_rate": 0.0001918834080717489, "loss": 1.6649, "step": 734 }, { "epoch": 0.08235063443601019, "grad_norm": 0.8249149918556213, "learning_rate": 0.00019187219730941706, "loss": 1.4758, "step": 735 }, { "epoch": 0.08246267611551497, "grad_norm": 0.7987962365150452, "learning_rate": 0.00019186098654708523, "loss": 1.8825, "step": 736 }, { "epoch": 0.08257471779501975, "grad_norm": 1.3334965705871582, "learning_rate": 0.00019184977578475337, "loss": 1.2616, "step": 737 }, { "epoch": 0.08268675947452453, "grad_norm": 1.3374146223068237, "learning_rate": 0.0001918385650224215, "loss": 1.67, "step": 738 }, { "epoch": 0.0827988011540293, "grad_norm": 1.8274801969528198, "learning_rate": 0.00019182735426008968, "loss": 1.7883, "step": 739 }, { "epoch": 0.08291084283353407, "grad_norm": 0.927907407283783, "learning_rate": 0.00019181614349775785, "loss": 1.5137, "step": 740 }, { "epoch": 0.08302288451303885, "grad_norm": 1.6287070512771606, "learning_rate": 0.00019180493273542601, "loss": 1.6129, "step": 741 }, { "epoch": 0.08313492619254363, "grad_norm": 1.0573532581329346, "learning_rate": 0.00019179372197309418, "loss": 1.8678, "step": 742 }, { "epoch": 0.0832469678720484, "grad_norm": 1.9506629705429077, "learning_rate": 0.00019178251121076235, "loss": 2.0669, "step": 743 }, { "epoch": 0.08335900955155318, "grad_norm": 1.2414381504058838, "learning_rate": 0.0001917713004484305, "loss": 1.4923, "step": 744 }, { "epoch": 0.08347105123105795, "grad_norm": 1.3295695781707764, "learning_rate": 0.00019176008968609866, "loss": 1.4901, "step": 745 }, { "epoch": 0.08358309291056273, "grad_norm": 1.3792515993118286, "learning_rate": 0.00019174887892376682, "loss": 1.4954, "step": 746 }, { "epoch": 0.0836951345900675, "grad_norm": 1.359349012374878, "learning_rate": 0.000191737668161435, "loss": 1.4152, "step": 747 }, { "epoch": 0.08380717626957228, "grad_norm": 1.322647213935852, "learning_rate": 0.00019172645739910316, "loss": 1.7131, "step": 748 }, { "epoch": 0.08391921794907706, "grad_norm": 1.539189100265503, "learning_rate": 0.00019171524663677133, "loss": 1.5366, "step": 749 }, { "epoch": 0.08403125962858184, "grad_norm": 2.0792412757873535, "learning_rate": 0.0001917040358744395, "loss": 0.8419, "step": 750 }, { "epoch": 0.0841433013080866, "grad_norm": 1.0477129220962524, "learning_rate": 0.00019169282511210763, "loss": 1.5835, "step": 751 }, { "epoch": 0.08425534298759138, "grad_norm": 2.156876802444458, "learning_rate": 0.00019168161434977577, "loss": 1.3536, "step": 752 }, { "epoch": 0.08436738466709616, "grad_norm": 2.7179315090179443, "learning_rate": 0.00019167040358744394, "loss": 1.4005, "step": 753 }, { "epoch": 0.08447942634660094, "grad_norm": 1.0690211057662964, "learning_rate": 0.0001916591928251121, "loss": 0.9029, "step": 754 }, { "epoch": 0.08459146802610572, "grad_norm": 2.1160542964935303, "learning_rate": 0.00019164798206278028, "loss": 1.4961, "step": 755 }, { "epoch": 0.08470350970561048, "grad_norm": 1.583309531211853, "learning_rate": 0.00019163677130044844, "loss": 1.5834, "step": 756 }, { "epoch": 0.08481555138511526, "grad_norm": 1.1766353845596313, "learning_rate": 0.0001916255605381166, "loss": 1.5866, "step": 757 }, { "epoch": 0.08492759306462004, "grad_norm": 1.419318675994873, "learning_rate": 0.00019161434977578475, "loss": 1.6326, "step": 758 }, { "epoch": 0.08503963474412482, "grad_norm": 1.5485981702804565, "learning_rate": 0.00019160313901345292, "loss": 1.9556, "step": 759 }, { "epoch": 0.08515167642362959, "grad_norm": 1.268850326538086, "learning_rate": 0.00019159192825112109, "loss": 1.7377, "step": 760 }, { "epoch": 0.08526371810313436, "grad_norm": 1.3385443687438965, "learning_rate": 0.00019158071748878925, "loss": 1.6286, "step": 761 }, { "epoch": 0.08537575978263914, "grad_norm": 1.0045478343963623, "learning_rate": 0.00019156950672645742, "loss": 1.6761, "step": 762 }, { "epoch": 0.08548780146214392, "grad_norm": 1.4866834878921509, "learning_rate": 0.0001915582959641256, "loss": 1.685, "step": 763 }, { "epoch": 0.08559984314164869, "grad_norm": 2.112055778503418, "learning_rate": 0.00019154708520179373, "loss": 1.6881, "step": 764 }, { "epoch": 0.08571188482115347, "grad_norm": 1.4385652542114258, "learning_rate": 0.0001915358744394619, "loss": 2.0051, "step": 765 }, { "epoch": 0.08582392650065825, "grad_norm": 2.977006196975708, "learning_rate": 0.00019152466367713004, "loss": 2.0398, "step": 766 }, { "epoch": 0.08593596818016302, "grad_norm": 2.1669135093688965, "learning_rate": 0.0001915134529147982, "loss": 2.0812, "step": 767 }, { "epoch": 0.08604800985966779, "grad_norm": 2.776005744934082, "learning_rate": 0.00019150224215246637, "loss": 1.5251, "step": 768 }, { "epoch": 0.08616005153917257, "grad_norm": 1.2335178852081299, "learning_rate": 0.00019149103139013454, "loss": 1.5721, "step": 769 }, { "epoch": 0.08627209321867735, "grad_norm": 1.8979558944702148, "learning_rate": 0.0001914798206278027, "loss": 1.5453, "step": 770 }, { "epoch": 0.08638413489818213, "grad_norm": 1.4379222393035889, "learning_rate": 0.00019146860986547087, "loss": 1.676, "step": 771 }, { "epoch": 0.0864961765776869, "grad_norm": 1.1782034635543823, "learning_rate": 0.000191457399103139, "loss": 1.5513, "step": 772 }, { "epoch": 0.08660821825719167, "grad_norm": 1.2528765201568604, "learning_rate": 0.00019144618834080718, "loss": 1.7923, "step": 773 }, { "epoch": 0.08672025993669645, "grad_norm": 1.1731401681900024, "learning_rate": 0.00019143497757847535, "loss": 1.2589, "step": 774 }, { "epoch": 0.08683230161620123, "grad_norm": 1.9159369468688965, "learning_rate": 0.00019142376681614351, "loss": 1.2467, "step": 775 }, { "epoch": 0.08694434329570601, "grad_norm": 1.0542395114898682, "learning_rate": 0.00019141255605381168, "loss": 1.7523, "step": 776 }, { "epoch": 0.08705638497521077, "grad_norm": 1.2532873153686523, "learning_rate": 0.00019140134529147985, "loss": 1.9731, "step": 777 }, { "epoch": 0.08716842665471555, "grad_norm": 1.799538016319275, "learning_rate": 0.000191390134529148, "loss": 1.5027, "step": 778 }, { "epoch": 0.08728046833422033, "grad_norm": 1.4426912069320679, "learning_rate": 0.00019137892376681613, "loss": 1.5173, "step": 779 }, { "epoch": 0.08739251001372511, "grad_norm": 1.2636182308197021, "learning_rate": 0.0001913677130044843, "loss": 1.5291, "step": 780 }, { "epoch": 0.08750455169322988, "grad_norm": 2.103139877319336, "learning_rate": 0.00019135650224215246, "loss": 1.1686, "step": 781 }, { "epoch": 0.08761659337273466, "grad_norm": 0.9959813356399536, "learning_rate": 0.00019134529147982063, "loss": 1.7824, "step": 782 }, { "epoch": 0.08772863505223943, "grad_norm": 1.6544737815856934, "learning_rate": 0.0001913340807174888, "loss": 1.3541, "step": 783 }, { "epoch": 0.08784067673174421, "grad_norm": 1.2097121477127075, "learning_rate": 0.00019132286995515697, "loss": 1.8456, "step": 784 }, { "epoch": 0.08795271841124898, "grad_norm": 1.4652163982391357, "learning_rate": 0.00019131165919282513, "loss": 1.7983, "step": 785 }, { "epoch": 0.08806476009075376, "grad_norm": 1.5618325471878052, "learning_rate": 0.00019130044843049327, "loss": 1.3297, "step": 786 }, { "epoch": 0.08817680177025854, "grad_norm": 1.4958552122116089, "learning_rate": 0.00019128923766816144, "loss": 1.0544, "step": 787 }, { "epoch": 0.08828884344976332, "grad_norm": 0.8024129271507263, "learning_rate": 0.0001912780269058296, "loss": 1.9873, "step": 788 }, { "epoch": 0.08840088512926808, "grad_norm": 1.7374728918075562, "learning_rate": 0.00019126681614349778, "loss": 1.5538, "step": 789 }, { "epoch": 0.08851292680877286, "grad_norm": 1.5014393329620361, "learning_rate": 0.00019125560538116594, "loss": 1.3612, "step": 790 }, { "epoch": 0.08862496848827764, "grad_norm": 1.150998592376709, "learning_rate": 0.0001912443946188341, "loss": 1.1144, "step": 791 }, { "epoch": 0.08873701016778242, "grad_norm": 0.840151846408844, "learning_rate": 0.00019123318385650225, "loss": 1.7167, "step": 792 }, { "epoch": 0.0888490518472872, "grad_norm": 0.8940207362174988, "learning_rate": 0.0001912219730941704, "loss": 2.15, "step": 793 }, { "epoch": 0.08896109352679196, "grad_norm": 1.468660593032837, "learning_rate": 0.00019121076233183856, "loss": 1.8046, "step": 794 }, { "epoch": 0.08907313520629674, "grad_norm": 1.7399226427078247, "learning_rate": 0.00019119955156950673, "loss": 1.9703, "step": 795 }, { "epoch": 0.08918517688580152, "grad_norm": 2.310018301010132, "learning_rate": 0.0001911883408071749, "loss": 1.252, "step": 796 }, { "epoch": 0.0892972185653063, "grad_norm": 2.276061773300171, "learning_rate": 0.00019117713004484306, "loss": 2.1799, "step": 797 }, { "epoch": 0.08940926024481106, "grad_norm": 1.7988471984863281, "learning_rate": 0.00019116591928251123, "loss": 1.2902, "step": 798 }, { "epoch": 0.08952130192431584, "grad_norm": 1.3268860578536987, "learning_rate": 0.00019115470852017937, "loss": 1.714, "step": 799 }, { "epoch": 0.08963334360382062, "grad_norm": 1.6129930019378662, "learning_rate": 0.00019114349775784754, "loss": 1.4637, "step": 800 }, { "epoch": 0.0897453852833254, "grad_norm": 1.5696073770523071, "learning_rate": 0.0001911322869955157, "loss": 0.9598, "step": 801 }, { "epoch": 0.08985742696283017, "grad_norm": 1.3652467727661133, "learning_rate": 0.00019112107623318387, "loss": 1.7702, "step": 802 }, { "epoch": 0.08996946864233495, "grad_norm": 2.531113624572754, "learning_rate": 0.00019110986547085204, "loss": 2.1914, "step": 803 }, { "epoch": 0.09008151032183973, "grad_norm": 1.702888011932373, "learning_rate": 0.0001910986547085202, "loss": 1.8668, "step": 804 }, { "epoch": 0.0901935520013445, "grad_norm": 1.2086619138717651, "learning_rate": 0.00019108744394618837, "loss": 0.9038, "step": 805 }, { "epoch": 0.09030559368084927, "grad_norm": 1.8835450410842896, "learning_rate": 0.0001910762331838565, "loss": 1.7971, "step": 806 }, { "epoch": 0.09041763536035405, "grad_norm": 0.9297971129417419, "learning_rate": 0.00019106502242152465, "loss": 1.801, "step": 807 }, { "epoch": 0.09052967703985883, "grad_norm": 1.287916660308838, "learning_rate": 0.00019105381165919282, "loss": 1.5356, "step": 808 }, { "epoch": 0.0906417187193636, "grad_norm": 1.5094938278198242, "learning_rate": 0.000191042600896861, "loss": 1.5186, "step": 809 }, { "epoch": 0.09075376039886839, "grad_norm": 1.3839126825332642, "learning_rate": 0.00019103139013452916, "loss": 1.9964, "step": 810 }, { "epoch": 0.09086580207837315, "grad_norm": 1.4169389009475708, "learning_rate": 0.00019102017937219732, "loss": 1.1092, "step": 811 }, { "epoch": 0.09097784375787793, "grad_norm": 1.2247720956802368, "learning_rate": 0.0001910089686098655, "loss": 1.5151, "step": 812 }, { "epoch": 0.09108988543738271, "grad_norm": 3.2513222694396973, "learning_rate": 0.00019099775784753363, "loss": 1.3811, "step": 813 }, { "epoch": 0.09120192711688749, "grad_norm": 2.9917190074920654, "learning_rate": 0.0001909865470852018, "loss": 1.6719, "step": 814 }, { "epoch": 0.09131396879639225, "grad_norm": 1.4377148151397705, "learning_rate": 0.00019097533632286997, "loss": 2.0493, "step": 815 }, { "epoch": 0.09142601047589703, "grad_norm": 1.2648364305496216, "learning_rate": 0.00019096412556053813, "loss": 1.4856, "step": 816 }, { "epoch": 0.09153805215540181, "grad_norm": 1.2196696996688843, "learning_rate": 0.0001909529147982063, "loss": 1.5793, "step": 817 }, { "epoch": 0.09165009383490659, "grad_norm": 1.936629056930542, "learning_rate": 0.00019094170403587447, "loss": 1.8304, "step": 818 }, { "epoch": 0.09176213551441136, "grad_norm": 1.3467540740966797, "learning_rate": 0.00019093049327354263, "loss": 1.5844, "step": 819 }, { "epoch": 0.09187417719391613, "grad_norm": 1.0181195735931396, "learning_rate": 0.00019091928251121077, "loss": 1.1846, "step": 820 }, { "epoch": 0.09198621887342091, "grad_norm": 0.8066987991333008, "learning_rate": 0.00019090807174887891, "loss": 1.463, "step": 821 }, { "epoch": 0.09209826055292569, "grad_norm": 1.4936957359313965, "learning_rate": 0.00019089686098654708, "loss": 2.064, "step": 822 }, { "epoch": 0.09221030223243046, "grad_norm": 1.0827463865280151, "learning_rate": 0.00019088565022421525, "loss": 1.2914, "step": 823 }, { "epoch": 0.09232234391193524, "grad_norm": 2.2994191646575928, "learning_rate": 0.00019087443946188342, "loss": 1.729, "step": 824 }, { "epoch": 0.09243438559144002, "grad_norm": 1.5784837007522583, "learning_rate": 0.00019086322869955158, "loss": 1.549, "step": 825 }, { "epoch": 0.0925464272709448, "grad_norm": 1.601065754890442, "learning_rate": 0.00019085201793721975, "loss": 1.6005, "step": 826 }, { "epoch": 0.09265846895044957, "grad_norm": 1.7969263792037964, "learning_rate": 0.0001908408071748879, "loss": 1.9893, "step": 827 }, { "epoch": 0.09277051062995434, "grad_norm": 1.51492178440094, "learning_rate": 0.00019082959641255606, "loss": 1.2178, "step": 828 }, { "epoch": 0.09288255230945912, "grad_norm": 2.1908648014068604, "learning_rate": 0.00019081838565022423, "loss": 1.8138, "step": 829 }, { "epoch": 0.0929945939889639, "grad_norm": 1.2633848190307617, "learning_rate": 0.0001908071748878924, "loss": 1.9758, "step": 830 }, { "epoch": 0.09310663566846868, "grad_norm": 1.7157303094863892, "learning_rate": 0.00019079596412556056, "loss": 1.8938, "step": 831 }, { "epoch": 0.09321867734797344, "grad_norm": 1.8596490621566772, "learning_rate": 0.00019078475336322873, "loss": 1.6611, "step": 832 }, { "epoch": 0.09333071902747822, "grad_norm": 0.9326462149620056, "learning_rate": 0.00019077354260089687, "loss": 1.8486, "step": 833 }, { "epoch": 0.093442760706983, "grad_norm": 1.1556771993637085, "learning_rate": 0.000190762331838565, "loss": 1.4611, "step": 834 }, { "epoch": 0.09355480238648778, "grad_norm": 1.3485546112060547, "learning_rate": 0.00019075112107623318, "loss": 1.2627, "step": 835 }, { "epoch": 0.09366684406599254, "grad_norm": 0.9178305268287659, "learning_rate": 0.00019073991031390134, "loss": 1.1021, "step": 836 }, { "epoch": 0.09377888574549732, "grad_norm": 1.3511427640914917, "learning_rate": 0.0001907286995515695, "loss": 2.0397, "step": 837 }, { "epoch": 0.0938909274250021, "grad_norm": 0.938939094543457, "learning_rate": 0.00019071748878923768, "loss": 1.874, "step": 838 }, { "epoch": 0.09400296910450688, "grad_norm": 1.4488400220870972, "learning_rate": 0.00019070627802690585, "loss": 1.5133, "step": 839 }, { "epoch": 0.09411501078401165, "grad_norm": 1.1752744913101196, "learning_rate": 0.000190695067264574, "loss": 1.3265, "step": 840 }, { "epoch": 0.09422705246351643, "grad_norm": 2.2610857486724854, "learning_rate": 0.00019068385650224215, "loss": 2.2551, "step": 841 }, { "epoch": 0.0943390941430212, "grad_norm": 2.1081557273864746, "learning_rate": 0.00019067264573991032, "loss": 2.2563, "step": 842 }, { "epoch": 0.09445113582252598, "grad_norm": 1.5040111541748047, "learning_rate": 0.0001906614349775785, "loss": 1.569, "step": 843 }, { "epoch": 0.09456317750203075, "grad_norm": 1.423416256904602, "learning_rate": 0.00019065022421524666, "loss": 1.8711, "step": 844 }, { "epoch": 0.09467521918153553, "grad_norm": 1.010010838508606, "learning_rate": 0.00019063901345291482, "loss": 1.8152, "step": 845 }, { "epoch": 0.0947872608610403, "grad_norm": 1.2485531568527222, "learning_rate": 0.000190627802690583, "loss": 1.5795, "step": 846 }, { "epoch": 0.09489930254054509, "grad_norm": 1.7884469032287598, "learning_rate": 0.00019061659192825113, "loss": 1.85, "step": 847 }, { "epoch": 0.09501134422004986, "grad_norm": 1.913137674331665, "learning_rate": 0.00019060538116591927, "loss": 1.1078, "step": 848 }, { "epoch": 0.09512338589955463, "grad_norm": 1.3539319038391113, "learning_rate": 0.00019059417040358744, "loss": 2.0948, "step": 849 }, { "epoch": 0.09523542757905941, "grad_norm": 2.2315945625305176, "learning_rate": 0.0001905829596412556, "loss": 1.6717, "step": 850 }, { "epoch": 0.09534746925856419, "grad_norm": 1.6454825401306152, "learning_rate": 0.00019057174887892377, "loss": 1.8291, "step": 851 }, { "epoch": 0.09545951093806897, "grad_norm": 0.9636425971984863, "learning_rate": 0.00019056053811659194, "loss": 1.894, "step": 852 }, { "epoch": 0.09557155261757373, "grad_norm": 0.9712859392166138, "learning_rate": 0.0001905493273542601, "loss": 1.3102, "step": 853 }, { "epoch": 0.09568359429707851, "grad_norm": 2.145829677581787, "learning_rate": 0.00019053811659192828, "loss": 1.5468, "step": 854 }, { "epoch": 0.09579563597658329, "grad_norm": 1.8615005016326904, "learning_rate": 0.00019052690582959642, "loss": 1.4827, "step": 855 }, { "epoch": 0.09590767765608807, "grad_norm": 1.097975254058838, "learning_rate": 0.00019051569506726458, "loss": 1.5307, "step": 856 }, { "epoch": 0.09601971933559283, "grad_norm": 1.2616732120513916, "learning_rate": 0.00019050448430493275, "loss": 1.8361, "step": 857 }, { "epoch": 0.09613176101509761, "grad_norm": 1.0333034992218018, "learning_rate": 0.00019049327354260092, "loss": 1.6881, "step": 858 }, { "epoch": 0.09624380269460239, "grad_norm": 0.6269555687904358, "learning_rate": 0.00019048206278026908, "loss": 1.6997, "step": 859 }, { "epoch": 0.09635584437410717, "grad_norm": 1.0800385475158691, "learning_rate": 0.00019047085201793723, "loss": 1.6483, "step": 860 }, { "epoch": 0.09646788605361194, "grad_norm": 1.6854339838027954, "learning_rate": 0.0001904596412556054, "loss": 1.9758, "step": 861 }, { "epoch": 0.09657992773311672, "grad_norm": 1.0728893280029297, "learning_rate": 0.00019044843049327353, "loss": 1.8254, "step": 862 }, { "epoch": 0.0966919694126215, "grad_norm": 1.298189640045166, "learning_rate": 0.0001904372197309417, "loss": 1.3215, "step": 863 }, { "epoch": 0.09680401109212627, "grad_norm": 2.5043559074401855, "learning_rate": 0.00019042600896860987, "loss": 1.789, "step": 864 }, { "epoch": 0.09691605277163105, "grad_norm": 1.1725019216537476, "learning_rate": 0.00019041479820627803, "loss": 1.1571, "step": 865 }, { "epoch": 0.09702809445113582, "grad_norm": 1.9095876216888428, "learning_rate": 0.0001904035874439462, "loss": 1.6395, "step": 866 }, { "epoch": 0.0971401361306406, "grad_norm": 1.2248321771621704, "learning_rate": 0.00019039237668161437, "loss": 1.7718, "step": 867 }, { "epoch": 0.09725217781014538, "grad_norm": 1.1460652351379395, "learning_rate": 0.0001903811659192825, "loss": 1.6832, "step": 868 }, { "epoch": 0.09736421948965016, "grad_norm": 1.2989020347595215, "learning_rate": 0.00019036995515695068, "loss": 1.5479, "step": 869 }, { "epoch": 0.09747626116915492, "grad_norm": 1.4916307926177979, "learning_rate": 0.00019035874439461884, "loss": 2.1977, "step": 870 }, { "epoch": 0.0975883028486597, "grad_norm": 1.8671627044677734, "learning_rate": 0.000190347533632287, "loss": 1.7409, "step": 871 }, { "epoch": 0.09770034452816448, "grad_norm": 0.7891229391098022, "learning_rate": 0.00019033632286995518, "loss": 1.7845, "step": 872 }, { "epoch": 0.09781238620766926, "grad_norm": 1.500860571861267, "learning_rate": 0.00019032511210762335, "loss": 2.0323, "step": 873 }, { "epoch": 0.09792442788717402, "grad_norm": 2.420445680618286, "learning_rate": 0.0001903139013452915, "loss": 1.7354, "step": 874 }, { "epoch": 0.0980364695666788, "grad_norm": 2.0569050312042236, "learning_rate": 0.00019030269058295965, "loss": 1.8661, "step": 875 }, { "epoch": 0.09814851124618358, "grad_norm": 1.8435910940170288, "learning_rate": 0.0001902914798206278, "loss": 1.7907, "step": 876 }, { "epoch": 0.09826055292568836, "grad_norm": 0.960837721824646, "learning_rate": 0.00019028026905829596, "loss": 2.0862, "step": 877 }, { "epoch": 0.09837259460519313, "grad_norm": 1.6574361324310303, "learning_rate": 0.00019026905829596413, "loss": 1.451, "step": 878 }, { "epoch": 0.0984846362846979, "grad_norm": 1.8239772319793701, "learning_rate": 0.0001902578475336323, "loss": 1.7046, "step": 879 }, { "epoch": 0.09859667796420268, "grad_norm": 1.7163302898406982, "learning_rate": 0.00019024663677130046, "loss": 1.4661, "step": 880 }, { "epoch": 0.09870871964370746, "grad_norm": 2.010563850402832, "learning_rate": 0.00019023542600896863, "loss": 1.4554, "step": 881 }, { "epoch": 0.09882076132321224, "grad_norm": 1.0625498294830322, "learning_rate": 0.00019022421524663677, "loss": 1.9973, "step": 882 }, { "epoch": 0.098932803002717, "grad_norm": 1.18570876121521, "learning_rate": 0.00019021300448430494, "loss": 1.3812, "step": 883 }, { "epoch": 0.09904484468222179, "grad_norm": 1.155850887298584, "learning_rate": 0.0001902017937219731, "loss": 1.6529, "step": 884 }, { "epoch": 0.09915688636172657, "grad_norm": 0.9372645020484924, "learning_rate": 0.00019019058295964127, "loss": 0.7109, "step": 885 }, { "epoch": 0.09926892804123134, "grad_norm": 1.5681971311569214, "learning_rate": 0.00019017937219730944, "loss": 1.4818, "step": 886 }, { "epoch": 0.09938096972073611, "grad_norm": 1.6102690696716309, "learning_rate": 0.0001901681614349776, "loss": 1.4932, "step": 887 }, { "epoch": 0.09949301140024089, "grad_norm": 0.5983882546424866, "learning_rate": 0.00019015695067264575, "loss": 1.3948, "step": 888 }, { "epoch": 0.09960505307974567, "grad_norm": 1.2839800119400024, "learning_rate": 0.00019014573991031392, "loss": 1.7582, "step": 889 }, { "epoch": 0.09971709475925045, "grad_norm": 1.4640576839447021, "learning_rate": 0.00019013452914798206, "loss": 1.9802, "step": 890 }, { "epoch": 0.09982913643875521, "grad_norm": 1.8316210508346558, "learning_rate": 0.00019012331838565022, "loss": 1.3988, "step": 891 }, { "epoch": 0.09994117811825999, "grad_norm": 2.4962213039398193, "learning_rate": 0.0001901121076233184, "loss": 1.5539, "step": 892 }, { "epoch": 0.10005321979776477, "grad_norm": 1.7808531522750854, "learning_rate": 0.00019010089686098656, "loss": 1.7608, "step": 893 }, { "epoch": 0.10016526147726955, "grad_norm": 2.064326524734497, "learning_rate": 0.00019008968609865473, "loss": 1.4776, "step": 894 }, { "epoch": 0.10027730315677431, "grad_norm": 1.4051932096481323, "learning_rate": 0.0001900784753363229, "loss": 1.402, "step": 895 }, { "epoch": 0.10038934483627909, "grad_norm": 1.4038610458374023, "learning_rate": 0.00019006726457399103, "loss": 1.2625, "step": 896 }, { "epoch": 0.10050138651578387, "grad_norm": 2.0812604427337646, "learning_rate": 0.0001900560538116592, "loss": 2.0652, "step": 897 }, { "epoch": 0.10061342819528865, "grad_norm": 1.9558478593826294, "learning_rate": 0.00019004484304932737, "loss": 1.4946, "step": 898 }, { "epoch": 0.10072546987479342, "grad_norm": 1.4161128997802734, "learning_rate": 0.00019003363228699554, "loss": 1.511, "step": 899 }, { "epoch": 0.1008375115542982, "grad_norm": 1.3336784839630127, "learning_rate": 0.0001900224215246637, "loss": 1.6602, "step": 900 }, { "epoch": 0.10094955323380297, "grad_norm": 1.1809784173965454, "learning_rate": 0.00019001121076233184, "loss": 1.5256, "step": 901 }, { "epoch": 0.10106159491330775, "grad_norm": 1.020554780960083, "learning_rate": 0.00019, "loss": 1.2975, "step": 902 }, { "epoch": 0.10117363659281253, "grad_norm": 0.9619705080986023, "learning_rate": 0.00018998878923766815, "loss": 1.781, "step": 903 }, { "epoch": 0.1012856782723173, "grad_norm": 1.506252408027649, "learning_rate": 0.00018997757847533632, "loss": 1.4934, "step": 904 }, { "epoch": 0.10139771995182208, "grad_norm": 1.1716561317443848, "learning_rate": 0.00018996636771300449, "loss": 1.0881, "step": 905 }, { "epoch": 0.10150976163132686, "grad_norm": 0.9163817763328552, "learning_rate": 0.00018995515695067265, "loss": 1.3749, "step": 906 }, { "epoch": 0.10162180331083163, "grad_norm": 1.656356930732727, "learning_rate": 0.00018994394618834082, "loss": 1.2352, "step": 907 }, { "epoch": 0.1017338449903364, "grad_norm": 2.0034427642822266, "learning_rate": 0.000189932735426009, "loss": 1.778, "step": 908 }, { "epoch": 0.10184588666984118, "grad_norm": 1.1967873573303223, "learning_rate": 0.00018992152466367715, "loss": 1.412, "step": 909 }, { "epoch": 0.10195792834934596, "grad_norm": 1.1976884603500366, "learning_rate": 0.0001899103139013453, "loss": 2.018, "step": 910 }, { "epoch": 0.10206997002885074, "grad_norm": 1.5037317276000977, "learning_rate": 0.00018989910313901346, "loss": 2.003, "step": 911 }, { "epoch": 0.1021820117083555, "grad_norm": 2.3447389602661133, "learning_rate": 0.00018988789237668163, "loss": 1.6345, "step": 912 }, { "epoch": 0.10229405338786028, "grad_norm": 2.476271629333496, "learning_rate": 0.0001898766816143498, "loss": 1.985, "step": 913 }, { "epoch": 0.10240609506736506, "grad_norm": 1.1526224613189697, "learning_rate": 0.00018986547085201796, "loss": 1.4334, "step": 914 }, { "epoch": 0.10251813674686984, "grad_norm": 1.5318467617034912, "learning_rate": 0.0001898542600896861, "loss": 2.2436, "step": 915 }, { "epoch": 0.1026301784263746, "grad_norm": 1.522047758102417, "learning_rate": 0.00018984304932735427, "loss": 1.6066, "step": 916 }, { "epoch": 0.10274222010587938, "grad_norm": 2.0543901920318604, "learning_rate": 0.0001898318385650224, "loss": 1.9602, "step": 917 }, { "epoch": 0.10285426178538416, "grad_norm": 1.668056845664978, "learning_rate": 0.00018982062780269058, "loss": 1.4992, "step": 918 }, { "epoch": 0.10296630346488894, "grad_norm": 1.0761185884475708, "learning_rate": 0.00018980941704035875, "loss": 1.1756, "step": 919 }, { "epoch": 0.10307834514439372, "grad_norm": 1.1710165739059448, "learning_rate": 0.00018979820627802691, "loss": 1.5681, "step": 920 }, { "epoch": 0.10319038682389849, "grad_norm": 1.836548924446106, "learning_rate": 0.00018978699551569508, "loss": 1.4409, "step": 921 }, { "epoch": 0.10330242850340327, "grad_norm": 2.0795719623565674, "learning_rate": 0.00018977578475336325, "loss": 1.6676, "step": 922 }, { "epoch": 0.10341447018290804, "grad_norm": 1.7500196695327759, "learning_rate": 0.0001897645739910314, "loss": 1.4105, "step": 923 }, { "epoch": 0.10352651186241282, "grad_norm": 1.1700764894485474, "learning_rate": 0.00018975336322869956, "loss": 1.0493, "step": 924 }, { "epoch": 0.10363855354191759, "grad_norm": 1.7889467477798462, "learning_rate": 0.00018974215246636772, "loss": 1.9412, "step": 925 }, { "epoch": 0.10375059522142237, "grad_norm": 1.4230329990386963, "learning_rate": 0.0001897309417040359, "loss": 1.3793, "step": 926 }, { "epoch": 0.10386263690092715, "grad_norm": 2.4129559993743896, "learning_rate": 0.00018971973094170406, "loss": 1.5878, "step": 927 }, { "epoch": 0.10397467858043193, "grad_norm": 1.469306468963623, "learning_rate": 0.0001897085201793722, "loss": 1.9533, "step": 928 }, { "epoch": 0.10408672025993669, "grad_norm": 1.2764030694961548, "learning_rate": 0.00018969730941704037, "loss": 1.8592, "step": 929 }, { "epoch": 0.10419876193944147, "grad_norm": 1.7059234380722046, "learning_rate": 0.00018968609865470853, "loss": 2.0264, "step": 930 }, { "epoch": 0.10431080361894625, "grad_norm": 1.240073323249817, "learning_rate": 0.00018967488789237667, "loss": 1.6161, "step": 931 }, { "epoch": 0.10442284529845103, "grad_norm": 0.9819527864456177, "learning_rate": 0.00018966367713004484, "loss": 1.3796, "step": 932 }, { "epoch": 0.1045348869779558, "grad_norm": 0.8821762204170227, "learning_rate": 0.000189652466367713, "loss": 1.8412, "step": 933 }, { "epoch": 0.10464692865746057, "grad_norm": 1.326291561126709, "learning_rate": 0.00018964125560538118, "loss": 1.4662, "step": 934 }, { "epoch": 0.10475897033696535, "grad_norm": 1.3162821531295776, "learning_rate": 0.00018963004484304934, "loss": 1.1898, "step": 935 }, { "epoch": 0.10487101201647013, "grad_norm": 1.3574730157852173, "learning_rate": 0.0001896188340807175, "loss": 1.4975, "step": 936 }, { "epoch": 0.10498305369597491, "grad_norm": 1.0984898805618286, "learning_rate": 0.00018960762331838565, "loss": 1.4799, "step": 937 }, { "epoch": 0.10509509537547967, "grad_norm": 0.9456785917282104, "learning_rate": 0.00018959641255605382, "loss": 1.394, "step": 938 }, { "epoch": 0.10520713705498445, "grad_norm": 1.1608121395111084, "learning_rate": 0.00018958520179372199, "loss": 1.4399, "step": 939 }, { "epoch": 0.10531917873448923, "grad_norm": 1.0202219486236572, "learning_rate": 0.00018957399103139015, "loss": 1.5093, "step": 940 }, { "epoch": 0.10543122041399401, "grad_norm": 1.3823832273483276, "learning_rate": 0.00018956278026905832, "loss": 1.0064, "step": 941 }, { "epoch": 0.10554326209349878, "grad_norm": 2.5737757682800293, "learning_rate": 0.00018955156950672646, "loss": 2.0318, "step": 942 }, { "epoch": 0.10565530377300356, "grad_norm": 1.398633360862732, "learning_rate": 0.00018954035874439463, "loss": 1.3235, "step": 943 }, { "epoch": 0.10576734545250834, "grad_norm": 1.0123497247695923, "learning_rate": 0.0001895291479820628, "loss": 1.8211, "step": 944 }, { "epoch": 0.10587938713201311, "grad_norm": 1.5623855590820312, "learning_rate": 0.00018951793721973094, "loss": 1.5035, "step": 945 }, { "epoch": 0.10599142881151788, "grad_norm": 1.5584602355957031, "learning_rate": 0.0001895067264573991, "loss": 1.4113, "step": 946 }, { "epoch": 0.10610347049102266, "grad_norm": 1.6150078773498535, "learning_rate": 0.00018949551569506727, "loss": 1.6633, "step": 947 }, { "epoch": 0.10621551217052744, "grad_norm": 1.0512274503707886, "learning_rate": 0.00018948430493273544, "loss": 1.7648, "step": 948 }, { "epoch": 0.10632755385003222, "grad_norm": 1.7366350889205933, "learning_rate": 0.0001894730941704036, "loss": 1.6694, "step": 949 }, { "epoch": 0.10643959552953698, "grad_norm": 0.9088833332061768, "learning_rate": 0.00018946188340807177, "loss": 1.4882, "step": 950 }, { "epoch": 0.10655163720904176, "grad_norm": 1.2315540313720703, "learning_rate": 0.0001894506726457399, "loss": 1.6461, "step": 951 }, { "epoch": 0.10666367888854654, "grad_norm": 1.975296974182129, "learning_rate": 0.00018943946188340808, "loss": 0.7851, "step": 952 }, { "epoch": 0.10677572056805132, "grad_norm": 1.4263640642166138, "learning_rate": 0.00018942825112107625, "loss": 1.4683, "step": 953 }, { "epoch": 0.10688776224755608, "grad_norm": 1.7573119401931763, "learning_rate": 0.00018941704035874442, "loss": 1.6726, "step": 954 }, { "epoch": 0.10699980392706086, "grad_norm": 1.8572345972061157, "learning_rate": 0.00018940582959641258, "loss": 1.4758, "step": 955 }, { "epoch": 0.10711184560656564, "grad_norm": 1.3331706523895264, "learning_rate": 0.00018939461883408072, "loss": 1.8168, "step": 956 }, { "epoch": 0.10722388728607042, "grad_norm": 1.7975895404815674, "learning_rate": 0.0001893834080717489, "loss": 1.2514, "step": 957 }, { "epoch": 0.1073359289655752, "grad_norm": 0.9501906633377075, "learning_rate": 0.00018937219730941703, "loss": 1.2572, "step": 958 }, { "epoch": 0.10744797064507997, "grad_norm": 1.1985948085784912, "learning_rate": 0.0001893609865470852, "loss": 1.5368, "step": 959 }, { "epoch": 0.10756001232458474, "grad_norm": 1.858951449394226, "learning_rate": 0.00018934977578475337, "loss": 1.5011, "step": 960 }, { "epoch": 0.10767205400408952, "grad_norm": 1.469639778137207, "learning_rate": 0.00018933856502242153, "loss": 1.8287, "step": 961 }, { "epoch": 0.1077840956835943, "grad_norm": 1.0255814790725708, "learning_rate": 0.0001893273542600897, "loss": 1.7385, "step": 962 }, { "epoch": 0.10789613736309907, "grad_norm": 1.3543230295181274, "learning_rate": 0.00018931614349775787, "loss": 1.5174, "step": 963 }, { "epoch": 0.10800817904260385, "grad_norm": 1.864389181137085, "learning_rate": 0.00018930493273542603, "loss": 1.5676, "step": 964 }, { "epoch": 0.10812022072210863, "grad_norm": 1.4070225954055786, "learning_rate": 0.00018929372197309417, "loss": 1.0937, "step": 965 }, { "epoch": 0.1082322624016134, "grad_norm": 1.3763872385025024, "learning_rate": 0.00018928251121076234, "loss": 1.4875, "step": 966 }, { "epoch": 0.10834430408111817, "grad_norm": 0.5404292345046997, "learning_rate": 0.0001892713004484305, "loss": 2.0342, "step": 967 }, { "epoch": 0.10845634576062295, "grad_norm": 1.4776684045791626, "learning_rate": 0.00018926008968609868, "loss": 1.6998, "step": 968 }, { "epoch": 0.10856838744012773, "grad_norm": 0.817471444606781, "learning_rate": 0.00018924887892376682, "loss": 1.8938, "step": 969 }, { "epoch": 0.10868042911963251, "grad_norm": 1.5000287294387817, "learning_rate": 0.00018923766816143498, "loss": 1.5676, "step": 970 }, { "epoch": 0.10879247079913727, "grad_norm": 1.6064025163650513, "learning_rate": 0.00018922645739910315, "loss": 1.2321, "step": 971 }, { "epoch": 0.10890451247864205, "grad_norm": 2.1155455112457275, "learning_rate": 0.0001892152466367713, "loss": 1.4263, "step": 972 }, { "epoch": 0.10901655415814683, "grad_norm": 1.6395606994628906, "learning_rate": 0.00018920403587443946, "loss": 1.5674, "step": 973 }, { "epoch": 0.10912859583765161, "grad_norm": 1.2282512187957764, "learning_rate": 0.00018919282511210763, "loss": 1.247, "step": 974 }, { "epoch": 0.10924063751715639, "grad_norm": 1.7950876951217651, "learning_rate": 0.0001891816143497758, "loss": 1.0405, "step": 975 }, { "epoch": 0.10935267919666115, "grad_norm": 1.3710075616836548, "learning_rate": 0.00018917040358744396, "loss": 1.2137, "step": 976 }, { "epoch": 0.10946472087616593, "grad_norm": 1.2060927152633667, "learning_rate": 0.00018915919282511213, "loss": 1.8283, "step": 977 }, { "epoch": 0.10957676255567071, "grad_norm": 1.8856624364852905, "learning_rate": 0.00018914798206278027, "loss": 1.3876, "step": 978 }, { "epoch": 0.10968880423517549, "grad_norm": 2.659623622894287, "learning_rate": 0.00018913677130044844, "loss": 1.8796, "step": 979 }, { "epoch": 0.10980084591468026, "grad_norm": 2.250305414199829, "learning_rate": 0.0001891255605381166, "loss": 1.7882, "step": 980 }, { "epoch": 0.10991288759418504, "grad_norm": 1.1937744617462158, "learning_rate": 0.00018911434977578477, "loss": 1.1009, "step": 981 }, { "epoch": 0.11002492927368981, "grad_norm": 0.9852249026298523, "learning_rate": 0.00018910313901345294, "loss": 1.2884, "step": 982 }, { "epoch": 0.1101369709531946, "grad_norm": 1.4839378595352173, "learning_rate": 0.00018909192825112108, "loss": 1.0234, "step": 983 }, { "epoch": 0.11024901263269936, "grad_norm": 0.962624192237854, "learning_rate": 0.00018908071748878925, "loss": 1.2992, "step": 984 }, { "epoch": 0.11036105431220414, "grad_norm": 2.1264169216156006, "learning_rate": 0.00018906950672645741, "loss": 1.7317, "step": 985 }, { "epoch": 0.11047309599170892, "grad_norm": 1.1468653678894043, "learning_rate": 0.00018905829596412555, "loss": 1.6789, "step": 986 }, { "epoch": 0.1105851376712137, "grad_norm": 1.9247063398361206, "learning_rate": 0.00018904708520179372, "loss": 1.4336, "step": 987 }, { "epoch": 0.11069717935071846, "grad_norm": 1.236794352531433, "learning_rate": 0.0001890358744394619, "loss": 1.8639, "step": 988 }, { "epoch": 0.11080922103022324, "grad_norm": 1.4858473539352417, "learning_rate": 0.00018902466367713006, "loss": 1.5645, "step": 989 }, { "epoch": 0.11092126270972802, "grad_norm": 1.0515556335449219, "learning_rate": 0.00018901345291479822, "loss": 1.7598, "step": 990 }, { "epoch": 0.1110333043892328, "grad_norm": 1.306551218032837, "learning_rate": 0.0001890022421524664, "loss": 1.3179, "step": 991 }, { "epoch": 0.11114534606873758, "grad_norm": 1.5684454441070557, "learning_rate": 0.00018899103139013453, "loss": 1.3286, "step": 992 }, { "epoch": 0.11125738774824234, "grad_norm": 1.5325795412063599, "learning_rate": 0.0001889798206278027, "loss": 1.2614, "step": 993 }, { "epoch": 0.11136942942774712, "grad_norm": 1.7805489301681519, "learning_rate": 0.00018896860986547087, "loss": 1.9807, "step": 994 }, { "epoch": 0.1114814711072519, "grad_norm": 1.449579119682312, "learning_rate": 0.00018895739910313903, "loss": 1.9432, "step": 995 }, { "epoch": 0.11159351278675668, "grad_norm": 0.9688440561294556, "learning_rate": 0.00018894618834080717, "loss": 1.4219, "step": 996 }, { "epoch": 0.11170555446626144, "grad_norm": 1.2533743381500244, "learning_rate": 0.00018893497757847534, "loss": 1.4873, "step": 997 }, { "epoch": 0.11181759614576622, "grad_norm": 1.3599330186843872, "learning_rate": 0.0001889237668161435, "loss": 1.5301, "step": 998 }, { "epoch": 0.111929637825271, "grad_norm": 1.9801050424575806, "learning_rate": 0.00018891255605381168, "loss": 2.2939, "step": 999 }, { "epoch": 0.11204167950477578, "grad_norm": 2.398099184036255, "learning_rate": 0.00018890134529147982, "loss": 1.6964, "step": 1000 }, { "epoch": 0.11215372118428055, "grad_norm": 1.3079990148544312, "learning_rate": 0.00018889013452914798, "loss": 1.5128, "step": 1001 }, { "epoch": 0.11226576286378533, "grad_norm": 1.622146725654602, "learning_rate": 0.00018887892376681615, "loss": 1.892, "step": 1002 }, { "epoch": 0.1123778045432901, "grad_norm": 1.657357931137085, "learning_rate": 0.00018886771300448432, "loss": 1.3011, "step": 1003 }, { "epoch": 0.11248984622279488, "grad_norm": 1.2244879007339478, "learning_rate": 0.00018885650224215249, "loss": 1.8452, "step": 1004 }, { "epoch": 0.11260188790229965, "grad_norm": 2.2575857639312744, "learning_rate": 0.00018884529147982065, "loss": 1.6385, "step": 1005 }, { "epoch": 0.11271392958180443, "grad_norm": 1.4053219556808472, "learning_rate": 0.0001888340807174888, "loss": 1.871, "step": 1006 }, { "epoch": 0.11282597126130921, "grad_norm": 2.3001794815063477, "learning_rate": 0.00018882286995515696, "loss": 1.7152, "step": 1007 }, { "epoch": 0.11293801294081399, "grad_norm": 1.194921851158142, "learning_rate": 0.00018881165919282513, "loss": 1.4821, "step": 1008 }, { "epoch": 0.11305005462031875, "grad_norm": 1.6533067226409912, "learning_rate": 0.0001888004484304933, "loss": 2.0697, "step": 1009 }, { "epoch": 0.11316209629982353, "grad_norm": 1.2700567245483398, "learning_rate": 0.00018878923766816143, "loss": 1.751, "step": 1010 }, { "epoch": 0.11327413797932831, "grad_norm": 0.9071850776672363, "learning_rate": 0.0001887780269058296, "loss": 1.4875, "step": 1011 }, { "epoch": 0.11338617965883309, "grad_norm": 1.7255955934524536, "learning_rate": 0.00018876681614349777, "loss": 1.4695, "step": 1012 }, { "epoch": 0.11349822133833787, "grad_norm": 1.8656980991363525, "learning_rate": 0.0001887556053811659, "loss": 1.6646, "step": 1013 }, { "epoch": 0.11361026301784263, "grad_norm": 0.826682448387146, "learning_rate": 0.00018874439461883408, "loss": 1.112, "step": 1014 }, { "epoch": 0.11372230469734741, "grad_norm": 1.2118510007858276, "learning_rate": 0.00018873318385650224, "loss": 1.4831, "step": 1015 }, { "epoch": 0.11383434637685219, "grad_norm": 0.9639819860458374, "learning_rate": 0.0001887219730941704, "loss": 2.0193, "step": 1016 }, { "epoch": 0.11394638805635697, "grad_norm": 1.5115575790405273, "learning_rate": 0.00018871076233183858, "loss": 1.6594, "step": 1017 }, { "epoch": 0.11405842973586174, "grad_norm": 1.8071820735931396, "learning_rate": 0.00018869955156950675, "loss": 1.4349, "step": 1018 }, { "epoch": 0.11417047141536651, "grad_norm": 2.1397342681884766, "learning_rate": 0.00018868834080717491, "loss": 1.5237, "step": 1019 }, { "epoch": 0.1142825130948713, "grad_norm": 1.9744607210159302, "learning_rate": 0.00018867713004484305, "loss": 1.9447, "step": 1020 }, { "epoch": 0.11439455477437607, "grad_norm": 1.4146044254302979, "learning_rate": 0.00018866591928251122, "loss": 1.7771, "step": 1021 }, { "epoch": 0.11450659645388084, "grad_norm": 1.4845824241638184, "learning_rate": 0.0001886547085201794, "loss": 1.4631, "step": 1022 }, { "epoch": 0.11461863813338562, "grad_norm": 1.929305076599121, "learning_rate": 0.00018864349775784756, "loss": 2.0898, "step": 1023 }, { "epoch": 0.1147306798128904, "grad_norm": 1.5724400281906128, "learning_rate": 0.0001886322869955157, "loss": 1.4901, "step": 1024 }, { "epoch": 0.11484272149239518, "grad_norm": 1.0998176336288452, "learning_rate": 0.00018862107623318386, "loss": 2.1825, "step": 1025 }, { "epoch": 0.11495476317189994, "grad_norm": 2.2030091285705566, "learning_rate": 0.00018860986547085203, "loss": 1.4978, "step": 1026 }, { "epoch": 0.11506680485140472, "grad_norm": 1.7921961545944214, "learning_rate": 0.00018859865470852017, "loss": 2.2413, "step": 1027 }, { "epoch": 0.1151788465309095, "grad_norm": 1.4612380266189575, "learning_rate": 0.00018858744394618834, "loss": 2.0042, "step": 1028 }, { "epoch": 0.11529088821041428, "grad_norm": 1.618935227394104, "learning_rate": 0.0001885762331838565, "loss": 1.475, "step": 1029 }, { "epoch": 0.11540292988991906, "grad_norm": 2.0939135551452637, "learning_rate": 0.00018856502242152467, "loss": 1.1355, "step": 1030 }, { "epoch": 0.11551497156942382, "grad_norm": 2.4288392066955566, "learning_rate": 0.00018855381165919284, "loss": 1.7629, "step": 1031 }, { "epoch": 0.1156270132489286, "grad_norm": 1.2769538164138794, "learning_rate": 0.000188542600896861, "loss": 1.185, "step": 1032 }, { "epoch": 0.11573905492843338, "grad_norm": 1.9874002933502197, "learning_rate": 0.00018853139013452915, "loss": 1.4859, "step": 1033 }, { "epoch": 0.11585109660793816, "grad_norm": 1.8041077852249146, "learning_rate": 0.00018852017937219732, "loss": 1.6215, "step": 1034 }, { "epoch": 0.11596313828744292, "grad_norm": 1.2754580974578857, "learning_rate": 0.00018850896860986548, "loss": 1.3758, "step": 1035 }, { "epoch": 0.1160751799669477, "grad_norm": 1.6835205554962158, "learning_rate": 0.00018849775784753365, "loss": 1.4382, "step": 1036 }, { "epoch": 0.11618722164645248, "grad_norm": 1.0885504484176636, "learning_rate": 0.0001884865470852018, "loss": 2.0172, "step": 1037 }, { "epoch": 0.11629926332595726, "grad_norm": 1.4596117734909058, "learning_rate": 0.00018847533632286996, "loss": 1.5947, "step": 1038 }, { "epoch": 0.11641130500546203, "grad_norm": 1.7352464199066162, "learning_rate": 0.00018846412556053813, "loss": 1.1332, "step": 1039 }, { "epoch": 0.1165233466849668, "grad_norm": 1.0744255781173706, "learning_rate": 0.0001884529147982063, "loss": 1.2531, "step": 1040 }, { "epoch": 0.11663538836447158, "grad_norm": 2.444344997406006, "learning_rate": 0.00018844170403587443, "loss": 1.4308, "step": 1041 }, { "epoch": 0.11674743004397636, "grad_norm": 1.9086079597473145, "learning_rate": 0.0001884304932735426, "loss": 1.356, "step": 1042 }, { "epoch": 0.11685947172348113, "grad_norm": 1.7402255535125732, "learning_rate": 0.00018841928251121077, "loss": 1.9419, "step": 1043 }, { "epoch": 0.11697151340298591, "grad_norm": 2.132375955581665, "learning_rate": 0.00018840807174887894, "loss": 1.5059, "step": 1044 }, { "epoch": 0.11708355508249069, "grad_norm": 1.5283995866775513, "learning_rate": 0.0001883968609865471, "loss": 1.4188, "step": 1045 }, { "epoch": 0.11719559676199547, "grad_norm": 1.038689136505127, "learning_rate": 0.00018838565022421527, "loss": 1.774, "step": 1046 }, { "epoch": 0.11730763844150024, "grad_norm": 0.6443621516227722, "learning_rate": 0.0001883744394618834, "loss": 1.2983, "step": 1047 }, { "epoch": 0.11741968012100501, "grad_norm": 1.087794303894043, "learning_rate": 0.00018836322869955158, "loss": 1.3035, "step": 1048 }, { "epoch": 0.11753172180050979, "grad_norm": 0.8623591661453247, "learning_rate": 0.00018835201793721975, "loss": 2.1704, "step": 1049 }, { "epoch": 0.11764376348001457, "grad_norm": 1.1607695817947388, "learning_rate": 0.0001883408071748879, "loss": 1.6475, "step": 1050 }, { "epoch": 0.11775580515951935, "grad_norm": 2.268831968307495, "learning_rate": 0.00018832959641255605, "loss": 1.4099, "step": 1051 }, { "epoch": 0.11786784683902411, "grad_norm": 1.1156142950057983, "learning_rate": 0.00018831838565022422, "loss": 1.758, "step": 1052 }, { "epoch": 0.11797988851852889, "grad_norm": 2.2018117904663086, "learning_rate": 0.0001883071748878924, "loss": 1.8489, "step": 1053 }, { "epoch": 0.11809193019803367, "grad_norm": 1.9882720708847046, "learning_rate": 0.00018829596412556055, "loss": 1.2045, "step": 1054 }, { "epoch": 0.11820397187753845, "grad_norm": 1.6095256805419922, "learning_rate": 0.0001882847533632287, "loss": 1.916, "step": 1055 }, { "epoch": 0.11831601355704321, "grad_norm": 2.6211628913879395, "learning_rate": 0.00018827354260089686, "loss": 1.5813, "step": 1056 }, { "epoch": 0.118428055236548, "grad_norm": 1.4501980543136597, "learning_rate": 0.00018826233183856503, "loss": 1.9038, "step": 1057 }, { "epoch": 0.11854009691605277, "grad_norm": 2.1715850830078125, "learning_rate": 0.0001882511210762332, "loss": 1.6545, "step": 1058 }, { "epoch": 0.11865213859555755, "grad_norm": 1.7407867908477783, "learning_rate": 0.00018823991031390136, "loss": 1.2612, "step": 1059 }, { "epoch": 0.11876418027506232, "grad_norm": 1.5661680698394775, "learning_rate": 0.00018822869955156953, "loss": 1.5733, "step": 1060 }, { "epoch": 0.1188762219545671, "grad_norm": 1.1886523962020874, "learning_rate": 0.00018821748878923767, "loss": 1.4596, "step": 1061 }, { "epoch": 0.11898826363407188, "grad_norm": 4.53162956237793, "learning_rate": 0.00018820627802690584, "loss": 1.9712, "step": 1062 }, { "epoch": 0.11910030531357665, "grad_norm": 1.5565561056137085, "learning_rate": 0.000188195067264574, "loss": 1.2968, "step": 1063 }, { "epoch": 0.11921234699308142, "grad_norm": 1.4446288347244263, "learning_rate": 0.00018818385650224215, "loss": 2.0714, "step": 1064 }, { "epoch": 0.1193243886725862, "grad_norm": 2.6644506454467773, "learning_rate": 0.00018817264573991031, "loss": 1.5569, "step": 1065 }, { "epoch": 0.11943643035209098, "grad_norm": 0.9218505024909973, "learning_rate": 0.00018816143497757848, "loss": 1.5679, "step": 1066 }, { "epoch": 0.11954847203159576, "grad_norm": 1.4001480340957642, "learning_rate": 0.00018815022421524665, "loss": 2.1344, "step": 1067 }, { "epoch": 0.11966051371110054, "grad_norm": 1.8936715126037598, "learning_rate": 0.0001881390134529148, "loss": 1.2774, "step": 1068 }, { "epoch": 0.1197725553906053, "grad_norm": 1.3668771982192993, "learning_rate": 0.00018812780269058296, "loss": 1.3055, "step": 1069 }, { "epoch": 0.11988459707011008, "grad_norm": 2.555520534515381, "learning_rate": 0.00018811659192825112, "loss": 1.3283, "step": 1070 }, { "epoch": 0.11999663874961486, "grad_norm": 1.3914583921432495, "learning_rate": 0.0001881053811659193, "loss": 1.2735, "step": 1071 }, { "epoch": 0.12010868042911964, "grad_norm": 1.6742722988128662, "learning_rate": 0.00018809417040358746, "loss": 1.4629, "step": 1072 }, { "epoch": 0.1202207221086244, "grad_norm": 1.2338154315948486, "learning_rate": 0.00018808295964125563, "loss": 1.5971, "step": 1073 }, { "epoch": 0.12033276378812918, "grad_norm": 1.58329176902771, "learning_rate": 0.0001880717488789238, "loss": 1.5856, "step": 1074 }, { "epoch": 0.12044480546763396, "grad_norm": 1.0977815389633179, "learning_rate": 0.00018806053811659193, "loss": 1.7097, "step": 1075 }, { "epoch": 0.12055684714713874, "grad_norm": 1.6461944580078125, "learning_rate": 0.0001880493273542601, "loss": 0.9778, "step": 1076 }, { "epoch": 0.1206688888266435, "grad_norm": 1.7189710140228271, "learning_rate": 0.00018803811659192827, "loss": 1.2516, "step": 1077 }, { "epoch": 0.12078093050614828, "grad_norm": 1.3144108057022095, "learning_rate": 0.0001880269058295964, "loss": 1.9922, "step": 1078 }, { "epoch": 0.12089297218565306, "grad_norm": 1.353492021560669, "learning_rate": 0.00018801569506726458, "loss": 1.2406, "step": 1079 }, { "epoch": 0.12100501386515784, "grad_norm": 1.1186522245407104, "learning_rate": 0.00018800448430493274, "loss": 1.2828, "step": 1080 }, { "epoch": 0.12111705554466261, "grad_norm": 1.7442059516906738, "learning_rate": 0.0001879932735426009, "loss": 1.4498, "step": 1081 }, { "epoch": 0.12122909722416739, "grad_norm": 2.0966644287109375, "learning_rate": 0.00018798206278026905, "loss": 1.8261, "step": 1082 }, { "epoch": 0.12134113890367217, "grad_norm": 1.896040678024292, "learning_rate": 0.00018797085201793722, "loss": 1.5383, "step": 1083 }, { "epoch": 0.12145318058317695, "grad_norm": 1.6941802501678467, "learning_rate": 0.00018795964125560539, "loss": 1.7581, "step": 1084 }, { "epoch": 0.12156522226268172, "grad_norm": 1.2401145696640015, "learning_rate": 0.00018794843049327355, "loss": 1.1561, "step": 1085 }, { "epoch": 0.12167726394218649, "grad_norm": 2.8364408016204834, "learning_rate": 0.00018793721973094172, "loss": 1.0204, "step": 1086 }, { "epoch": 0.12178930562169127, "grad_norm": 2.5821685791015625, "learning_rate": 0.0001879260089686099, "loss": 1.5712, "step": 1087 }, { "epoch": 0.12190134730119605, "grad_norm": 2.3016855716705322, "learning_rate": 0.00018791479820627806, "loss": 1.3134, "step": 1088 }, { "epoch": 0.12201338898070083, "grad_norm": 1.5644071102142334, "learning_rate": 0.0001879035874439462, "loss": 1.4604, "step": 1089 }, { "epoch": 0.12212543066020559, "grad_norm": 1.1575491428375244, "learning_rate": 0.00018789237668161436, "loss": 1.5203, "step": 1090 }, { "epoch": 0.12223747233971037, "grad_norm": 1.940112590789795, "learning_rate": 0.00018788116591928253, "loss": 2.0186, "step": 1091 }, { "epoch": 0.12234951401921515, "grad_norm": 2.6135590076446533, "learning_rate": 0.00018786995515695067, "loss": 2.2598, "step": 1092 }, { "epoch": 0.12246155569871993, "grad_norm": 1.7832342386245728, "learning_rate": 0.00018785874439461884, "loss": 1.7188, "step": 1093 }, { "epoch": 0.1225735973782247, "grad_norm": 1.922625184059143, "learning_rate": 0.000187847533632287, "loss": 1.5241, "step": 1094 }, { "epoch": 0.12268563905772947, "grad_norm": 1.0636463165283203, "learning_rate": 0.00018783632286995517, "loss": 0.9216, "step": 1095 }, { "epoch": 0.12279768073723425, "grad_norm": 0.8798263072967529, "learning_rate": 0.0001878251121076233, "loss": 1.3706, "step": 1096 }, { "epoch": 0.12290972241673903, "grad_norm": 1.016133427619934, "learning_rate": 0.00018781390134529148, "loss": 1.1638, "step": 1097 }, { "epoch": 0.1230217640962438, "grad_norm": 2.573178768157959, "learning_rate": 0.00018780269058295965, "loss": 1.5836, "step": 1098 }, { "epoch": 0.12313380577574858, "grad_norm": 2.0545923709869385, "learning_rate": 0.00018779147982062782, "loss": 1.5082, "step": 1099 }, { "epoch": 0.12324584745525335, "grad_norm": 1.6580615043640137, "learning_rate": 0.00018778026905829598, "loss": 0.9793, "step": 1100 }, { "epoch": 0.12335788913475813, "grad_norm": 1.141614556312561, "learning_rate": 0.00018776905829596415, "loss": 0.9799, "step": 1101 }, { "epoch": 0.12346993081426291, "grad_norm": 1.1152334213256836, "learning_rate": 0.0001877578475336323, "loss": 2.1956, "step": 1102 }, { "epoch": 0.12358197249376768, "grad_norm": 1.4727879762649536, "learning_rate": 0.00018774663677130046, "loss": 1.8364, "step": 1103 }, { "epoch": 0.12369401417327246, "grad_norm": 1.0166385173797607, "learning_rate": 0.00018773542600896862, "loss": 2.0202, "step": 1104 }, { "epoch": 0.12380605585277724, "grad_norm": 1.7606598138809204, "learning_rate": 0.00018772421524663677, "loss": 1.0189, "step": 1105 }, { "epoch": 0.12391809753228201, "grad_norm": 1.2334370613098145, "learning_rate": 0.00018771300448430493, "loss": 1.1034, "step": 1106 }, { "epoch": 0.12403013921178678, "grad_norm": 3.01291823387146, "learning_rate": 0.0001877017937219731, "loss": 1.5772, "step": 1107 }, { "epoch": 0.12414218089129156, "grad_norm": 1.1779786348342896, "learning_rate": 0.00018769058295964127, "loss": 1.6983, "step": 1108 }, { "epoch": 0.12425422257079634, "grad_norm": 0.8913339972496033, "learning_rate": 0.00018767937219730943, "loss": 0.7473, "step": 1109 }, { "epoch": 0.12436626425030112, "grad_norm": 2.0784482955932617, "learning_rate": 0.00018766816143497757, "loss": 1.5849, "step": 1110 }, { "epoch": 0.12447830592980588, "grad_norm": 1.0257015228271484, "learning_rate": 0.00018765695067264574, "loss": 1.4458, "step": 1111 }, { "epoch": 0.12459034760931066, "grad_norm": 1.290440320968628, "learning_rate": 0.0001876457399103139, "loss": 1.2162, "step": 1112 }, { "epoch": 0.12470238928881544, "grad_norm": 1.459418535232544, "learning_rate": 0.00018763452914798208, "loss": 1.7328, "step": 1113 }, { "epoch": 0.12481443096832022, "grad_norm": 1.9463837146759033, "learning_rate": 0.00018762331838565024, "loss": 2.052, "step": 1114 }, { "epoch": 0.12492647264782498, "grad_norm": 1.7049462795257568, "learning_rate": 0.0001876121076233184, "loss": 1.4708, "step": 1115 }, { "epoch": 0.12503851432732976, "grad_norm": 1.590088963508606, "learning_rate": 0.00018760089686098655, "loss": 1.8516, "step": 1116 }, { "epoch": 0.12515055600683453, "grad_norm": 1.5018975734710693, "learning_rate": 0.00018758968609865472, "loss": 1.3066, "step": 1117 }, { "epoch": 0.12526259768633932, "grad_norm": 1.3822623491287231, "learning_rate": 0.0001875784753363229, "loss": 1.4222, "step": 1118 }, { "epoch": 0.1253746393658441, "grad_norm": 1.6622188091278076, "learning_rate": 0.00018756726457399103, "loss": 1.3755, "step": 1119 }, { "epoch": 0.12548668104534888, "grad_norm": 1.6023526191711426, "learning_rate": 0.0001875560538116592, "loss": 1.0596, "step": 1120 }, { "epoch": 0.12559872272485365, "grad_norm": 1.5817945003509521, "learning_rate": 0.00018754484304932736, "loss": 1.6941, "step": 1121 }, { "epoch": 0.1257107644043584, "grad_norm": 1.8528891801834106, "learning_rate": 0.00018753363228699553, "loss": 1.5493, "step": 1122 }, { "epoch": 0.1258228060838632, "grad_norm": 1.275618076324463, "learning_rate": 0.0001875224215246637, "loss": 0.7488, "step": 1123 }, { "epoch": 0.12593484776336797, "grad_norm": 0.8299435377120972, "learning_rate": 0.00018751121076233184, "loss": 1.5441, "step": 1124 }, { "epoch": 0.12604688944287276, "grad_norm": 2.0555694103240967, "learning_rate": 0.0001875, "loss": 1.4558, "step": 1125 }, { "epoch": 0.12615893112237753, "grad_norm": 1.6238466501235962, "learning_rate": 0.00018748878923766817, "loss": 1.5565, "step": 1126 }, { "epoch": 0.1262709728018823, "grad_norm": 0.9960426092147827, "learning_rate": 0.00018747757847533634, "loss": 1.7022, "step": 1127 }, { "epoch": 0.12638301448138708, "grad_norm": 0.8002447485923767, "learning_rate": 0.0001874663677130045, "loss": 1.3421, "step": 1128 }, { "epoch": 0.12649505616089185, "grad_norm": 1.5849549770355225, "learning_rate": 0.00018745515695067267, "loss": 1.135, "step": 1129 }, { "epoch": 0.12660709784039662, "grad_norm": 1.7799932956695557, "learning_rate": 0.00018744394618834081, "loss": 1.5816, "step": 1130 }, { "epoch": 0.1267191395199014, "grad_norm": 1.1657170057296753, "learning_rate": 0.00018743273542600898, "loss": 1.647, "step": 1131 }, { "epoch": 0.12683118119940617, "grad_norm": 0.9074099063873291, "learning_rate": 0.00018742152466367712, "loss": 1.2567, "step": 1132 }, { "epoch": 0.12694322287891097, "grad_norm": 1.904752492904663, "learning_rate": 0.0001874103139013453, "loss": 1.8377, "step": 1133 }, { "epoch": 0.12705526455841573, "grad_norm": 1.517867922782898, "learning_rate": 0.00018739910313901346, "loss": 1.2726, "step": 1134 }, { "epoch": 0.1271673062379205, "grad_norm": 1.6673177480697632, "learning_rate": 0.00018738789237668162, "loss": 1.0909, "step": 1135 }, { "epoch": 0.1272793479174253, "grad_norm": 1.7382824420928955, "learning_rate": 0.0001873766816143498, "loss": 1.5515, "step": 1136 }, { "epoch": 0.12739138959693005, "grad_norm": 2.049703598022461, "learning_rate": 0.00018736547085201793, "loss": 1.2109, "step": 1137 }, { "epoch": 0.12750343127643482, "grad_norm": 1.6853530406951904, "learning_rate": 0.0001873542600896861, "loss": 1.8697, "step": 1138 }, { "epoch": 0.1276154729559396, "grad_norm": 3.273667097091675, "learning_rate": 0.00018734304932735427, "loss": 1.6665, "step": 1139 }, { "epoch": 0.12772751463544438, "grad_norm": 2.3022239208221436, "learning_rate": 0.00018733183856502243, "loss": 1.7674, "step": 1140 }, { "epoch": 0.12783955631494917, "grad_norm": 1.588897466659546, "learning_rate": 0.0001873206278026906, "loss": 1.1445, "step": 1141 }, { "epoch": 0.12795159799445394, "grad_norm": 1.6422961950302124, "learning_rate": 0.00018730941704035877, "loss": 1.6872, "step": 1142 }, { "epoch": 0.1280636396739587, "grad_norm": 1.9040683507919312, "learning_rate": 0.00018729820627802694, "loss": 1.6709, "step": 1143 }, { "epoch": 0.1281756813534635, "grad_norm": 1.63062584400177, "learning_rate": 0.00018728699551569508, "loss": 1.7314, "step": 1144 }, { "epoch": 0.12828772303296826, "grad_norm": 1.3890410661697388, "learning_rate": 0.00018727578475336324, "loss": 2.1511, "step": 1145 }, { "epoch": 0.12839976471247305, "grad_norm": 1.5317187309265137, "learning_rate": 0.00018726457399103138, "loss": 1.3547, "step": 1146 }, { "epoch": 0.12851180639197782, "grad_norm": 1.363296627998352, "learning_rate": 0.00018725336322869955, "loss": 1.6723, "step": 1147 }, { "epoch": 0.12862384807148258, "grad_norm": 1.0610668659210205, "learning_rate": 0.00018724215246636772, "loss": 1.3775, "step": 1148 }, { "epoch": 0.12873588975098738, "grad_norm": 0.9194682240486145, "learning_rate": 0.00018723094170403589, "loss": 1.6484, "step": 1149 }, { "epoch": 0.12884793143049214, "grad_norm": 0.844358503818512, "learning_rate": 0.00018721973094170405, "loss": 1.61, "step": 1150 }, { "epoch": 0.1289599731099969, "grad_norm": 0.9679080247879028, "learning_rate": 0.0001872085201793722, "loss": 1.0871, "step": 1151 }, { "epoch": 0.1290720147895017, "grad_norm": 1.498260736465454, "learning_rate": 0.00018719730941704036, "loss": 1.3617, "step": 1152 }, { "epoch": 0.12918405646900646, "grad_norm": 1.5754985809326172, "learning_rate": 0.00018718609865470853, "loss": 0.9979, "step": 1153 }, { "epoch": 0.12929609814851126, "grad_norm": 1.1075730323791504, "learning_rate": 0.0001871748878923767, "loss": 1.8124, "step": 1154 }, { "epoch": 0.12940813982801602, "grad_norm": 1.0610558986663818, "learning_rate": 0.00018716367713004486, "loss": 0.8218, "step": 1155 }, { "epoch": 0.1295201815075208, "grad_norm": 1.4933853149414062, "learning_rate": 0.00018715246636771303, "loss": 2.1818, "step": 1156 }, { "epoch": 0.12963222318702558, "grad_norm": 1.461762547492981, "learning_rate": 0.00018714125560538117, "loss": 1.3651, "step": 1157 }, { "epoch": 0.12974426486653035, "grad_norm": 1.732432246208191, "learning_rate": 0.00018713004484304934, "loss": 1.3644, "step": 1158 }, { "epoch": 0.12985630654603514, "grad_norm": 1.3825899362564087, "learning_rate": 0.00018711883408071748, "loss": 1.4159, "step": 1159 }, { "epoch": 0.1299683482255399, "grad_norm": 2.024397611618042, "learning_rate": 0.00018710762331838564, "loss": 1.727, "step": 1160 }, { "epoch": 0.13008038990504467, "grad_norm": 1.9614179134368896, "learning_rate": 0.0001870964125560538, "loss": 2.0591, "step": 1161 }, { "epoch": 0.13019243158454946, "grad_norm": 3.1303272247314453, "learning_rate": 0.00018708520179372198, "loss": 1.4059, "step": 1162 }, { "epoch": 0.13030447326405423, "grad_norm": 2.2398760318756104, "learning_rate": 0.00018707399103139015, "loss": 1.0906, "step": 1163 }, { "epoch": 0.130416514943559, "grad_norm": 2.685075521469116, "learning_rate": 0.00018706278026905831, "loss": 1.769, "step": 1164 }, { "epoch": 0.13052855662306379, "grad_norm": 1.7581768035888672, "learning_rate": 0.00018705156950672645, "loss": 1.8656, "step": 1165 }, { "epoch": 0.13064059830256855, "grad_norm": 1.5590026378631592, "learning_rate": 0.00018704035874439462, "loss": 1.7081, "step": 1166 }, { "epoch": 0.13075263998207334, "grad_norm": 1.4943839311599731, "learning_rate": 0.0001870291479820628, "loss": 1.2293, "step": 1167 }, { "epoch": 0.1308646816615781, "grad_norm": 1.1355737447738647, "learning_rate": 0.00018701793721973096, "loss": 0.8271, "step": 1168 }, { "epoch": 0.13097672334108287, "grad_norm": 1.563325047492981, "learning_rate": 0.00018700672645739912, "loss": 1.3113, "step": 1169 }, { "epoch": 0.13108876502058767, "grad_norm": 1.5260415077209473, "learning_rate": 0.0001869955156950673, "loss": 2.0799, "step": 1170 }, { "epoch": 0.13120080670009243, "grad_norm": 2.342811346054077, "learning_rate": 0.00018698430493273543, "loss": 2.0438, "step": 1171 }, { "epoch": 0.1313128483795972, "grad_norm": 2.5999746322631836, "learning_rate": 0.0001869730941704036, "loss": 1.8207, "step": 1172 }, { "epoch": 0.131424890059102, "grad_norm": 1.3597218990325928, "learning_rate": 0.00018696188340807174, "loss": 0.8327, "step": 1173 }, { "epoch": 0.13153693173860675, "grad_norm": 1.8815906047821045, "learning_rate": 0.0001869506726457399, "loss": 1.3862, "step": 1174 }, { "epoch": 0.13164897341811155, "grad_norm": 0.9034279584884644, "learning_rate": 0.00018693946188340807, "loss": 1.3602, "step": 1175 }, { "epoch": 0.1317610150976163, "grad_norm": 1.3076611757278442, "learning_rate": 0.00018692825112107624, "loss": 1.2712, "step": 1176 }, { "epoch": 0.13187305677712108, "grad_norm": 2.2324326038360596, "learning_rate": 0.0001869170403587444, "loss": 2.3027, "step": 1177 }, { "epoch": 0.13198509845662587, "grad_norm": 1.6875048875808716, "learning_rate": 0.00018690582959641258, "loss": 1.4937, "step": 1178 }, { "epoch": 0.13209714013613064, "grad_norm": 1.324818730354309, "learning_rate": 0.00018689461883408072, "loss": 0.6691, "step": 1179 }, { "epoch": 0.13220918181563543, "grad_norm": 1.6373106241226196, "learning_rate": 0.00018688340807174888, "loss": 1.8573, "step": 1180 }, { "epoch": 0.1323212234951402, "grad_norm": 1.4050589799880981, "learning_rate": 0.00018687219730941705, "loss": 1.5664, "step": 1181 }, { "epoch": 0.13243326517464496, "grad_norm": 1.8042280673980713, "learning_rate": 0.00018686098654708522, "loss": 1.3923, "step": 1182 }, { "epoch": 0.13254530685414975, "grad_norm": 1.422881841659546, "learning_rate": 0.00018684977578475339, "loss": 1.1698, "step": 1183 }, { "epoch": 0.13265734853365452, "grad_norm": 1.9511743783950806, "learning_rate": 0.00018683856502242155, "loss": 1.3844, "step": 1184 }, { "epoch": 0.13276939021315928, "grad_norm": 1.3422256708145142, "learning_rate": 0.0001868273542600897, "loss": 1.5877, "step": 1185 }, { "epoch": 0.13288143189266408, "grad_norm": 2.0319533348083496, "learning_rate": 0.00018681614349775786, "loss": 1.4173, "step": 1186 }, { "epoch": 0.13299347357216884, "grad_norm": 1.4657912254333496, "learning_rate": 0.000186804932735426, "loss": 1.1278, "step": 1187 }, { "epoch": 0.13310551525167363, "grad_norm": 1.6324150562286377, "learning_rate": 0.00018679372197309417, "loss": 1.6808, "step": 1188 }, { "epoch": 0.1332175569311784, "grad_norm": 1.9429125785827637, "learning_rate": 0.00018678251121076234, "loss": 1.5367, "step": 1189 }, { "epoch": 0.13332959861068316, "grad_norm": 1.4946774244308472, "learning_rate": 0.0001867713004484305, "loss": 1.5259, "step": 1190 }, { "epoch": 0.13344164029018796, "grad_norm": 2.3331472873687744, "learning_rate": 0.00018676008968609867, "loss": 1.9603, "step": 1191 }, { "epoch": 0.13355368196969272, "grad_norm": 1.024499535560608, "learning_rate": 0.0001867488789237668, "loss": 1.1313, "step": 1192 }, { "epoch": 0.1336657236491975, "grad_norm": 1.15455162525177, "learning_rate": 0.00018673766816143498, "loss": 1.6294, "step": 1193 }, { "epoch": 0.13377776532870228, "grad_norm": 1.0431082248687744, "learning_rate": 0.00018672645739910315, "loss": 1.0516, "step": 1194 }, { "epoch": 0.13388980700820705, "grad_norm": 1.108364224433899, "learning_rate": 0.0001867152466367713, "loss": 1.6757, "step": 1195 }, { "epoch": 0.13400184868771184, "grad_norm": 3.117595672607422, "learning_rate": 0.00018670403587443948, "loss": 1.5524, "step": 1196 }, { "epoch": 0.1341138903672166, "grad_norm": 1.3483400344848633, "learning_rate": 0.00018669282511210765, "loss": 1.6452, "step": 1197 }, { "epoch": 0.13422593204672137, "grad_norm": 1.2711856365203857, "learning_rate": 0.00018668161434977581, "loss": 1.377, "step": 1198 }, { "epoch": 0.13433797372622616, "grad_norm": 1.1080363988876343, "learning_rate": 0.00018667040358744395, "loss": 1.7001, "step": 1199 }, { "epoch": 0.13445001540573093, "grad_norm": 1.7139546871185303, "learning_rate": 0.0001866591928251121, "loss": 1.892, "step": 1200 }, { "epoch": 0.13456205708523572, "grad_norm": 2.0561530590057373, "learning_rate": 0.00018664798206278026, "loss": 1.8216, "step": 1201 }, { "epoch": 0.13467409876474049, "grad_norm": 1.1964876651763916, "learning_rate": 0.00018663677130044843, "loss": 2.0122, "step": 1202 }, { "epoch": 0.13478614044424525, "grad_norm": 1.5439486503601074, "learning_rate": 0.0001866255605381166, "loss": 1.7851, "step": 1203 }, { "epoch": 0.13489818212375004, "grad_norm": 1.58514404296875, "learning_rate": 0.00018661434977578476, "loss": 1.32, "step": 1204 }, { "epoch": 0.1350102238032548, "grad_norm": 1.157100796699524, "learning_rate": 0.00018660313901345293, "loss": 1.5565, "step": 1205 }, { "epoch": 0.13512226548275957, "grad_norm": 1.5880330801010132, "learning_rate": 0.00018659192825112107, "loss": 1.7532, "step": 1206 }, { "epoch": 0.13523430716226437, "grad_norm": 1.18852698802948, "learning_rate": 0.00018658071748878924, "loss": 1.7699, "step": 1207 }, { "epoch": 0.13534634884176913, "grad_norm": 2.180375576019287, "learning_rate": 0.0001865695067264574, "loss": 1.6202, "step": 1208 }, { "epoch": 0.13545839052127392, "grad_norm": 1.2764447927474976, "learning_rate": 0.00018655829596412557, "loss": 1.5081, "step": 1209 }, { "epoch": 0.1355704322007787, "grad_norm": 1.473776936531067, "learning_rate": 0.00018654708520179374, "loss": 1.6314, "step": 1210 }, { "epoch": 0.13568247388028346, "grad_norm": 1.208632469177246, "learning_rate": 0.0001865358744394619, "loss": 0.9372, "step": 1211 }, { "epoch": 0.13579451555978825, "grad_norm": 1.8673516511917114, "learning_rate": 0.00018652466367713005, "loss": 1.5313, "step": 1212 }, { "epoch": 0.135906557239293, "grad_norm": 1.821739912033081, "learning_rate": 0.00018651345291479822, "loss": 2.0428, "step": 1213 }, { "epoch": 0.1360185989187978, "grad_norm": 1.7214354276657104, "learning_rate": 0.00018650224215246636, "loss": 1.6392, "step": 1214 }, { "epoch": 0.13613064059830257, "grad_norm": 1.4541807174682617, "learning_rate": 0.00018649103139013452, "loss": 0.8325, "step": 1215 }, { "epoch": 0.13624268227780734, "grad_norm": 1.1090161800384521, "learning_rate": 0.0001864798206278027, "loss": 1.2129, "step": 1216 }, { "epoch": 0.13635472395731213, "grad_norm": 1.208639144897461, "learning_rate": 0.00018646860986547086, "loss": 1.4718, "step": 1217 }, { "epoch": 0.1364667656368169, "grad_norm": 1.3442260026931763, "learning_rate": 0.00018645739910313903, "loss": 1.853, "step": 1218 }, { "epoch": 0.13657880731632166, "grad_norm": 1.9982670545578003, "learning_rate": 0.0001864461883408072, "loss": 1.7065, "step": 1219 }, { "epoch": 0.13669084899582645, "grad_norm": 2.113999128341675, "learning_rate": 0.00018643497757847533, "loss": 1.7311, "step": 1220 }, { "epoch": 0.13680289067533122, "grad_norm": 1.686738133430481, "learning_rate": 0.0001864237668161435, "loss": 1.8911, "step": 1221 }, { "epoch": 0.136914932354836, "grad_norm": 1.411128044128418, "learning_rate": 0.00018641255605381167, "loss": 1.6073, "step": 1222 }, { "epoch": 0.13702697403434078, "grad_norm": 1.0301393270492554, "learning_rate": 0.00018640134529147984, "loss": 1.4571, "step": 1223 }, { "epoch": 0.13713901571384554, "grad_norm": 1.5304756164550781, "learning_rate": 0.000186390134529148, "loss": 1.4121, "step": 1224 }, { "epoch": 0.13725105739335033, "grad_norm": 1.592961311340332, "learning_rate": 0.00018637892376681617, "loss": 1.945, "step": 1225 }, { "epoch": 0.1373630990728551, "grad_norm": 1.3797760009765625, "learning_rate": 0.0001863677130044843, "loss": 1.565, "step": 1226 }, { "epoch": 0.13747514075235986, "grad_norm": 0.8853676915168762, "learning_rate": 0.00018635650224215245, "loss": 1.7007, "step": 1227 }, { "epoch": 0.13758718243186466, "grad_norm": 1.7784823179244995, "learning_rate": 0.00018634529147982062, "loss": 1.6425, "step": 1228 }, { "epoch": 0.13769922411136942, "grad_norm": 1.7005332708358765, "learning_rate": 0.00018633408071748879, "loss": 1.5095, "step": 1229 }, { "epoch": 0.13781126579087422, "grad_norm": 1.2346413135528564, "learning_rate": 0.00018632286995515695, "loss": 2.1648, "step": 1230 }, { "epoch": 0.13792330747037898, "grad_norm": 2.2326791286468506, "learning_rate": 0.00018631165919282512, "loss": 1.5491, "step": 1231 }, { "epoch": 0.13803534914988375, "grad_norm": 4.305312156677246, "learning_rate": 0.0001863004484304933, "loss": 1.8952, "step": 1232 }, { "epoch": 0.13814739082938854, "grad_norm": 2.1103789806365967, "learning_rate": 0.00018628923766816146, "loss": 1.6568, "step": 1233 }, { "epoch": 0.1382594325088933, "grad_norm": 1.5490082502365112, "learning_rate": 0.0001862780269058296, "loss": 1.385, "step": 1234 }, { "epoch": 0.1383714741883981, "grad_norm": 1.1355441808700562, "learning_rate": 0.00018626681614349776, "loss": 1.5701, "step": 1235 }, { "epoch": 0.13848351586790286, "grad_norm": 1.808889627456665, "learning_rate": 0.00018625560538116593, "loss": 1.0673, "step": 1236 }, { "epoch": 0.13859555754740763, "grad_norm": 1.0763615369796753, "learning_rate": 0.0001862443946188341, "loss": 1.3235, "step": 1237 }, { "epoch": 0.13870759922691242, "grad_norm": 1.256640076637268, "learning_rate": 0.00018623318385650227, "loss": 1.2683, "step": 1238 }, { "epoch": 0.13881964090641719, "grad_norm": 2.702099084854126, "learning_rate": 0.00018622197309417043, "loss": 1.2475, "step": 1239 }, { "epoch": 0.13893168258592195, "grad_norm": 3.7516696453094482, "learning_rate": 0.00018621076233183857, "loss": 1.7416, "step": 1240 }, { "epoch": 0.13904372426542674, "grad_norm": 2.0176427364349365, "learning_rate": 0.0001861995515695067, "loss": 1.4691, "step": 1241 }, { "epoch": 0.1391557659449315, "grad_norm": 1.548699140548706, "learning_rate": 0.00018618834080717488, "loss": 1.1047, "step": 1242 }, { "epoch": 0.1392678076244363, "grad_norm": 1.5286359786987305, "learning_rate": 0.00018617713004484305, "loss": 1.439, "step": 1243 }, { "epoch": 0.13937984930394107, "grad_norm": 2.5854148864746094, "learning_rate": 0.00018616591928251122, "loss": 1.7089, "step": 1244 }, { "epoch": 0.13949189098344583, "grad_norm": 2.666842222213745, "learning_rate": 0.00018615470852017938, "loss": 1.5907, "step": 1245 }, { "epoch": 0.13960393266295062, "grad_norm": 1.6597497463226318, "learning_rate": 0.00018614349775784755, "loss": 1.6886, "step": 1246 }, { "epoch": 0.1397159743424554, "grad_norm": 1.1364705562591553, "learning_rate": 0.0001861322869955157, "loss": 1.3397, "step": 1247 }, { "epoch": 0.13982801602196016, "grad_norm": 1.8391960859298706, "learning_rate": 0.00018612107623318386, "loss": 1.6837, "step": 1248 }, { "epoch": 0.13994005770146495, "grad_norm": 1.7948687076568604, "learning_rate": 0.00018610986547085202, "loss": 1.3846, "step": 1249 }, { "epoch": 0.1400520993809697, "grad_norm": 1.7554460763931274, "learning_rate": 0.0001860986547085202, "loss": 1.795, "step": 1250 }, { "epoch": 0.1401641410604745, "grad_norm": 2.6685876846313477, "learning_rate": 0.00018608744394618836, "loss": 1.871, "step": 1251 }, { "epoch": 0.14027618273997927, "grad_norm": 3.07920241355896, "learning_rate": 0.00018607623318385653, "loss": 1.8532, "step": 1252 }, { "epoch": 0.14038822441948404, "grad_norm": 1.006218671798706, "learning_rate": 0.0001860650224215247, "loss": 0.9778, "step": 1253 }, { "epoch": 0.14050026609898883, "grad_norm": 1.3976384401321411, "learning_rate": 0.00018605381165919283, "loss": 1.3723, "step": 1254 }, { "epoch": 0.1406123077784936, "grad_norm": 1.4148651361465454, "learning_rate": 0.00018604260089686097, "loss": 1.6448, "step": 1255 }, { "epoch": 0.1407243494579984, "grad_norm": 1.841371774673462, "learning_rate": 0.00018603139013452914, "loss": 2.0605, "step": 1256 }, { "epoch": 0.14083639113750315, "grad_norm": 3.004255771636963, "learning_rate": 0.0001860201793721973, "loss": 1.4481, "step": 1257 }, { "epoch": 0.14094843281700792, "grad_norm": 2.825855016708374, "learning_rate": 0.00018600896860986548, "loss": 2.0836, "step": 1258 }, { "epoch": 0.1410604744965127, "grad_norm": 1.416593074798584, "learning_rate": 0.00018599775784753364, "loss": 0.8309, "step": 1259 }, { "epoch": 0.14117251617601748, "grad_norm": 1.641356348991394, "learning_rate": 0.0001859865470852018, "loss": 1.2876, "step": 1260 }, { "epoch": 0.14128455785552224, "grad_norm": 2.3959012031555176, "learning_rate": 0.00018597533632286995, "loss": 1.5824, "step": 1261 }, { "epoch": 0.14139659953502703, "grad_norm": 1.4487465620040894, "learning_rate": 0.00018596412556053812, "loss": 1.6875, "step": 1262 }, { "epoch": 0.1415086412145318, "grad_norm": 1.827488660812378, "learning_rate": 0.0001859529147982063, "loss": 2.3295, "step": 1263 }, { "epoch": 0.1416206828940366, "grad_norm": 1.6803877353668213, "learning_rate": 0.00018594170403587445, "loss": 1.9491, "step": 1264 }, { "epoch": 0.14173272457354136, "grad_norm": 1.0207635164260864, "learning_rate": 0.00018593049327354262, "loss": 1.6239, "step": 1265 }, { "epoch": 0.14184476625304612, "grad_norm": 1.213748812675476, "learning_rate": 0.0001859192825112108, "loss": 1.2224, "step": 1266 }, { "epoch": 0.14195680793255092, "grad_norm": 1.293150782585144, "learning_rate": 0.00018590807174887893, "loss": 1.2851, "step": 1267 }, { "epoch": 0.14206884961205568, "grad_norm": 1.3778995275497437, "learning_rate": 0.0001858968609865471, "loss": 1.1929, "step": 1268 }, { "epoch": 0.14218089129156047, "grad_norm": 1.1147325038909912, "learning_rate": 0.00018588565022421524, "loss": 1.2899, "step": 1269 }, { "epoch": 0.14229293297106524, "grad_norm": 2.3289849758148193, "learning_rate": 0.0001858744394618834, "loss": 2.1569, "step": 1270 }, { "epoch": 0.14240497465057, "grad_norm": 1.224012851715088, "learning_rate": 0.00018586322869955157, "loss": 0.8568, "step": 1271 }, { "epoch": 0.1425170163300748, "grad_norm": 2.1730384826660156, "learning_rate": 0.00018585201793721974, "loss": 1.9843, "step": 1272 }, { "epoch": 0.14262905800957956, "grad_norm": 1.66231107711792, "learning_rate": 0.0001858408071748879, "loss": 1.7234, "step": 1273 }, { "epoch": 0.14274109968908433, "grad_norm": 1.3478394746780396, "learning_rate": 0.00018582959641255607, "loss": 1.3514, "step": 1274 }, { "epoch": 0.14285314136858912, "grad_norm": 2.098986864089966, "learning_rate": 0.00018581838565022421, "loss": 1.3902, "step": 1275 }, { "epoch": 0.14296518304809389, "grad_norm": 1.2753535509109497, "learning_rate": 0.00018580717488789238, "loss": 1.6886, "step": 1276 }, { "epoch": 0.14307722472759868, "grad_norm": 1.3885709047317505, "learning_rate": 0.00018579596412556055, "loss": 1.6189, "step": 1277 }, { "epoch": 0.14318926640710344, "grad_norm": 1.5166406631469727, "learning_rate": 0.00018578475336322872, "loss": 1.875, "step": 1278 }, { "epoch": 0.1433013080866082, "grad_norm": 1.7424076795578003, "learning_rate": 0.00018577354260089688, "loss": 1.6976, "step": 1279 }, { "epoch": 0.143413349766113, "grad_norm": 1.5474110841751099, "learning_rate": 0.00018576233183856505, "loss": 1.125, "step": 1280 }, { "epoch": 0.14352539144561777, "grad_norm": 2.0098912715911865, "learning_rate": 0.0001857511210762332, "loss": 0.9667, "step": 1281 }, { "epoch": 0.14363743312512253, "grad_norm": 2.0677802562713623, "learning_rate": 0.00018573991031390133, "loss": 1.4924, "step": 1282 }, { "epoch": 0.14374947480462733, "grad_norm": 1.0024768114089966, "learning_rate": 0.0001857286995515695, "loss": 1.7901, "step": 1283 }, { "epoch": 0.1438615164841321, "grad_norm": 1.5800038576126099, "learning_rate": 0.00018571748878923767, "loss": 1.3799, "step": 1284 }, { "epoch": 0.14397355816363688, "grad_norm": 1.2554512023925781, "learning_rate": 0.00018570627802690583, "loss": 1.5672, "step": 1285 }, { "epoch": 0.14408559984314165, "grad_norm": 1.3319966793060303, "learning_rate": 0.000185695067264574, "loss": 1.499, "step": 1286 }, { "epoch": 0.1441976415226464, "grad_norm": 1.9264663457870483, "learning_rate": 0.00018568385650224217, "loss": 1.4644, "step": 1287 }, { "epoch": 0.1443096832021512, "grad_norm": 2.553652048110962, "learning_rate": 0.00018567264573991034, "loss": 1.1725, "step": 1288 }, { "epoch": 0.14442172488165597, "grad_norm": 2.7755637168884277, "learning_rate": 0.00018566143497757848, "loss": 1.5118, "step": 1289 }, { "epoch": 0.14453376656116076, "grad_norm": 1.0561319589614868, "learning_rate": 0.00018565022421524664, "loss": 1.6604, "step": 1290 }, { "epoch": 0.14464580824066553, "grad_norm": 2.3351409435272217, "learning_rate": 0.0001856390134529148, "loss": 1.6422, "step": 1291 }, { "epoch": 0.1447578499201703, "grad_norm": 1.5291962623596191, "learning_rate": 0.00018562780269058298, "loss": 1.543, "step": 1292 }, { "epoch": 0.1448698915996751, "grad_norm": 2.3980679512023926, "learning_rate": 0.00018561659192825114, "loss": 1.5791, "step": 1293 }, { "epoch": 0.14498193327917985, "grad_norm": 1.483687400817871, "learning_rate": 0.0001856053811659193, "loss": 1.4208, "step": 1294 }, { "epoch": 0.14509397495868462, "grad_norm": 1.1106950044631958, "learning_rate": 0.00018559417040358745, "loss": 1.7484, "step": 1295 }, { "epoch": 0.1452060166381894, "grad_norm": 1.1540908813476562, "learning_rate": 0.0001855829596412556, "loss": 1.6341, "step": 1296 }, { "epoch": 0.14531805831769418, "grad_norm": 1.3235082626342773, "learning_rate": 0.00018557174887892376, "loss": 1.5339, "step": 1297 }, { "epoch": 0.14543009999719897, "grad_norm": 1.6723026037216187, "learning_rate": 0.00018556053811659193, "loss": 2.039, "step": 1298 }, { "epoch": 0.14554214167670373, "grad_norm": 1.72067391872406, "learning_rate": 0.0001855493273542601, "loss": 1.3159, "step": 1299 }, { "epoch": 0.1456541833562085, "grad_norm": 2.1962954998016357, "learning_rate": 0.00018553811659192826, "loss": 1.578, "step": 1300 }, { "epoch": 0.1457662250357133, "grad_norm": 1.4528141021728516, "learning_rate": 0.00018552690582959643, "loss": 1.6816, "step": 1301 }, { "epoch": 0.14587826671521806, "grad_norm": 1.3324685096740723, "learning_rate": 0.00018551569506726457, "loss": 1.8089, "step": 1302 }, { "epoch": 0.14599030839472282, "grad_norm": 2.017906427383423, "learning_rate": 0.00018550448430493274, "loss": 1.3316, "step": 1303 }, { "epoch": 0.14610235007422762, "grad_norm": 2.0503246784210205, "learning_rate": 0.0001854932735426009, "loss": 1.451, "step": 1304 }, { "epoch": 0.14621439175373238, "grad_norm": 1.582655429840088, "learning_rate": 0.00018548206278026907, "loss": 1.6842, "step": 1305 }, { "epoch": 0.14632643343323717, "grad_norm": 0.9524270296096802, "learning_rate": 0.00018547085201793724, "loss": 1.1949, "step": 1306 }, { "epoch": 0.14643847511274194, "grad_norm": 1.6498924493789673, "learning_rate": 0.0001854596412556054, "loss": 1.01, "step": 1307 }, { "epoch": 0.1465505167922467, "grad_norm": 4.007920265197754, "learning_rate": 0.00018544843049327357, "loss": 1.8076, "step": 1308 }, { "epoch": 0.1466625584717515, "grad_norm": 1.3863952159881592, "learning_rate": 0.00018543721973094171, "loss": 1.1208, "step": 1309 }, { "epoch": 0.14677460015125626, "grad_norm": 1.2241241931915283, "learning_rate": 0.00018542600896860985, "loss": 1.8501, "step": 1310 }, { "epoch": 0.14688664183076106, "grad_norm": 1.4118918180465698, "learning_rate": 0.00018541479820627802, "loss": 1.6142, "step": 1311 }, { "epoch": 0.14699868351026582, "grad_norm": 1.445029377937317, "learning_rate": 0.0001854035874439462, "loss": 1.5014, "step": 1312 }, { "epoch": 0.14711072518977059, "grad_norm": 1.19560968875885, "learning_rate": 0.00018539237668161436, "loss": 1.8406, "step": 1313 }, { "epoch": 0.14722276686927538, "grad_norm": 1.1170055866241455, "learning_rate": 0.00018538116591928252, "loss": 1.5343, "step": 1314 }, { "epoch": 0.14733480854878014, "grad_norm": 1.5472997426986694, "learning_rate": 0.0001853699551569507, "loss": 0.7617, "step": 1315 }, { "epoch": 0.1474468502282849, "grad_norm": 1.6267352104187012, "learning_rate": 0.00018535874439461883, "loss": 1.5164, "step": 1316 }, { "epoch": 0.1475588919077897, "grad_norm": 2.4575016498565674, "learning_rate": 0.000185347533632287, "loss": 1.2963, "step": 1317 }, { "epoch": 0.14767093358729447, "grad_norm": 1.4293317794799805, "learning_rate": 0.00018533632286995517, "loss": 0.9327, "step": 1318 }, { "epoch": 0.14778297526679926, "grad_norm": 1.8205885887145996, "learning_rate": 0.00018532511210762333, "loss": 1.227, "step": 1319 }, { "epoch": 0.14789501694630403, "grad_norm": 1.1970815658569336, "learning_rate": 0.0001853139013452915, "loss": 1.3014, "step": 1320 }, { "epoch": 0.1480070586258088, "grad_norm": 1.4094595909118652, "learning_rate": 0.00018530269058295967, "loss": 1.2993, "step": 1321 }, { "epoch": 0.14811910030531358, "grad_norm": 1.8952871561050415, "learning_rate": 0.00018529147982062784, "loss": 1.6363, "step": 1322 }, { "epoch": 0.14823114198481835, "grad_norm": 1.3200700283050537, "learning_rate": 0.00018528026905829598, "loss": 1.4436, "step": 1323 }, { "epoch": 0.14834318366432314, "grad_norm": 1.645381212234497, "learning_rate": 0.00018526905829596412, "loss": 1.0091, "step": 1324 }, { "epoch": 0.1484552253438279, "grad_norm": 1.1168032884597778, "learning_rate": 0.00018525784753363228, "loss": 1.418, "step": 1325 }, { "epoch": 0.14856726702333267, "grad_norm": 1.7876304388046265, "learning_rate": 0.00018524663677130045, "loss": 1.8566, "step": 1326 }, { "epoch": 0.14867930870283746, "grad_norm": 3.448566198348999, "learning_rate": 0.00018523542600896862, "loss": 1.3079, "step": 1327 }, { "epoch": 0.14879135038234223, "grad_norm": 1.514494776725769, "learning_rate": 0.00018522421524663679, "loss": 1.3778, "step": 1328 }, { "epoch": 0.148903392061847, "grad_norm": 1.6492645740509033, "learning_rate": 0.00018521300448430495, "loss": 1.6713, "step": 1329 }, { "epoch": 0.1490154337413518, "grad_norm": 0.8000540137290955, "learning_rate": 0.0001852017937219731, "loss": 0.9988, "step": 1330 }, { "epoch": 0.14912747542085655, "grad_norm": 1.6506764888763428, "learning_rate": 0.00018519058295964126, "loss": 0.7257, "step": 1331 }, { "epoch": 0.14923951710036135, "grad_norm": 1.6058783531188965, "learning_rate": 0.00018517937219730943, "loss": 1.9291, "step": 1332 }, { "epoch": 0.1493515587798661, "grad_norm": 1.6950197219848633, "learning_rate": 0.0001851681614349776, "loss": 2.1287, "step": 1333 }, { "epoch": 0.14946360045937088, "grad_norm": 1.4144039154052734, "learning_rate": 0.00018515695067264576, "loss": 1.407, "step": 1334 }, { "epoch": 0.14957564213887567, "grad_norm": 1.2338058948516846, "learning_rate": 0.00018514573991031393, "loss": 1.0039, "step": 1335 }, { "epoch": 0.14968768381838043, "grad_norm": 1.2859474420547485, "learning_rate": 0.00018513452914798207, "loss": 1.9259, "step": 1336 }, { "epoch": 0.1497997254978852, "grad_norm": 3.364668607711792, "learning_rate": 0.0001851233183856502, "loss": 1.1327, "step": 1337 }, { "epoch": 0.14991176717739, "grad_norm": 2.369525194168091, "learning_rate": 0.00018511210762331838, "loss": 1.8997, "step": 1338 }, { "epoch": 0.15002380885689476, "grad_norm": 1.727318525314331, "learning_rate": 0.00018510089686098655, "loss": 1.0789, "step": 1339 }, { "epoch": 0.15013585053639955, "grad_norm": 1.4991228580474854, "learning_rate": 0.0001850896860986547, "loss": 1.0931, "step": 1340 }, { "epoch": 0.15024789221590432, "grad_norm": 1.4146784543991089, "learning_rate": 0.00018507847533632288, "loss": 1.3284, "step": 1341 }, { "epoch": 0.15035993389540908, "grad_norm": 1.756340742111206, "learning_rate": 0.00018506726457399105, "loss": 1.2776, "step": 1342 }, { "epoch": 0.15047197557491387, "grad_norm": 1.5348256826400757, "learning_rate": 0.00018505605381165921, "loss": 1.6361, "step": 1343 }, { "epoch": 0.15058401725441864, "grad_norm": 1.1699140071868896, "learning_rate": 0.00018504484304932736, "loss": 1.0129, "step": 1344 }, { "epoch": 0.15069605893392343, "grad_norm": 2.6281192302703857, "learning_rate": 0.00018503363228699552, "loss": 1.9738, "step": 1345 }, { "epoch": 0.1508081006134282, "grad_norm": 1.6678053140640259, "learning_rate": 0.0001850224215246637, "loss": 1.627, "step": 1346 }, { "epoch": 0.15092014229293296, "grad_norm": 1.042446494102478, "learning_rate": 0.00018501121076233186, "loss": 1.8934, "step": 1347 }, { "epoch": 0.15103218397243776, "grad_norm": 1.1812924146652222, "learning_rate": 0.00018500000000000002, "loss": 1.5497, "step": 1348 }, { "epoch": 0.15114422565194252, "grad_norm": 1.415547251701355, "learning_rate": 0.0001849887892376682, "loss": 1.4292, "step": 1349 }, { "epoch": 0.15125626733144729, "grad_norm": 1.2310893535614014, "learning_rate": 0.00018497757847533633, "loss": 1.9885, "step": 1350 }, { "epoch": 0.15136830901095208, "grad_norm": 1.338964581489563, "learning_rate": 0.00018496636771300447, "loss": 1.3195, "step": 1351 }, { "epoch": 0.15148035069045684, "grad_norm": 1.4839930534362793, "learning_rate": 0.00018495515695067264, "loss": 1.3303, "step": 1352 }, { "epoch": 0.15159239236996164, "grad_norm": 1.5150694847106934, "learning_rate": 0.0001849439461883408, "loss": 1.9596, "step": 1353 }, { "epoch": 0.1517044340494664, "grad_norm": 1.5174999237060547, "learning_rate": 0.00018493273542600897, "loss": 1.4099, "step": 1354 }, { "epoch": 0.15181647572897117, "grad_norm": 2.5694656372070312, "learning_rate": 0.00018492152466367714, "loss": 2.2939, "step": 1355 }, { "epoch": 0.15192851740847596, "grad_norm": 1.686886191368103, "learning_rate": 0.0001849103139013453, "loss": 1.13, "step": 1356 }, { "epoch": 0.15204055908798073, "grad_norm": 1.0802971124649048, "learning_rate": 0.00018489910313901348, "loss": 1.3176, "step": 1357 }, { "epoch": 0.1521526007674855, "grad_norm": 1.376334547996521, "learning_rate": 0.00018488789237668162, "loss": 1.6444, "step": 1358 }, { "epoch": 0.15226464244699028, "grad_norm": 1.3267773389816284, "learning_rate": 0.00018487668161434978, "loss": 2.0569, "step": 1359 }, { "epoch": 0.15237668412649505, "grad_norm": 2.4471497535705566, "learning_rate": 0.00018486547085201795, "loss": 1.0809, "step": 1360 }, { "epoch": 0.15248872580599984, "grad_norm": 1.4867417812347412, "learning_rate": 0.00018485426008968612, "loss": 1.1309, "step": 1361 }, { "epoch": 0.1526007674855046, "grad_norm": 1.2444725036621094, "learning_rate": 0.00018484304932735429, "loss": 1.6837, "step": 1362 }, { "epoch": 0.15271280916500937, "grad_norm": 1.140865683555603, "learning_rate": 0.00018483183856502243, "loss": 1.7554, "step": 1363 }, { "epoch": 0.15282485084451417, "grad_norm": 1.9827643632888794, "learning_rate": 0.0001848206278026906, "loss": 1.4444, "step": 1364 }, { "epoch": 0.15293689252401893, "grad_norm": 0.8549766540527344, "learning_rate": 0.00018480941704035873, "loss": 1.3361, "step": 1365 }, { "epoch": 0.15304893420352372, "grad_norm": 1.8640084266662598, "learning_rate": 0.0001847982062780269, "loss": 1.187, "step": 1366 }, { "epoch": 0.1531609758830285, "grad_norm": 1.3695664405822754, "learning_rate": 0.00018478699551569507, "loss": 1.5216, "step": 1367 }, { "epoch": 0.15327301756253325, "grad_norm": 2.663503646850586, "learning_rate": 0.00018477578475336324, "loss": 1.9466, "step": 1368 }, { "epoch": 0.15338505924203805, "grad_norm": 1.1137624979019165, "learning_rate": 0.0001847645739910314, "loss": 2.1096, "step": 1369 }, { "epoch": 0.1534971009215428, "grad_norm": 0.9072723984718323, "learning_rate": 0.00018475336322869957, "loss": 1.2747, "step": 1370 }, { "epoch": 0.15360914260104758, "grad_norm": 2.0092058181762695, "learning_rate": 0.0001847421524663677, "loss": 1.3756, "step": 1371 }, { "epoch": 0.15372118428055237, "grad_norm": 0.950863242149353, "learning_rate": 0.00018473094170403588, "loss": 1.5436, "step": 1372 }, { "epoch": 0.15383322596005714, "grad_norm": 1.5545659065246582, "learning_rate": 0.00018471973094170405, "loss": 1.1965, "step": 1373 }, { "epoch": 0.15394526763956193, "grad_norm": 0.9433760643005371, "learning_rate": 0.0001847085201793722, "loss": 1.3668, "step": 1374 }, { "epoch": 0.1540573093190667, "grad_norm": 1.9437254667282104, "learning_rate": 0.00018469730941704038, "loss": 1.4362, "step": 1375 }, { "epoch": 0.15416935099857146, "grad_norm": 2.1313018798828125, "learning_rate": 0.00018468609865470855, "loss": 1.565, "step": 1376 }, { "epoch": 0.15428139267807625, "grad_norm": 1.0391138792037964, "learning_rate": 0.0001846748878923767, "loss": 1.9403, "step": 1377 }, { "epoch": 0.15439343435758102, "grad_norm": 1.6420506238937378, "learning_rate": 0.00018466367713004486, "loss": 1.9796, "step": 1378 }, { "epoch": 0.1545054760370858, "grad_norm": 1.3277451992034912, "learning_rate": 0.000184652466367713, "loss": 1.7487, "step": 1379 }, { "epoch": 0.15461751771659057, "grad_norm": 1.5393234491348267, "learning_rate": 0.00018464125560538116, "loss": 0.9738, "step": 1380 }, { "epoch": 0.15472955939609534, "grad_norm": 1.9640486240386963, "learning_rate": 0.00018463004484304933, "loss": 1.8665, "step": 1381 }, { "epoch": 0.15484160107560013, "grad_norm": 0.9347060918807983, "learning_rate": 0.0001846188340807175, "loss": 1.4286, "step": 1382 }, { "epoch": 0.1549536427551049, "grad_norm": 1.0720990896224976, "learning_rate": 0.00018460762331838567, "loss": 1.2162, "step": 1383 }, { "epoch": 0.15506568443460966, "grad_norm": 0.7623341083526611, "learning_rate": 0.00018459641255605383, "loss": 1.5644, "step": 1384 }, { "epoch": 0.15517772611411446, "grad_norm": 1.1315181255340576, "learning_rate": 0.00018458520179372197, "loss": 1.75, "step": 1385 }, { "epoch": 0.15528976779361922, "grad_norm": 1.322068452835083, "learning_rate": 0.00018457399103139014, "loss": 1.7132, "step": 1386 }, { "epoch": 0.15540180947312401, "grad_norm": 1.8373924493789673, "learning_rate": 0.0001845627802690583, "loss": 1.5866, "step": 1387 }, { "epoch": 0.15551385115262878, "grad_norm": 1.1959131956100464, "learning_rate": 0.00018455156950672647, "loss": 1.1538, "step": 1388 }, { "epoch": 0.15562589283213354, "grad_norm": 2.216815948486328, "learning_rate": 0.00018454035874439464, "loss": 2.2261, "step": 1389 }, { "epoch": 0.15573793451163834, "grad_norm": 1.3550820350646973, "learning_rate": 0.0001845291479820628, "loss": 1.4775, "step": 1390 }, { "epoch": 0.1558499761911431, "grad_norm": 0.9304806590080261, "learning_rate": 0.00018451793721973095, "loss": 1.5482, "step": 1391 }, { "epoch": 0.15596201787064787, "grad_norm": 2.1508865356445312, "learning_rate": 0.00018450672645739912, "loss": 1.2856, "step": 1392 }, { "epoch": 0.15607405955015266, "grad_norm": 1.9539718627929688, "learning_rate": 0.00018449551569506726, "loss": 1.1117, "step": 1393 }, { "epoch": 0.15618610122965743, "grad_norm": 1.9992141723632812, "learning_rate": 0.00018448430493273542, "loss": 1.8601, "step": 1394 }, { "epoch": 0.15629814290916222, "grad_norm": 1.889536738395691, "learning_rate": 0.0001844730941704036, "loss": 1.8433, "step": 1395 }, { "epoch": 0.15641018458866698, "grad_norm": 1.2983920574188232, "learning_rate": 0.00018446188340807176, "loss": 1.7558, "step": 1396 }, { "epoch": 0.15652222626817175, "grad_norm": 0.9525499939918518, "learning_rate": 0.00018445067264573993, "loss": 1.6011, "step": 1397 }, { "epoch": 0.15663426794767654, "grad_norm": 2.2679953575134277, "learning_rate": 0.0001844394618834081, "loss": 1.6444, "step": 1398 }, { "epoch": 0.1567463096271813, "grad_norm": 1.9386553764343262, "learning_rate": 0.00018442825112107623, "loss": 1.3071, "step": 1399 }, { "epoch": 0.1568583513066861, "grad_norm": 1.0146160125732422, "learning_rate": 0.0001844170403587444, "loss": 1.6554, "step": 1400 }, { "epoch": 0.15697039298619087, "grad_norm": 1.1025561094284058, "learning_rate": 0.00018440582959641257, "loss": 0.9078, "step": 1401 }, { "epoch": 0.15708243466569563, "grad_norm": 2.511662006378174, "learning_rate": 0.00018439461883408074, "loss": 1.186, "step": 1402 }, { "epoch": 0.15719447634520042, "grad_norm": 1.2678337097167969, "learning_rate": 0.0001843834080717489, "loss": 1.1977, "step": 1403 }, { "epoch": 0.1573065180247052, "grad_norm": 1.3386790752410889, "learning_rate": 0.00018437219730941704, "loss": 1.9369, "step": 1404 }, { "epoch": 0.15741855970420995, "grad_norm": 1.8746459484100342, "learning_rate": 0.0001843609865470852, "loss": 1.8696, "step": 1405 }, { "epoch": 0.15753060138371475, "grad_norm": 0.8595908880233765, "learning_rate": 0.00018434977578475335, "loss": 1.3576, "step": 1406 }, { "epoch": 0.1576426430632195, "grad_norm": 1.6027456521987915, "learning_rate": 0.00018433856502242152, "loss": 1.5661, "step": 1407 }, { "epoch": 0.1577546847427243, "grad_norm": 1.2260022163391113, "learning_rate": 0.0001843273542600897, "loss": 2.1163, "step": 1408 }, { "epoch": 0.15786672642222907, "grad_norm": 1.291271686553955, "learning_rate": 0.00018431614349775785, "loss": 2.2524, "step": 1409 }, { "epoch": 0.15797876810173384, "grad_norm": 1.607008695602417, "learning_rate": 0.00018430493273542602, "loss": 1.7731, "step": 1410 }, { "epoch": 0.15809080978123863, "grad_norm": 1.2611327171325684, "learning_rate": 0.0001842937219730942, "loss": 1.9225, "step": 1411 }, { "epoch": 0.1582028514607434, "grad_norm": 1.4136589765548706, "learning_rate": 0.00018428251121076236, "loss": 1.8418, "step": 1412 }, { "epoch": 0.15831489314024816, "grad_norm": 1.1042768955230713, "learning_rate": 0.0001842713004484305, "loss": 1.2244, "step": 1413 }, { "epoch": 0.15842693481975295, "grad_norm": 0.9072338938713074, "learning_rate": 0.00018426008968609866, "loss": 1.5496, "step": 1414 }, { "epoch": 0.15853897649925772, "grad_norm": 1.4273467063903809, "learning_rate": 0.00018424887892376683, "loss": 0.851, "step": 1415 }, { "epoch": 0.1586510181787625, "grad_norm": 1.2392058372497559, "learning_rate": 0.000184237668161435, "loss": 1.9027, "step": 1416 }, { "epoch": 0.15876305985826727, "grad_norm": 1.5387285947799683, "learning_rate": 0.00018422645739910317, "loss": 1.1788, "step": 1417 }, { "epoch": 0.15887510153777204, "grad_norm": 1.1787759065628052, "learning_rate": 0.0001842152466367713, "loss": 1.0696, "step": 1418 }, { "epoch": 0.15898714321727683, "grad_norm": 1.536649465560913, "learning_rate": 0.00018420403587443947, "loss": 1.263, "step": 1419 }, { "epoch": 0.1590991848967816, "grad_norm": 1.6520637273788452, "learning_rate": 0.00018419282511210761, "loss": 1.4892, "step": 1420 }, { "epoch": 0.1592112265762864, "grad_norm": 1.510938286781311, "learning_rate": 0.00018418161434977578, "loss": 0.8566, "step": 1421 }, { "epoch": 0.15932326825579116, "grad_norm": 2.2676897048950195, "learning_rate": 0.00018417040358744395, "loss": 1.7034, "step": 1422 }, { "epoch": 0.15943530993529592, "grad_norm": 1.3120766878128052, "learning_rate": 0.00018415919282511212, "loss": 1.1621, "step": 1423 }, { "epoch": 0.15954735161480071, "grad_norm": 0.9869388341903687, "learning_rate": 0.00018414798206278028, "loss": 1.3609, "step": 1424 }, { "epoch": 0.15965939329430548, "grad_norm": 1.606332778930664, "learning_rate": 0.00018413677130044845, "loss": 1.8535, "step": 1425 }, { "epoch": 0.15977143497381024, "grad_norm": 1.840029001235962, "learning_rate": 0.0001841255605381166, "loss": 1.1076, "step": 1426 }, { "epoch": 0.15988347665331504, "grad_norm": 1.913089394569397, "learning_rate": 0.00018411434977578476, "loss": 1.7127, "step": 1427 }, { "epoch": 0.1599955183328198, "grad_norm": 1.5036598443984985, "learning_rate": 0.00018410313901345293, "loss": 0.7786, "step": 1428 }, { "epoch": 0.1601075600123246, "grad_norm": 0.9580232501029968, "learning_rate": 0.0001840919282511211, "loss": 1.7034, "step": 1429 }, { "epoch": 0.16021960169182936, "grad_norm": 0.9741318225860596, "learning_rate": 0.00018408071748878926, "loss": 1.3153, "step": 1430 }, { "epoch": 0.16033164337133413, "grad_norm": 1.5546228885650635, "learning_rate": 0.0001840695067264574, "loss": 1.0278, "step": 1431 }, { "epoch": 0.16044368505083892, "grad_norm": 1.2572184801101685, "learning_rate": 0.00018405829596412557, "loss": 1.35, "step": 1432 }, { "epoch": 0.16055572673034368, "grad_norm": 1.5313972234725952, "learning_rate": 0.00018404708520179374, "loss": 1.6348, "step": 1433 }, { "epoch": 0.16066776840984848, "grad_norm": 1.4443055391311646, "learning_rate": 0.00018403587443946188, "loss": 0.998, "step": 1434 }, { "epoch": 0.16077981008935324, "grad_norm": 1.4208911657333374, "learning_rate": 0.00018402466367713004, "loss": 1.2684, "step": 1435 }, { "epoch": 0.160891851768858, "grad_norm": 1.1181085109710693, "learning_rate": 0.0001840134529147982, "loss": 0.8208, "step": 1436 }, { "epoch": 0.1610038934483628, "grad_norm": 2.1076841354370117, "learning_rate": 0.00018400224215246638, "loss": 1.4313, "step": 1437 }, { "epoch": 0.16111593512786757, "grad_norm": 2.1902852058410645, "learning_rate": 0.00018399103139013454, "loss": 1.6097, "step": 1438 }, { "epoch": 0.16122797680737233, "grad_norm": 1.5900945663452148, "learning_rate": 0.0001839798206278027, "loss": 1.6297, "step": 1439 }, { "epoch": 0.16134001848687712, "grad_norm": 2.014253854751587, "learning_rate": 0.00018396860986547085, "loss": 1.1901, "step": 1440 }, { "epoch": 0.1614520601663819, "grad_norm": 1.6648170948028564, "learning_rate": 0.00018395739910313902, "loss": 1.171, "step": 1441 }, { "epoch": 0.16156410184588668, "grad_norm": 1.158216118812561, "learning_rate": 0.0001839461883408072, "loss": 1.0261, "step": 1442 }, { "epoch": 0.16167614352539145, "grad_norm": 1.3818362951278687, "learning_rate": 0.00018393497757847535, "loss": 1.8386, "step": 1443 }, { "epoch": 0.1617881852048962, "grad_norm": 1.2692238092422485, "learning_rate": 0.00018392376681614352, "loss": 1.1579, "step": 1444 }, { "epoch": 0.161900226884401, "grad_norm": 1.5591059923171997, "learning_rate": 0.00018391255605381166, "loss": 1.2936, "step": 1445 }, { "epoch": 0.16201226856390577, "grad_norm": 1.946919560432434, "learning_rate": 0.00018390134529147983, "loss": 1.1645, "step": 1446 }, { "epoch": 0.16212431024341054, "grad_norm": 2.1683173179626465, "learning_rate": 0.000183890134529148, "loss": 1.2994, "step": 1447 }, { "epoch": 0.16223635192291533, "grad_norm": 0.9816479086875916, "learning_rate": 0.00018387892376681614, "loss": 1.79, "step": 1448 }, { "epoch": 0.1623483936024201, "grad_norm": 1.6052570343017578, "learning_rate": 0.0001838677130044843, "loss": 1.6427, "step": 1449 }, { "epoch": 0.1624604352819249, "grad_norm": 1.597557783126831, "learning_rate": 0.00018385650224215247, "loss": 1.0748, "step": 1450 }, { "epoch": 0.16257247696142965, "grad_norm": 1.769805908203125, "learning_rate": 0.00018384529147982064, "loss": 1.2317, "step": 1451 }, { "epoch": 0.16268451864093442, "grad_norm": 2.013019561767578, "learning_rate": 0.0001838340807174888, "loss": 1.6738, "step": 1452 }, { "epoch": 0.1627965603204392, "grad_norm": 1.697911024093628, "learning_rate": 0.00018382286995515697, "loss": 1.7583, "step": 1453 }, { "epoch": 0.16290860199994397, "grad_norm": 1.333603858947754, "learning_rate": 0.00018381165919282511, "loss": 1.4985, "step": 1454 }, { "epoch": 0.16302064367944877, "grad_norm": 1.555256724357605, "learning_rate": 0.00018380044843049328, "loss": 2.1545, "step": 1455 }, { "epoch": 0.16313268535895353, "grad_norm": 1.9548388719558716, "learning_rate": 0.00018378923766816145, "loss": 1.0992, "step": 1456 }, { "epoch": 0.1632447270384583, "grad_norm": 1.2780531644821167, "learning_rate": 0.00018377802690582962, "loss": 0.8367, "step": 1457 }, { "epoch": 0.1633567687179631, "grad_norm": 1.5377811193466187, "learning_rate": 0.00018376681614349778, "loss": 0.9834, "step": 1458 }, { "epoch": 0.16346881039746786, "grad_norm": 1.8027440309524536, "learning_rate": 0.00018375560538116592, "loss": 1.1148, "step": 1459 }, { "epoch": 0.16358085207697262, "grad_norm": 1.723570466041565, "learning_rate": 0.0001837443946188341, "loss": 1.3781, "step": 1460 }, { "epoch": 0.16369289375647741, "grad_norm": 1.0718812942504883, "learning_rate": 0.00018373318385650223, "loss": 1.5542, "step": 1461 }, { "epoch": 0.16380493543598218, "grad_norm": 1.5260387659072876, "learning_rate": 0.0001837219730941704, "loss": 1.2519, "step": 1462 }, { "epoch": 0.16391697711548697, "grad_norm": 2.214653730392456, "learning_rate": 0.00018371076233183857, "loss": 1.7441, "step": 1463 }, { "epoch": 0.16402901879499174, "grad_norm": 1.3564742803573608, "learning_rate": 0.00018369955156950673, "loss": 2.1588, "step": 1464 }, { "epoch": 0.1641410604744965, "grad_norm": 1.2701201438903809, "learning_rate": 0.0001836883408071749, "loss": 1.2702, "step": 1465 }, { "epoch": 0.1642531021540013, "grad_norm": 1.4644241333007812, "learning_rate": 0.00018367713004484307, "loss": 1.6326, "step": 1466 }, { "epoch": 0.16436514383350606, "grad_norm": 1.79185152053833, "learning_rate": 0.00018366591928251124, "loss": 1.1794, "step": 1467 }, { "epoch": 0.16447718551301083, "grad_norm": 1.3113638162612915, "learning_rate": 0.00018365470852017938, "loss": 1.5112, "step": 1468 }, { "epoch": 0.16458922719251562, "grad_norm": 1.191855549812317, "learning_rate": 0.00018364349775784754, "loss": 1.3337, "step": 1469 }, { "epoch": 0.16470126887202038, "grad_norm": 1.386404275894165, "learning_rate": 0.0001836322869955157, "loss": 1.8786, "step": 1470 }, { "epoch": 0.16481331055152518, "grad_norm": 0.7273179292678833, "learning_rate": 0.00018362107623318388, "loss": 1.8865, "step": 1471 }, { "epoch": 0.16492535223102994, "grad_norm": 1.412330985069275, "learning_rate": 0.00018360986547085202, "loss": 1.2747, "step": 1472 }, { "epoch": 0.1650373939105347, "grad_norm": 1.638340950012207, "learning_rate": 0.00018359865470852019, "loss": 1.3811, "step": 1473 }, { "epoch": 0.1651494355900395, "grad_norm": 1.32552969455719, "learning_rate": 0.00018358744394618835, "loss": 1.5436, "step": 1474 }, { "epoch": 0.16526147726954427, "grad_norm": 1.9235819578170776, "learning_rate": 0.0001835762331838565, "loss": 1.2643, "step": 1475 }, { "epoch": 0.16537351894904906, "grad_norm": 1.329628586769104, "learning_rate": 0.00018356502242152466, "loss": 1.4444, "step": 1476 }, { "epoch": 0.16548556062855382, "grad_norm": 1.8046598434448242, "learning_rate": 0.00018355381165919283, "loss": 1.6267, "step": 1477 }, { "epoch": 0.1655976023080586, "grad_norm": 1.0765407085418701, "learning_rate": 0.000183542600896861, "loss": 1.9944, "step": 1478 }, { "epoch": 0.16570964398756338, "grad_norm": 1.5871212482452393, "learning_rate": 0.00018353139013452916, "loss": 1.8459, "step": 1479 }, { "epoch": 0.16582168566706815, "grad_norm": 1.2659369707107544, "learning_rate": 0.00018352017937219733, "loss": 1.713, "step": 1480 }, { "epoch": 0.1659337273465729, "grad_norm": 1.301551342010498, "learning_rate": 0.00018350896860986547, "loss": 1.5346, "step": 1481 }, { "epoch": 0.1660457690260777, "grad_norm": 1.5892798900604248, "learning_rate": 0.00018349775784753364, "loss": 1.5491, "step": 1482 }, { "epoch": 0.16615781070558247, "grad_norm": 1.656078815460205, "learning_rate": 0.0001834865470852018, "loss": 2.1991, "step": 1483 }, { "epoch": 0.16626985238508726, "grad_norm": 1.6886074542999268, "learning_rate": 0.00018347533632286997, "loss": 1.6294, "step": 1484 }, { "epoch": 0.16638189406459203, "grad_norm": 1.7337063550949097, "learning_rate": 0.00018346412556053814, "loss": 1.5529, "step": 1485 }, { "epoch": 0.1664939357440968, "grad_norm": 1.8382784128189087, "learning_rate": 0.00018345291479820628, "loss": 1.2872, "step": 1486 }, { "epoch": 0.1666059774236016, "grad_norm": 1.1795817613601685, "learning_rate": 0.00018344170403587445, "loss": 1.5482, "step": 1487 }, { "epoch": 0.16671801910310635, "grad_norm": 1.0276936292648315, "learning_rate": 0.00018343049327354261, "loss": 1.245, "step": 1488 }, { "epoch": 0.16683006078261114, "grad_norm": 1.8286960124969482, "learning_rate": 0.00018341928251121076, "loss": 1.7802, "step": 1489 }, { "epoch": 0.1669421024621159, "grad_norm": 1.175987958908081, "learning_rate": 0.00018340807174887892, "loss": 0.8232, "step": 1490 }, { "epoch": 0.16705414414162068, "grad_norm": 0.9822572469711304, "learning_rate": 0.0001833968609865471, "loss": 1.3604, "step": 1491 }, { "epoch": 0.16716618582112547, "grad_norm": 1.4658969640731812, "learning_rate": 0.00018338565022421526, "loss": 1.1335, "step": 1492 }, { "epoch": 0.16727822750063023, "grad_norm": 1.0906667709350586, "learning_rate": 0.00018337443946188342, "loss": 1.1897, "step": 1493 }, { "epoch": 0.167390269180135, "grad_norm": 2.8399572372436523, "learning_rate": 0.0001833632286995516, "loss": 1.3907, "step": 1494 }, { "epoch": 0.1675023108596398, "grad_norm": 1.0496121644973755, "learning_rate": 0.00018335201793721973, "loss": 1.2802, "step": 1495 }, { "epoch": 0.16761435253914456, "grad_norm": 1.1912118196487427, "learning_rate": 0.0001833408071748879, "loss": 1.7033, "step": 1496 }, { "epoch": 0.16772639421864935, "grad_norm": 1.9492340087890625, "learning_rate": 0.00018332959641255607, "loss": 1.0758, "step": 1497 }, { "epoch": 0.16783843589815411, "grad_norm": 0.6578432321548462, "learning_rate": 0.00018331838565022423, "loss": 1.4673, "step": 1498 }, { "epoch": 0.16795047757765888, "grad_norm": 1.4698289632797241, "learning_rate": 0.00018330717488789237, "loss": 1.3172, "step": 1499 }, { "epoch": 0.16806251925716367, "grad_norm": 1.413185954093933, "learning_rate": 0.00018329596412556054, "loss": 1.797, "step": 1500 }, { "epoch": 0.16817456093666844, "grad_norm": 0.990799605846405, "learning_rate": 0.0001832847533632287, "loss": 1.3196, "step": 1501 }, { "epoch": 0.1682866026161732, "grad_norm": 2.21759295463562, "learning_rate": 0.00018327354260089688, "loss": 0.926, "step": 1502 }, { "epoch": 0.168398644295678, "grad_norm": 1.852136492729187, "learning_rate": 0.00018326233183856502, "loss": 1.7952, "step": 1503 }, { "epoch": 0.16851068597518276, "grad_norm": 1.3354382514953613, "learning_rate": 0.00018325112107623318, "loss": 1.8431, "step": 1504 }, { "epoch": 0.16862272765468755, "grad_norm": 1.0527989864349365, "learning_rate": 0.00018323991031390135, "loss": 1.6152, "step": 1505 }, { "epoch": 0.16873476933419232, "grad_norm": 2.338576078414917, "learning_rate": 0.00018322869955156952, "loss": 1.5695, "step": 1506 }, { "epoch": 0.16884681101369708, "grad_norm": 1.5897364616394043, "learning_rate": 0.00018321748878923769, "loss": 1.2247, "step": 1507 }, { "epoch": 0.16895885269320188, "grad_norm": 1.138485074043274, "learning_rate": 0.00018320627802690585, "loss": 1.671, "step": 1508 }, { "epoch": 0.16907089437270664, "grad_norm": 1.3903974294662476, "learning_rate": 0.000183195067264574, "loss": 1.4279, "step": 1509 }, { "epoch": 0.16918293605221144, "grad_norm": 1.3690763711929321, "learning_rate": 0.00018318385650224216, "loss": 1.7438, "step": 1510 }, { "epoch": 0.1692949777317162, "grad_norm": 1.7609137296676636, "learning_rate": 0.00018317264573991033, "loss": 1.3592, "step": 1511 }, { "epoch": 0.16940701941122097, "grad_norm": 1.5816168785095215, "learning_rate": 0.0001831614349775785, "loss": 1.5541, "step": 1512 }, { "epoch": 0.16951906109072576, "grad_norm": 1.9555151462554932, "learning_rate": 0.00018315022421524664, "loss": 1.973, "step": 1513 }, { "epoch": 0.16963110277023052, "grad_norm": 1.4801336526870728, "learning_rate": 0.0001831390134529148, "loss": 1.1437, "step": 1514 }, { "epoch": 0.1697431444497353, "grad_norm": 1.518924355506897, "learning_rate": 0.00018312780269058297, "loss": 1.7002, "step": 1515 }, { "epoch": 0.16985518612924008, "grad_norm": 1.7891069650650024, "learning_rate": 0.0001831165919282511, "loss": 0.9963, "step": 1516 }, { "epoch": 0.16996722780874485, "grad_norm": 1.5574733018875122, "learning_rate": 0.00018310538116591928, "loss": 1.3554, "step": 1517 }, { "epoch": 0.17007926948824964, "grad_norm": 1.9836798906326294, "learning_rate": 0.00018309417040358745, "loss": 1.4913, "step": 1518 }, { "epoch": 0.1701913111677544, "grad_norm": 2.035609483718872, "learning_rate": 0.0001830829596412556, "loss": 1.4399, "step": 1519 }, { "epoch": 0.17030335284725917, "grad_norm": 1.0411056280136108, "learning_rate": 0.00018307174887892378, "loss": 1.8523, "step": 1520 }, { "epoch": 0.17041539452676396, "grad_norm": 1.8169398307800293, "learning_rate": 0.00018306053811659195, "loss": 1.4211, "step": 1521 }, { "epoch": 0.17052743620626873, "grad_norm": 1.9573005437850952, "learning_rate": 0.00018304932735426012, "loss": 1.8646, "step": 1522 }, { "epoch": 0.1706394778857735, "grad_norm": 1.2641253471374512, "learning_rate": 0.00018303811659192826, "loss": 1.3229, "step": 1523 }, { "epoch": 0.1707515195652783, "grad_norm": 1.3319339752197266, "learning_rate": 0.00018302690582959642, "loss": 0.9662, "step": 1524 }, { "epoch": 0.17086356124478305, "grad_norm": 1.2101835012435913, "learning_rate": 0.0001830156950672646, "loss": 1.1831, "step": 1525 }, { "epoch": 0.17097560292428785, "grad_norm": 1.9852532148361206, "learning_rate": 0.00018300448430493276, "loss": 1.978, "step": 1526 }, { "epoch": 0.1710876446037926, "grad_norm": 3.0406219959259033, "learning_rate": 0.0001829932735426009, "loss": 1.8338, "step": 1527 }, { "epoch": 0.17119968628329738, "grad_norm": 1.7813526391983032, "learning_rate": 0.00018298206278026907, "loss": 1.1026, "step": 1528 }, { "epoch": 0.17131172796280217, "grad_norm": 1.2701486349105835, "learning_rate": 0.00018297085201793723, "loss": 1.4178, "step": 1529 }, { "epoch": 0.17142376964230693, "grad_norm": 1.9425747394561768, "learning_rate": 0.00018295964125560537, "loss": 1.8902, "step": 1530 }, { "epoch": 0.17153581132181173, "grad_norm": 2.2324118614196777, "learning_rate": 0.00018294843049327354, "loss": 1.1758, "step": 1531 }, { "epoch": 0.1716478530013165, "grad_norm": 1.6583709716796875, "learning_rate": 0.0001829372197309417, "loss": 0.877, "step": 1532 }, { "epoch": 0.17175989468082126, "grad_norm": 1.5388211011886597, "learning_rate": 0.00018292600896860988, "loss": 1.7251, "step": 1533 }, { "epoch": 0.17187193636032605, "grad_norm": 1.3635694980621338, "learning_rate": 0.00018291479820627804, "loss": 1.4213, "step": 1534 }, { "epoch": 0.17198397803983081, "grad_norm": 1.4790900945663452, "learning_rate": 0.0001829035874439462, "loss": 1.4484, "step": 1535 }, { "epoch": 0.17209601971933558, "grad_norm": 1.4879271984100342, "learning_rate": 0.00018289237668161438, "loss": 1.0486, "step": 1536 }, { "epoch": 0.17220806139884037, "grad_norm": 1.4301261901855469, "learning_rate": 0.00018288116591928252, "loss": 1.0166, "step": 1537 }, { "epoch": 0.17232010307834514, "grad_norm": 1.691550374031067, "learning_rate": 0.00018286995515695068, "loss": 1.2269, "step": 1538 }, { "epoch": 0.17243214475784993, "grad_norm": 1.8763976097106934, "learning_rate": 0.00018285874439461885, "loss": 0.7538, "step": 1539 }, { "epoch": 0.1725441864373547, "grad_norm": 2.0933451652526855, "learning_rate": 0.000182847533632287, "loss": 1.7903, "step": 1540 }, { "epoch": 0.17265622811685946, "grad_norm": 1.0984221696853638, "learning_rate": 0.00018283632286995516, "loss": 1.2386, "step": 1541 }, { "epoch": 0.17276826979636425, "grad_norm": 1.1371593475341797, "learning_rate": 0.00018282511210762333, "loss": 1.1229, "step": 1542 }, { "epoch": 0.17288031147586902, "grad_norm": 1.7911226749420166, "learning_rate": 0.0001828139013452915, "loss": 1.2688, "step": 1543 }, { "epoch": 0.1729923531553738, "grad_norm": 1.5790252685546875, "learning_rate": 0.00018280269058295963, "loss": 1.2384, "step": 1544 }, { "epoch": 0.17310439483487858, "grad_norm": 1.9888460636138916, "learning_rate": 0.0001827914798206278, "loss": 0.9881, "step": 1545 }, { "epoch": 0.17321643651438334, "grad_norm": 1.2398663759231567, "learning_rate": 0.00018278026905829597, "loss": 1.7116, "step": 1546 }, { "epoch": 0.17332847819388814, "grad_norm": 1.2173709869384766, "learning_rate": 0.00018276905829596414, "loss": 1.7244, "step": 1547 }, { "epoch": 0.1734405198733929, "grad_norm": 3.534224033355713, "learning_rate": 0.0001827578475336323, "loss": 1.4289, "step": 1548 }, { "epoch": 0.17355256155289767, "grad_norm": 1.9029350280761719, "learning_rate": 0.00018274663677130047, "loss": 1.2923, "step": 1549 }, { "epoch": 0.17366460323240246, "grad_norm": 1.4388689994812012, "learning_rate": 0.0001827354260089686, "loss": 1.5869, "step": 1550 }, { "epoch": 0.17377664491190722, "grad_norm": 1.7717909812927246, "learning_rate": 0.00018272421524663678, "loss": 1.9691, "step": 1551 }, { "epoch": 0.17388868659141202, "grad_norm": 1.9781231880187988, "learning_rate": 0.00018271300448430495, "loss": 1.8639, "step": 1552 }, { "epoch": 0.17400072827091678, "grad_norm": 1.4955172538757324, "learning_rate": 0.00018270179372197311, "loss": 1.3824, "step": 1553 }, { "epoch": 0.17411276995042155, "grad_norm": 1.0433539152145386, "learning_rate": 0.00018269058295964125, "loss": 1.6625, "step": 1554 }, { "epoch": 0.17422481162992634, "grad_norm": 2.138007640838623, "learning_rate": 0.00018267937219730942, "loss": 2.0962, "step": 1555 }, { "epoch": 0.1743368533094311, "grad_norm": 2.663966655731201, "learning_rate": 0.0001826681614349776, "loss": 1.4288, "step": 1556 }, { "epoch": 0.17444889498893587, "grad_norm": 1.701526165008545, "learning_rate": 0.00018265695067264576, "loss": 1.8716, "step": 1557 }, { "epoch": 0.17456093666844066, "grad_norm": 2.2684929370880127, "learning_rate": 0.0001826457399103139, "loss": 1.8743, "step": 1558 }, { "epoch": 0.17467297834794543, "grad_norm": 1.502577304840088, "learning_rate": 0.00018263452914798206, "loss": 1.5237, "step": 1559 }, { "epoch": 0.17478502002745022, "grad_norm": 1.064112663269043, "learning_rate": 0.00018262331838565023, "loss": 2.2038, "step": 1560 }, { "epoch": 0.174897061706955, "grad_norm": 1.8059871196746826, "learning_rate": 0.0001826121076233184, "loss": 1.8928, "step": 1561 }, { "epoch": 0.17500910338645975, "grad_norm": 1.8564492464065552, "learning_rate": 0.00018260089686098657, "loss": 1.5628, "step": 1562 }, { "epoch": 0.17512114506596455, "grad_norm": 1.3561211824417114, "learning_rate": 0.00018258968609865473, "loss": 1.705, "step": 1563 }, { "epoch": 0.1752331867454693, "grad_norm": 3.4120335578918457, "learning_rate": 0.00018257847533632287, "loss": 2.173, "step": 1564 }, { "epoch": 0.1753452284249741, "grad_norm": 2.6757538318634033, "learning_rate": 0.00018256726457399104, "loss": 2.1104, "step": 1565 }, { "epoch": 0.17545727010447887, "grad_norm": 1.932557225227356, "learning_rate": 0.0001825560538116592, "loss": 1.6985, "step": 1566 }, { "epoch": 0.17556931178398363, "grad_norm": 1.7716313600540161, "learning_rate": 0.00018254484304932735, "loss": 1.4791, "step": 1567 }, { "epoch": 0.17568135346348843, "grad_norm": 1.7375937700271606, "learning_rate": 0.00018253363228699552, "loss": 1.4271, "step": 1568 }, { "epoch": 0.1757933951429932, "grad_norm": 5.283396244049072, "learning_rate": 0.00018252242152466368, "loss": 1.8474, "step": 1569 }, { "epoch": 0.17590543682249796, "grad_norm": 3.0716800689697266, "learning_rate": 0.00018251121076233185, "loss": 1.5496, "step": 1570 }, { "epoch": 0.17601747850200275, "grad_norm": 2.1374895572662354, "learning_rate": 0.0001825, "loss": 0.9408, "step": 1571 }, { "epoch": 0.17612952018150752, "grad_norm": 1.403450608253479, "learning_rate": 0.00018248878923766816, "loss": 1.6116, "step": 1572 }, { "epoch": 0.1762415618610123, "grad_norm": 2.621577262878418, "learning_rate": 0.00018247757847533633, "loss": 1.5746, "step": 1573 }, { "epoch": 0.17635360354051707, "grad_norm": 1.5588138103485107, "learning_rate": 0.0001824663677130045, "loss": 1.3847, "step": 1574 }, { "epoch": 0.17646564522002184, "grad_norm": 1.0408475399017334, "learning_rate": 0.00018245515695067266, "loss": 1.2945, "step": 1575 }, { "epoch": 0.17657768689952663, "grad_norm": 1.6548418998718262, "learning_rate": 0.00018244394618834083, "loss": 0.9927, "step": 1576 }, { "epoch": 0.1766897285790314, "grad_norm": 1.0105589628219604, "learning_rate": 0.000182432735426009, "loss": 1.2347, "step": 1577 }, { "epoch": 0.17680177025853616, "grad_norm": 1.470354437828064, "learning_rate": 0.00018242152466367714, "loss": 1.2246, "step": 1578 }, { "epoch": 0.17691381193804095, "grad_norm": 1.1530768871307373, "learning_rate": 0.0001824103139013453, "loss": 1.6822, "step": 1579 }, { "epoch": 0.17702585361754572, "grad_norm": 1.285150408744812, "learning_rate": 0.00018239910313901347, "loss": 1.5692, "step": 1580 }, { "epoch": 0.1771378952970505, "grad_norm": 1.7636886835098267, "learning_rate": 0.0001823878923766816, "loss": 1.4156, "step": 1581 }, { "epoch": 0.17724993697655528, "grad_norm": 1.5233204364776611, "learning_rate": 0.00018237668161434978, "loss": 1.9241, "step": 1582 }, { "epoch": 0.17736197865606004, "grad_norm": 2.109220027923584, "learning_rate": 0.00018236547085201794, "loss": 1.3381, "step": 1583 }, { "epoch": 0.17747402033556484, "grad_norm": 1.5319582223892212, "learning_rate": 0.0001823542600896861, "loss": 1.2068, "step": 1584 }, { "epoch": 0.1775860620150696, "grad_norm": 1.0390812158584595, "learning_rate": 0.00018234304932735425, "loss": 1.3588, "step": 1585 }, { "epoch": 0.1776981036945744, "grad_norm": 1.6747058629989624, "learning_rate": 0.00018233183856502242, "loss": 1.5858, "step": 1586 }, { "epoch": 0.17781014537407916, "grad_norm": 3.1873302459716797, "learning_rate": 0.0001823206278026906, "loss": 0.935, "step": 1587 }, { "epoch": 0.17792218705358392, "grad_norm": 1.5428591966629028, "learning_rate": 0.00018230941704035875, "loss": 1.6785, "step": 1588 }, { "epoch": 0.17803422873308872, "grad_norm": 1.2542240619659424, "learning_rate": 0.00018229820627802692, "loss": 1.4941, "step": 1589 }, { "epoch": 0.17814627041259348, "grad_norm": 1.6616257429122925, "learning_rate": 0.0001822869955156951, "loss": 1.421, "step": 1590 }, { "epoch": 0.17825831209209825, "grad_norm": 2.552248477935791, "learning_rate": 0.00018227578475336326, "loss": 1.6115, "step": 1591 }, { "epoch": 0.17837035377160304, "grad_norm": 0.9247952103614807, "learning_rate": 0.0001822645739910314, "loss": 1.1591, "step": 1592 }, { "epoch": 0.1784823954511078, "grad_norm": 2.3127493858337402, "learning_rate": 0.00018225336322869956, "loss": 1.9275, "step": 1593 }, { "epoch": 0.1785944371306126, "grad_norm": 1.5678995847702026, "learning_rate": 0.00018224215246636773, "loss": 1.2642, "step": 1594 }, { "epoch": 0.17870647881011736, "grad_norm": 1.18511164188385, "learning_rate": 0.00018223094170403587, "loss": 1.5681, "step": 1595 }, { "epoch": 0.17881852048962213, "grad_norm": 1.8435550928115845, "learning_rate": 0.00018221973094170404, "loss": 1.559, "step": 1596 }, { "epoch": 0.17893056216912692, "grad_norm": 1.7712337970733643, "learning_rate": 0.0001822085201793722, "loss": 1.7549, "step": 1597 }, { "epoch": 0.1790426038486317, "grad_norm": 1.7126870155334473, "learning_rate": 0.00018219730941704037, "loss": 1.7879, "step": 1598 }, { "epoch": 0.17915464552813648, "grad_norm": 2.228342056274414, "learning_rate": 0.00018218609865470851, "loss": 1.9279, "step": 1599 }, { "epoch": 0.17926668720764125, "grad_norm": 1.6070306301116943, "learning_rate": 0.00018217488789237668, "loss": 1.2672, "step": 1600 }, { "epoch": 0.179378728887146, "grad_norm": 1.8585717678070068, "learning_rate": 0.00018216367713004485, "loss": 1.946, "step": 1601 }, { "epoch": 0.1794907705666508, "grad_norm": 1.5246399641036987, "learning_rate": 0.00018215246636771302, "loss": 1.3227, "step": 1602 }, { "epoch": 0.17960281224615557, "grad_norm": 1.3526431322097778, "learning_rate": 0.00018214125560538118, "loss": 1.6056, "step": 1603 }, { "epoch": 0.17971485392566033, "grad_norm": 2.115983009338379, "learning_rate": 0.00018213004484304935, "loss": 1.9229, "step": 1604 }, { "epoch": 0.17982689560516513, "grad_norm": 1.981210708618164, "learning_rate": 0.0001821188340807175, "loss": 1.4987, "step": 1605 }, { "epoch": 0.1799389372846699, "grad_norm": 3.446843385696411, "learning_rate": 0.00018210762331838566, "loss": 1.334, "step": 1606 }, { "epoch": 0.18005097896417468, "grad_norm": 1.1886974573135376, "learning_rate": 0.00018209641255605383, "loss": 2.0519, "step": 1607 }, { "epoch": 0.18016302064367945, "grad_norm": 1.4691685438156128, "learning_rate": 0.00018208520179372197, "loss": 1.5075, "step": 1608 }, { "epoch": 0.18027506232318422, "grad_norm": 2.210169553756714, "learning_rate": 0.00018207399103139013, "loss": 1.6717, "step": 1609 }, { "epoch": 0.180387104002689, "grad_norm": 1.7390984296798706, "learning_rate": 0.0001820627802690583, "loss": 1.6262, "step": 1610 }, { "epoch": 0.18049914568219377, "grad_norm": 1.7382349967956543, "learning_rate": 0.00018205156950672647, "loss": 1.5379, "step": 1611 }, { "epoch": 0.18061118736169854, "grad_norm": 1.8855663537979126, "learning_rate": 0.00018204035874439464, "loss": 1.0955, "step": 1612 }, { "epoch": 0.18072322904120333, "grad_norm": 2.0650863647460938, "learning_rate": 0.00018202914798206278, "loss": 1.7766, "step": 1613 }, { "epoch": 0.1808352707207081, "grad_norm": 1.2225654125213623, "learning_rate": 0.00018201793721973094, "loss": 1.8228, "step": 1614 }, { "epoch": 0.1809473124002129, "grad_norm": 1.8436907529830933, "learning_rate": 0.0001820067264573991, "loss": 2.1793, "step": 1615 }, { "epoch": 0.18105935407971765, "grad_norm": 2.187063455581665, "learning_rate": 0.00018199551569506728, "loss": 1.6412, "step": 1616 }, { "epoch": 0.18117139575922242, "grad_norm": 2.0125179290771484, "learning_rate": 0.00018198430493273545, "loss": 1.3446, "step": 1617 }, { "epoch": 0.1812834374387272, "grad_norm": 0.8175727725028992, "learning_rate": 0.0001819730941704036, "loss": 1.4491, "step": 1618 }, { "epoch": 0.18139547911823198, "grad_norm": 1.0991135835647583, "learning_rate": 0.00018196188340807175, "loss": 1.5535, "step": 1619 }, { "epoch": 0.18150752079773677, "grad_norm": 1.0122016668319702, "learning_rate": 0.00018195067264573992, "loss": 1.1775, "step": 1620 }, { "epoch": 0.18161956247724154, "grad_norm": 1.686361312866211, "learning_rate": 0.0001819394618834081, "loss": 1.5747, "step": 1621 }, { "epoch": 0.1817316041567463, "grad_norm": 1.712575912475586, "learning_rate": 0.00018192825112107623, "loss": 1.9438, "step": 1622 }, { "epoch": 0.1818436458362511, "grad_norm": 1.8753660917282104, "learning_rate": 0.0001819170403587444, "loss": 2.0592, "step": 1623 }, { "epoch": 0.18195568751575586, "grad_norm": 0.9508956074714661, "learning_rate": 0.00018190582959641256, "loss": 1.5987, "step": 1624 }, { "epoch": 0.18206772919526062, "grad_norm": 2.790085792541504, "learning_rate": 0.00018189461883408073, "loss": 1.9977, "step": 1625 }, { "epoch": 0.18217977087476542, "grad_norm": 2.4126927852630615, "learning_rate": 0.0001818834080717489, "loss": 1.4459, "step": 1626 }, { "epoch": 0.18229181255427018, "grad_norm": 1.7399758100509644, "learning_rate": 0.00018187219730941704, "loss": 1.337, "step": 1627 }, { "epoch": 0.18240385423377498, "grad_norm": 1.4598666429519653, "learning_rate": 0.0001818609865470852, "loss": 1.5062, "step": 1628 }, { "epoch": 0.18251589591327974, "grad_norm": 2.100229501724243, "learning_rate": 0.00018184977578475337, "loss": 1.2043, "step": 1629 }, { "epoch": 0.1826279375927845, "grad_norm": 0.9248847365379333, "learning_rate": 0.00018183856502242154, "loss": 1.0183, "step": 1630 }, { "epoch": 0.1827399792722893, "grad_norm": 2.1479856967926025, "learning_rate": 0.0001818273542600897, "loss": 2.112, "step": 1631 }, { "epoch": 0.18285202095179406, "grad_norm": 2.4541172981262207, "learning_rate": 0.00018181614349775787, "loss": 1.3579, "step": 1632 }, { "epoch": 0.18296406263129883, "grad_norm": 2.325071096420288, "learning_rate": 0.00018180493273542601, "loss": 1.3112, "step": 1633 }, { "epoch": 0.18307610431080362, "grad_norm": 1.2949467897415161, "learning_rate": 0.00018179372197309418, "loss": 1.764, "step": 1634 }, { "epoch": 0.1831881459903084, "grad_norm": 2.407471179962158, "learning_rate": 0.00018178251121076232, "loss": 1.7482, "step": 1635 }, { "epoch": 0.18330018766981318, "grad_norm": 1.4536856412887573, "learning_rate": 0.0001817713004484305, "loss": 2.0826, "step": 1636 }, { "epoch": 0.18341222934931795, "grad_norm": 1.2285767793655396, "learning_rate": 0.00018176008968609866, "loss": 1.2355, "step": 1637 }, { "epoch": 0.1835242710288227, "grad_norm": 2.129831552505493, "learning_rate": 0.00018174887892376682, "loss": 2.0753, "step": 1638 }, { "epoch": 0.1836363127083275, "grad_norm": 1.1330775022506714, "learning_rate": 0.000181737668161435, "loss": 1.1287, "step": 1639 }, { "epoch": 0.18374835438783227, "grad_norm": 1.3071669340133667, "learning_rate": 0.00018172645739910313, "loss": 1.3497, "step": 1640 }, { "epoch": 0.18386039606733706, "grad_norm": 2.1469411849975586, "learning_rate": 0.0001817152466367713, "loss": 1.7617, "step": 1641 }, { "epoch": 0.18397243774684183, "grad_norm": 2.3130743503570557, "learning_rate": 0.00018170403587443947, "loss": 1.3498, "step": 1642 }, { "epoch": 0.1840844794263466, "grad_norm": 1.5571949481964111, "learning_rate": 0.00018169282511210763, "loss": 1.9076, "step": 1643 }, { "epoch": 0.18419652110585139, "grad_norm": 1.1969375610351562, "learning_rate": 0.0001816816143497758, "loss": 1.5944, "step": 1644 }, { "epoch": 0.18430856278535615, "grad_norm": 1.893578052520752, "learning_rate": 0.00018167040358744397, "loss": 1.723, "step": 1645 }, { "epoch": 0.18442060446486092, "grad_norm": 2.315589189529419, "learning_rate": 0.00018165919282511214, "loss": 1.49, "step": 1646 }, { "epoch": 0.1845326461443657, "grad_norm": 1.4582254886627197, "learning_rate": 0.00018164798206278028, "loss": 1.0246, "step": 1647 }, { "epoch": 0.18464468782387047, "grad_norm": 1.2517777681350708, "learning_rate": 0.00018163677130044844, "loss": 1.2446, "step": 1648 }, { "epoch": 0.18475672950337527, "grad_norm": 1.620779037475586, "learning_rate": 0.00018162556053811658, "loss": 1.3814, "step": 1649 }, { "epoch": 0.18486877118288003, "grad_norm": 2.1484127044677734, "learning_rate": 0.00018161434977578475, "loss": 2.1212, "step": 1650 }, { "epoch": 0.1849808128623848, "grad_norm": 1.2447823286056519, "learning_rate": 0.00018160313901345292, "loss": 0.9948, "step": 1651 }, { "epoch": 0.1850928545418896, "grad_norm": 3.475999355316162, "learning_rate": 0.00018159192825112109, "loss": 1.1893, "step": 1652 }, { "epoch": 0.18520489622139436, "grad_norm": 1.2707011699676514, "learning_rate": 0.00018158071748878925, "loss": 1.9084, "step": 1653 }, { "epoch": 0.18531693790089915, "grad_norm": 1.7747670412063599, "learning_rate": 0.0001815695067264574, "loss": 1.663, "step": 1654 }, { "epoch": 0.1854289795804039, "grad_norm": 1.6993193626403809, "learning_rate": 0.00018155829596412556, "loss": 1.6366, "step": 1655 }, { "epoch": 0.18554102125990868, "grad_norm": 0.9219907522201538, "learning_rate": 0.00018154708520179373, "loss": 1.3856, "step": 1656 }, { "epoch": 0.18565306293941347, "grad_norm": 1.7372359037399292, "learning_rate": 0.0001815358744394619, "loss": 1.3464, "step": 1657 }, { "epoch": 0.18576510461891824, "grad_norm": 1.5150718688964844, "learning_rate": 0.00018152466367713006, "loss": 1.9206, "step": 1658 }, { "epoch": 0.185877146298423, "grad_norm": 1.4194999933242798, "learning_rate": 0.00018151345291479823, "loss": 1.7114, "step": 1659 }, { "epoch": 0.1859891879779278, "grad_norm": 1.067771553993225, "learning_rate": 0.00018150224215246637, "loss": 1.0627, "step": 1660 }, { "epoch": 0.18610122965743256, "grad_norm": 3.8066630363464355, "learning_rate": 0.00018149103139013454, "loss": 1.949, "step": 1661 }, { "epoch": 0.18621327133693735, "grad_norm": 1.5302711725234985, "learning_rate": 0.0001814798206278027, "loss": 1.8473, "step": 1662 }, { "epoch": 0.18632531301644212, "grad_norm": 1.729211449623108, "learning_rate": 0.00018146860986547085, "loss": 1.1968, "step": 1663 }, { "epoch": 0.18643735469594688, "grad_norm": 2.173419952392578, "learning_rate": 0.000181457399103139, "loss": 1.6243, "step": 1664 }, { "epoch": 0.18654939637545168, "grad_norm": 1.9120025634765625, "learning_rate": 0.00018144618834080718, "loss": 1.3776, "step": 1665 }, { "epoch": 0.18666143805495644, "grad_norm": 0.9334443211555481, "learning_rate": 0.00018143497757847535, "loss": 1.3072, "step": 1666 }, { "epoch": 0.1867734797344612, "grad_norm": 1.0132437944412231, "learning_rate": 0.00018142376681614352, "loss": 1.4825, "step": 1667 }, { "epoch": 0.186885521413966, "grad_norm": 1.6699738502502441, "learning_rate": 0.00018141255605381166, "loss": 1.5285, "step": 1668 }, { "epoch": 0.18699756309347076, "grad_norm": 1.033102035522461, "learning_rate": 0.00018140134529147982, "loss": 1.3069, "step": 1669 }, { "epoch": 0.18710960477297556, "grad_norm": 2.424725294113159, "learning_rate": 0.000181390134529148, "loss": 1.4406, "step": 1670 }, { "epoch": 0.18722164645248032, "grad_norm": 1.9403090476989746, "learning_rate": 0.00018137892376681616, "loss": 1.8268, "step": 1671 }, { "epoch": 0.1873336881319851, "grad_norm": 1.824560284614563, "learning_rate": 0.00018136771300448433, "loss": 1.3296, "step": 1672 }, { "epoch": 0.18744572981148988, "grad_norm": 1.5011614561080933, "learning_rate": 0.0001813565022421525, "loss": 1.318, "step": 1673 }, { "epoch": 0.18755777149099465, "grad_norm": 0.9056119918823242, "learning_rate": 0.00018134529147982063, "loss": 1.8129, "step": 1674 }, { "epoch": 0.18766981317049944, "grad_norm": 2.3090596199035645, "learning_rate": 0.0001813340807174888, "loss": 1.6949, "step": 1675 }, { "epoch": 0.1877818548500042, "grad_norm": 3.6758055686950684, "learning_rate": 0.00018132286995515694, "loss": 1.4646, "step": 1676 }, { "epoch": 0.18789389652950897, "grad_norm": 2.041908025741577, "learning_rate": 0.0001813116591928251, "loss": 1.3752, "step": 1677 }, { "epoch": 0.18800593820901376, "grad_norm": 1.0144339799880981, "learning_rate": 0.00018130044843049328, "loss": 1.0794, "step": 1678 }, { "epoch": 0.18811797988851853, "grad_norm": 1.5284099578857422, "learning_rate": 0.00018128923766816144, "loss": 1.1353, "step": 1679 }, { "epoch": 0.1882300215680233, "grad_norm": 2.2290523052215576, "learning_rate": 0.0001812780269058296, "loss": 0.8609, "step": 1680 }, { "epoch": 0.18834206324752809, "grad_norm": 1.9376591444015503, "learning_rate": 0.00018126681614349778, "loss": 1.5244, "step": 1681 }, { "epoch": 0.18845410492703285, "grad_norm": 1.8338649272918701, "learning_rate": 0.00018125560538116592, "loss": 1.3142, "step": 1682 }, { "epoch": 0.18856614660653764, "grad_norm": 1.3870604038238525, "learning_rate": 0.00018124439461883408, "loss": 1.4117, "step": 1683 }, { "epoch": 0.1886781882860424, "grad_norm": 1.8202948570251465, "learning_rate": 0.00018123318385650225, "loss": 1.6249, "step": 1684 }, { "epoch": 0.18879022996554717, "grad_norm": 1.8744208812713623, "learning_rate": 0.00018122197309417042, "loss": 1.545, "step": 1685 }, { "epoch": 0.18890227164505197, "grad_norm": 0.936476469039917, "learning_rate": 0.0001812107623318386, "loss": 1.9202, "step": 1686 }, { "epoch": 0.18901431332455673, "grad_norm": 1.5301246643066406, "learning_rate": 0.00018119955156950675, "loss": 1.8181, "step": 1687 }, { "epoch": 0.1891263550040615, "grad_norm": 1.8908886909484863, "learning_rate": 0.0001811883408071749, "loss": 1.4608, "step": 1688 }, { "epoch": 0.1892383966835663, "grad_norm": 1.68300199508667, "learning_rate": 0.00018117713004484306, "loss": 2.0475, "step": 1689 }, { "epoch": 0.18935043836307106, "grad_norm": 1.4881280660629272, "learning_rate": 0.0001811659192825112, "loss": 1.1545, "step": 1690 }, { "epoch": 0.18946248004257585, "grad_norm": 1.590811848640442, "learning_rate": 0.00018115470852017937, "loss": 1.5668, "step": 1691 }, { "epoch": 0.1895745217220806, "grad_norm": 1.4912766218185425, "learning_rate": 0.00018114349775784754, "loss": 1.8161, "step": 1692 }, { "epoch": 0.18968656340158538, "grad_norm": 1.017862319946289, "learning_rate": 0.0001811322869955157, "loss": 1.4447, "step": 1693 }, { "epoch": 0.18979860508109017, "grad_norm": 1.6721965074539185, "learning_rate": 0.00018112107623318387, "loss": 1.7151, "step": 1694 }, { "epoch": 0.18991064676059494, "grad_norm": 1.3725577592849731, "learning_rate": 0.000181109865470852, "loss": 0.9752, "step": 1695 }, { "epoch": 0.19002268844009973, "grad_norm": 2.555344820022583, "learning_rate": 0.00018109865470852018, "loss": 1.3247, "step": 1696 }, { "epoch": 0.1901347301196045, "grad_norm": 1.3780598640441895, "learning_rate": 0.00018108744394618835, "loss": 1.8947, "step": 1697 }, { "epoch": 0.19024677179910926, "grad_norm": 2.0449910163879395, "learning_rate": 0.00018107623318385651, "loss": 1.5754, "step": 1698 }, { "epoch": 0.19035881347861405, "grad_norm": 1.171738862991333, "learning_rate": 0.00018106502242152468, "loss": 1.5533, "step": 1699 }, { "epoch": 0.19047085515811882, "grad_norm": 1.3335314989089966, "learning_rate": 0.00018105381165919285, "loss": 1.3513, "step": 1700 }, { "epoch": 0.19058289683762358, "grad_norm": 2.7387280464172363, "learning_rate": 0.00018104260089686102, "loss": 1.6032, "step": 1701 }, { "epoch": 0.19069493851712838, "grad_norm": 1.1875395774841309, "learning_rate": 0.00018103139013452916, "loss": 2.0185, "step": 1702 }, { "epoch": 0.19080698019663314, "grad_norm": 0.9890737533569336, "learning_rate": 0.0001810201793721973, "loss": 1.3658, "step": 1703 }, { "epoch": 0.19091902187613793, "grad_norm": 1.6466283798217773, "learning_rate": 0.00018100896860986546, "loss": 1.072, "step": 1704 }, { "epoch": 0.1910310635556427, "grad_norm": 1.9556056261062622, "learning_rate": 0.00018099775784753363, "loss": 1.5382, "step": 1705 }, { "epoch": 0.19114310523514746, "grad_norm": 1.0384567975997925, "learning_rate": 0.0001809865470852018, "loss": 1.2517, "step": 1706 }, { "epoch": 0.19125514691465226, "grad_norm": 1.6002236604690552, "learning_rate": 0.00018097533632286997, "loss": 1.0722, "step": 1707 }, { "epoch": 0.19136718859415702, "grad_norm": 1.4033840894699097, "learning_rate": 0.00018096412556053813, "loss": 1.1501, "step": 1708 }, { "epoch": 0.19147923027366182, "grad_norm": 1.6027247905731201, "learning_rate": 0.00018095291479820627, "loss": 0.8778, "step": 1709 }, { "epoch": 0.19159127195316658, "grad_norm": 1.4240169525146484, "learning_rate": 0.00018094170403587444, "loss": 1.5288, "step": 1710 }, { "epoch": 0.19170331363267135, "grad_norm": 1.8711841106414795, "learning_rate": 0.0001809304932735426, "loss": 1.4498, "step": 1711 }, { "epoch": 0.19181535531217614, "grad_norm": 2.368931770324707, "learning_rate": 0.00018091928251121078, "loss": 1.5839, "step": 1712 }, { "epoch": 0.1919273969916809, "grad_norm": 1.1613301038742065, "learning_rate": 0.00018090807174887894, "loss": 1.2167, "step": 1713 }, { "epoch": 0.19203943867118567, "grad_norm": 1.4324872493743896, "learning_rate": 0.0001808968609865471, "loss": 1.5329, "step": 1714 }, { "epoch": 0.19215148035069046, "grad_norm": 2.185842275619507, "learning_rate": 0.00018088565022421525, "loss": 1.8019, "step": 1715 }, { "epoch": 0.19226352203019523, "grad_norm": 1.2101887464523315, "learning_rate": 0.00018087443946188342, "loss": 1.1144, "step": 1716 }, { "epoch": 0.19237556370970002, "grad_norm": 0.8061571717262268, "learning_rate": 0.00018086322869955156, "loss": 1.2297, "step": 1717 }, { "epoch": 0.19248760538920479, "grad_norm": 1.040459394454956, "learning_rate": 0.00018085201793721973, "loss": 1.8678, "step": 1718 }, { "epoch": 0.19259964706870955, "grad_norm": 2.3125596046447754, "learning_rate": 0.0001808408071748879, "loss": 1.8838, "step": 1719 }, { "epoch": 0.19271168874821434, "grad_norm": 2.6361887454986572, "learning_rate": 0.00018082959641255606, "loss": 1.3793, "step": 1720 }, { "epoch": 0.1928237304277191, "grad_norm": 2.8635785579681396, "learning_rate": 0.00018081838565022423, "loss": 1.6416, "step": 1721 }, { "epoch": 0.19293577210722387, "grad_norm": 1.7486282587051392, "learning_rate": 0.0001808071748878924, "loss": 1.3909, "step": 1722 }, { "epoch": 0.19304781378672867, "grad_norm": 1.3921085596084595, "learning_rate": 0.00018079596412556054, "loss": 0.8964, "step": 1723 }, { "epoch": 0.19315985546623343, "grad_norm": 1.5965625047683716, "learning_rate": 0.0001807847533632287, "loss": 1.4447, "step": 1724 }, { "epoch": 0.19327189714573823, "grad_norm": 1.9785382747650146, "learning_rate": 0.00018077354260089687, "loss": 1.696, "step": 1725 }, { "epoch": 0.193383938825243, "grad_norm": 1.17860746383667, "learning_rate": 0.00018076233183856504, "loss": 1.1893, "step": 1726 }, { "epoch": 0.19349598050474776, "grad_norm": 1.5895090103149414, "learning_rate": 0.0001807511210762332, "loss": 1.0587, "step": 1727 }, { "epoch": 0.19360802218425255, "grad_norm": 1.3763586282730103, "learning_rate": 0.00018073991031390137, "loss": 0.7975, "step": 1728 }, { "epoch": 0.1937200638637573, "grad_norm": 1.5766597986221313, "learning_rate": 0.0001807286995515695, "loss": 1.8574, "step": 1729 }, { "epoch": 0.1938321055432621, "grad_norm": 1.1627898216247559, "learning_rate": 0.00018071748878923768, "loss": 1.5527, "step": 1730 }, { "epoch": 0.19394414722276687, "grad_norm": 1.0556070804595947, "learning_rate": 0.00018070627802690582, "loss": 0.7802, "step": 1731 }, { "epoch": 0.19405618890227164, "grad_norm": 2.020914316177368, "learning_rate": 0.000180695067264574, "loss": 1.1308, "step": 1732 }, { "epoch": 0.19416823058177643, "grad_norm": 1.933242678642273, "learning_rate": 0.00018068385650224215, "loss": 1.4594, "step": 1733 }, { "epoch": 0.1942802722612812, "grad_norm": 1.626237154006958, "learning_rate": 0.00018067264573991032, "loss": 1.3112, "step": 1734 }, { "epoch": 0.19439231394078596, "grad_norm": 1.479982614517212, "learning_rate": 0.0001806614349775785, "loss": 1.4445, "step": 1735 }, { "epoch": 0.19450435562029075, "grad_norm": 1.3006569147109985, "learning_rate": 0.00018065022421524666, "loss": 1.4168, "step": 1736 }, { "epoch": 0.19461639729979552, "grad_norm": 1.1111482381820679, "learning_rate": 0.0001806390134529148, "loss": 1.1335, "step": 1737 }, { "epoch": 0.1947284389793003, "grad_norm": 1.2311307191848755, "learning_rate": 0.00018062780269058296, "loss": 1.5776, "step": 1738 }, { "epoch": 0.19484048065880508, "grad_norm": 1.7336584329605103, "learning_rate": 0.00018061659192825113, "loss": 1.2724, "step": 1739 }, { "epoch": 0.19495252233830984, "grad_norm": 3.3085153102874756, "learning_rate": 0.0001806053811659193, "loss": 1.6348, "step": 1740 }, { "epoch": 0.19506456401781463, "grad_norm": 0.9748929142951965, "learning_rate": 0.00018059417040358747, "loss": 1.3708, "step": 1741 }, { "epoch": 0.1951766056973194, "grad_norm": 1.2503916025161743, "learning_rate": 0.00018058295964125563, "loss": 1.312, "step": 1742 }, { "epoch": 0.19528864737682416, "grad_norm": 1.05488920211792, "learning_rate": 0.00018057174887892377, "loss": 0.9397, "step": 1743 }, { "epoch": 0.19540068905632896, "grad_norm": 1.2139266729354858, "learning_rate": 0.00018056053811659191, "loss": 1.3217, "step": 1744 }, { "epoch": 0.19551273073583372, "grad_norm": 2.048985242843628, "learning_rate": 0.00018054932735426008, "loss": 1.9266, "step": 1745 }, { "epoch": 0.19562477241533852, "grad_norm": 1.1995795965194702, "learning_rate": 0.00018053811659192825, "loss": 1.5864, "step": 1746 }, { "epoch": 0.19573681409484328, "grad_norm": 1.435536503791809, "learning_rate": 0.00018052690582959642, "loss": 1.4584, "step": 1747 }, { "epoch": 0.19584885577434805, "grad_norm": 1.7108381986618042, "learning_rate": 0.00018051569506726458, "loss": 2.175, "step": 1748 }, { "epoch": 0.19596089745385284, "grad_norm": 2.0006818771362305, "learning_rate": 0.00018050448430493275, "loss": 1.4319, "step": 1749 }, { "epoch": 0.1960729391333576, "grad_norm": 2.0118446350097656, "learning_rate": 0.0001804932735426009, "loss": 1.4809, "step": 1750 }, { "epoch": 0.1961849808128624, "grad_norm": 1.4628947973251343, "learning_rate": 0.00018048206278026906, "loss": 1.3508, "step": 1751 }, { "epoch": 0.19629702249236716, "grad_norm": 1.8972599506378174, "learning_rate": 0.00018047085201793723, "loss": 1.1778, "step": 1752 }, { "epoch": 0.19640906417187193, "grad_norm": 1.3561058044433594, "learning_rate": 0.0001804596412556054, "loss": 1.1656, "step": 1753 }, { "epoch": 0.19652110585137672, "grad_norm": 1.7208757400512695, "learning_rate": 0.00018044843049327356, "loss": 1.716, "step": 1754 }, { "epoch": 0.19663314753088149, "grad_norm": 1.3500288724899292, "learning_rate": 0.00018043721973094173, "loss": 1.778, "step": 1755 }, { "epoch": 0.19674518921038625, "grad_norm": 1.6891847848892212, "learning_rate": 0.0001804260089686099, "loss": 1.6736, "step": 1756 }, { "epoch": 0.19685723088989104, "grad_norm": 1.0302332639694214, "learning_rate": 0.00018041479820627804, "loss": 1.7028, "step": 1757 }, { "epoch": 0.1969692725693958, "grad_norm": 2.3134493827819824, "learning_rate": 0.00018040358744394618, "loss": 1.6799, "step": 1758 }, { "epoch": 0.1970813142489006, "grad_norm": 2.3940887451171875, "learning_rate": 0.00018039237668161434, "loss": 0.9566, "step": 1759 }, { "epoch": 0.19719335592840537, "grad_norm": 1.2643054723739624, "learning_rate": 0.0001803811659192825, "loss": 1.5852, "step": 1760 }, { "epoch": 0.19730539760791013, "grad_norm": 1.2039997577667236, "learning_rate": 0.00018036995515695068, "loss": 1.7753, "step": 1761 }, { "epoch": 0.19741743928741493, "grad_norm": 1.7785491943359375, "learning_rate": 0.00018035874439461885, "loss": 1.0814, "step": 1762 }, { "epoch": 0.1975294809669197, "grad_norm": 2.826028347015381, "learning_rate": 0.000180347533632287, "loss": 1.8159, "step": 1763 }, { "epoch": 0.19764152264642448, "grad_norm": 1.3592725992202759, "learning_rate": 0.00018033632286995515, "loss": 0.6257, "step": 1764 }, { "epoch": 0.19775356432592925, "grad_norm": 2.1041905879974365, "learning_rate": 0.00018032511210762332, "loss": 1.4744, "step": 1765 }, { "epoch": 0.197865606005434, "grad_norm": 0.8406914472579956, "learning_rate": 0.0001803139013452915, "loss": 1.9173, "step": 1766 }, { "epoch": 0.1979776476849388, "grad_norm": 1.4054983854293823, "learning_rate": 0.00018030269058295966, "loss": 1.4566, "step": 1767 }, { "epoch": 0.19808968936444357, "grad_norm": 1.3622665405273438, "learning_rate": 0.00018029147982062782, "loss": 1.6901, "step": 1768 }, { "epoch": 0.19820173104394834, "grad_norm": 1.4518219232559204, "learning_rate": 0.000180280269058296, "loss": 1.1665, "step": 1769 }, { "epoch": 0.19831377272345313, "grad_norm": 1.0980442762374878, "learning_rate": 0.00018026905829596416, "loss": 0.6499, "step": 1770 }, { "epoch": 0.1984258144029579, "grad_norm": 1.3195980787277222, "learning_rate": 0.0001802578475336323, "loss": 1.089, "step": 1771 }, { "epoch": 0.1985378560824627, "grad_norm": 2.1436245441436768, "learning_rate": 0.00018024663677130044, "loss": 1.4666, "step": 1772 }, { "epoch": 0.19864989776196745, "grad_norm": 1.421644926071167, "learning_rate": 0.0001802354260089686, "loss": 1.4565, "step": 1773 }, { "epoch": 0.19876193944147222, "grad_norm": 1.4131979942321777, "learning_rate": 0.00018022421524663677, "loss": 0.992, "step": 1774 }, { "epoch": 0.198873981120977, "grad_norm": 1.8771066665649414, "learning_rate": 0.00018021300448430494, "loss": 1.3671, "step": 1775 }, { "epoch": 0.19898602280048178, "grad_norm": 0.9425510168075562, "learning_rate": 0.0001802017937219731, "loss": 2.0039, "step": 1776 }, { "epoch": 0.19909806447998654, "grad_norm": 1.5099050998687744, "learning_rate": 0.00018019058295964127, "loss": 1.2181, "step": 1777 }, { "epoch": 0.19921010615949133, "grad_norm": 2.4603168964385986, "learning_rate": 0.00018017937219730941, "loss": 2.0242, "step": 1778 }, { "epoch": 0.1993221478389961, "grad_norm": 1.6915079355239868, "learning_rate": 0.00018016816143497758, "loss": 1.0127, "step": 1779 }, { "epoch": 0.1994341895185009, "grad_norm": 0.9954845309257507, "learning_rate": 0.00018015695067264575, "loss": 1.5073, "step": 1780 }, { "epoch": 0.19954623119800566, "grad_norm": 1.3834517002105713, "learning_rate": 0.00018014573991031392, "loss": 1.2816, "step": 1781 }, { "epoch": 0.19965827287751042, "grad_norm": 1.8671817779541016, "learning_rate": 0.00018013452914798208, "loss": 2.4138, "step": 1782 }, { "epoch": 0.19977031455701522, "grad_norm": 2.4641191959381104, "learning_rate": 0.00018012331838565025, "loss": 1.718, "step": 1783 }, { "epoch": 0.19988235623651998, "grad_norm": 1.8003531694412231, "learning_rate": 0.0001801121076233184, "loss": 1.4666, "step": 1784 }, { "epoch": 0.19999439791602477, "grad_norm": 1.9365266561508179, "learning_rate": 0.00018010089686098653, "loss": 2.0092, "step": 1785 }, { "epoch": 0.20010643959552954, "grad_norm": 1.581733226776123, "learning_rate": 0.0001800896860986547, "loss": 1.2751, "step": 1786 }, { "epoch": 0.2002184812750343, "grad_norm": 1.7085609436035156, "learning_rate": 0.00018007847533632287, "loss": 1.4857, "step": 1787 }, { "epoch": 0.2003305229545391, "grad_norm": 2.463107109069824, "learning_rate": 0.00018006726457399103, "loss": 1.5663, "step": 1788 }, { "epoch": 0.20044256463404386, "grad_norm": 1.162453532218933, "learning_rate": 0.0001800560538116592, "loss": 1.3549, "step": 1789 }, { "epoch": 0.20055460631354863, "grad_norm": 2.579047203063965, "learning_rate": 0.00018004484304932737, "loss": 1.2523, "step": 1790 }, { "epoch": 0.20066664799305342, "grad_norm": 1.4472559690475464, "learning_rate": 0.00018003363228699554, "loss": 1.8477, "step": 1791 }, { "epoch": 0.20077868967255819, "grad_norm": 2.1441471576690674, "learning_rate": 0.00018002242152466368, "loss": 1.2388, "step": 1792 }, { "epoch": 0.20089073135206298, "grad_norm": 1.5368895530700684, "learning_rate": 0.00018001121076233184, "loss": 0.8972, "step": 1793 }, { "epoch": 0.20100277303156774, "grad_norm": 2.1784934997558594, "learning_rate": 0.00018, "loss": 1.1985, "step": 1794 }, { "epoch": 0.2011148147110725, "grad_norm": 1.3437546491622925, "learning_rate": 0.00017998878923766818, "loss": 1.347, "step": 1795 }, { "epoch": 0.2012268563905773, "grad_norm": 4.698522090911865, "learning_rate": 0.00017997757847533635, "loss": 1.574, "step": 1796 }, { "epoch": 0.20133889807008207, "grad_norm": 1.2091940641403198, "learning_rate": 0.0001799663677130045, "loss": 1.7784, "step": 1797 }, { "epoch": 0.20145093974958683, "grad_norm": 1.4155324697494507, "learning_rate": 0.00017995515695067265, "loss": 1.0201, "step": 1798 }, { "epoch": 0.20156298142909163, "grad_norm": 1.0874507427215576, "learning_rate": 0.0001799439461883408, "loss": 1.4544, "step": 1799 }, { "epoch": 0.2016750231085964, "grad_norm": 2.0191211700439453, "learning_rate": 0.00017993273542600896, "loss": 0.8587, "step": 1800 }, { "epoch": 0.20178706478810118, "grad_norm": 1.313207745552063, "learning_rate": 0.00017992152466367713, "loss": 1.3789, "step": 1801 }, { "epoch": 0.20189910646760595, "grad_norm": 1.355238437652588, "learning_rate": 0.0001799103139013453, "loss": 1.6632, "step": 1802 }, { "epoch": 0.20201114814711071, "grad_norm": 3.083904504776001, "learning_rate": 0.00017989910313901346, "loss": 1.8266, "step": 1803 }, { "epoch": 0.2021231898266155, "grad_norm": 1.2274266481399536, "learning_rate": 0.00017988789237668163, "loss": 1.9193, "step": 1804 }, { "epoch": 0.20223523150612027, "grad_norm": 1.4547547101974487, "learning_rate": 0.0001798766816143498, "loss": 0.6167, "step": 1805 }, { "epoch": 0.20234727318562507, "grad_norm": 1.6761064529418945, "learning_rate": 0.00017986547085201794, "loss": 1.6899, "step": 1806 }, { "epoch": 0.20245931486512983, "grad_norm": 1.63422429561615, "learning_rate": 0.0001798542600896861, "loss": 0.7758, "step": 1807 }, { "epoch": 0.2025713565446346, "grad_norm": 1.984563946723938, "learning_rate": 0.00017984304932735427, "loss": 1.6353, "step": 1808 }, { "epoch": 0.2026833982241394, "grad_norm": 0.9661206007003784, "learning_rate": 0.00017983183856502244, "loss": 1.6697, "step": 1809 }, { "epoch": 0.20279543990364415, "grad_norm": 0.95804363489151, "learning_rate": 0.0001798206278026906, "loss": 1.7892, "step": 1810 }, { "epoch": 0.20290748158314892, "grad_norm": 1.5836058855056763, "learning_rate": 0.00017980941704035878, "loss": 1.9288, "step": 1811 }, { "epoch": 0.2030195232626537, "grad_norm": 0.8995055556297302, "learning_rate": 0.00017979820627802692, "loss": 1.5825, "step": 1812 }, { "epoch": 0.20313156494215848, "grad_norm": 1.1860733032226562, "learning_rate": 0.00017978699551569506, "loss": 1.4638, "step": 1813 }, { "epoch": 0.20324360662166327, "grad_norm": 1.2416683435440063, "learning_rate": 0.00017977578475336322, "loss": 1.2284, "step": 1814 }, { "epoch": 0.20335564830116803, "grad_norm": 1.790514588356018, "learning_rate": 0.0001797645739910314, "loss": 1.7595, "step": 1815 }, { "epoch": 0.2034676899806728, "grad_norm": 2.5158045291900635, "learning_rate": 0.00017975336322869956, "loss": 1.7655, "step": 1816 }, { "epoch": 0.2035797316601776, "grad_norm": 1.679612159729004, "learning_rate": 0.00017974215246636773, "loss": 1.8018, "step": 1817 }, { "epoch": 0.20369177333968236, "grad_norm": 1.45846688747406, "learning_rate": 0.0001797309417040359, "loss": 1.5917, "step": 1818 }, { "epoch": 0.20380381501918715, "grad_norm": 1.4264174699783325, "learning_rate": 0.00017971973094170403, "loss": 1.4046, "step": 1819 }, { "epoch": 0.20391585669869192, "grad_norm": 1.494156002998352, "learning_rate": 0.0001797085201793722, "loss": 1.3659, "step": 1820 }, { "epoch": 0.20402789837819668, "grad_norm": 1.9159314632415771, "learning_rate": 0.00017969730941704037, "loss": 1.3381, "step": 1821 }, { "epoch": 0.20413994005770147, "grad_norm": 2.428243398666382, "learning_rate": 0.00017968609865470853, "loss": 1.3558, "step": 1822 }, { "epoch": 0.20425198173720624, "grad_norm": 0.8685066103935242, "learning_rate": 0.0001796748878923767, "loss": 1.8583, "step": 1823 }, { "epoch": 0.204364023416711, "grad_norm": 1.46707022190094, "learning_rate": 0.00017966367713004487, "loss": 1.2342, "step": 1824 }, { "epoch": 0.2044760650962158, "grad_norm": 1.6089460849761963, "learning_rate": 0.00017965246636771304, "loss": 1.5653, "step": 1825 }, { "epoch": 0.20458810677572056, "grad_norm": 1.6510190963745117, "learning_rate": 0.00017964125560538118, "loss": 2.2085, "step": 1826 }, { "epoch": 0.20470014845522536, "grad_norm": 1.7074508666992188, "learning_rate": 0.00017963004484304932, "loss": 1.1076, "step": 1827 }, { "epoch": 0.20481219013473012, "grad_norm": 1.8651657104492188, "learning_rate": 0.00017961883408071748, "loss": 1.5442, "step": 1828 }, { "epoch": 0.2049242318142349, "grad_norm": 1.7825952768325806, "learning_rate": 0.00017960762331838565, "loss": 1.3989, "step": 1829 }, { "epoch": 0.20503627349373968, "grad_norm": 1.1968122720718384, "learning_rate": 0.00017959641255605382, "loss": 1.3568, "step": 1830 }, { "epoch": 0.20514831517324444, "grad_norm": 1.1848400831222534, "learning_rate": 0.000179585201793722, "loss": 1.6873, "step": 1831 }, { "epoch": 0.2052603568527492, "grad_norm": 1.5878314971923828, "learning_rate": 0.00017957399103139015, "loss": 1.2913, "step": 1832 }, { "epoch": 0.205372398532254, "grad_norm": 3.6087279319763184, "learning_rate": 0.0001795627802690583, "loss": 1.8472, "step": 1833 }, { "epoch": 0.20548444021175877, "grad_norm": 1.8331087827682495, "learning_rate": 0.00017955156950672646, "loss": 1.9372, "step": 1834 }, { "epoch": 0.20559648189126356, "grad_norm": 1.6804227828979492, "learning_rate": 0.00017954035874439463, "loss": 1.4554, "step": 1835 }, { "epoch": 0.20570852357076833, "grad_norm": 1.580568552017212, "learning_rate": 0.0001795291479820628, "loss": 1.1662, "step": 1836 }, { "epoch": 0.2058205652502731, "grad_norm": 1.5122134685516357, "learning_rate": 0.00017951793721973096, "loss": 1.2661, "step": 1837 }, { "epoch": 0.20593260692977788, "grad_norm": 1.511322021484375, "learning_rate": 0.00017950672645739913, "loss": 1.3354, "step": 1838 }, { "epoch": 0.20604464860928265, "grad_norm": 1.38935387134552, "learning_rate": 0.00017949551569506727, "loss": 1.6694, "step": 1839 }, { "epoch": 0.20615669028878744, "grad_norm": 1.620784878730774, "learning_rate": 0.0001794843049327354, "loss": 2.21, "step": 1840 }, { "epoch": 0.2062687319682922, "grad_norm": 1.7228413820266724, "learning_rate": 0.00017947309417040358, "loss": 1.4653, "step": 1841 }, { "epoch": 0.20638077364779697, "grad_norm": 1.2183958292007446, "learning_rate": 0.00017946188340807175, "loss": 1.5369, "step": 1842 }, { "epoch": 0.20649281532730177, "grad_norm": 1.820860505104065, "learning_rate": 0.00017945067264573991, "loss": 1.4043, "step": 1843 }, { "epoch": 0.20660485700680653, "grad_norm": 1.432909369468689, "learning_rate": 0.00017943946188340808, "loss": 1.1385, "step": 1844 }, { "epoch": 0.2067168986863113, "grad_norm": 2.067331314086914, "learning_rate": 0.00017942825112107625, "loss": 1.6943, "step": 1845 }, { "epoch": 0.2068289403658161, "grad_norm": 1.6762957572937012, "learning_rate": 0.00017941704035874442, "loss": 1.89, "step": 1846 }, { "epoch": 0.20694098204532085, "grad_norm": 1.4825987815856934, "learning_rate": 0.00017940582959641256, "loss": 0.6748, "step": 1847 }, { "epoch": 0.20705302372482565, "grad_norm": 1.9304664134979248, "learning_rate": 0.00017939461883408072, "loss": 1.7356, "step": 1848 }, { "epoch": 0.2071650654043304, "grad_norm": 2.7311489582061768, "learning_rate": 0.0001793834080717489, "loss": 2.0512, "step": 1849 }, { "epoch": 0.20727710708383518, "grad_norm": 1.1098415851593018, "learning_rate": 0.00017937219730941706, "loss": 1.0175, "step": 1850 }, { "epoch": 0.20738914876333997, "grad_norm": 1.785562515258789, "learning_rate": 0.00017936098654708523, "loss": 1.7721, "step": 1851 }, { "epoch": 0.20750119044284474, "grad_norm": 1.9980581998825073, "learning_rate": 0.0001793497757847534, "loss": 1.3855, "step": 1852 }, { "epoch": 0.2076132321223495, "grad_norm": 1.0713224411010742, "learning_rate": 0.00017933856502242153, "loss": 1.2215, "step": 1853 }, { "epoch": 0.2077252738018543, "grad_norm": 1.9062979221343994, "learning_rate": 0.00017932735426008967, "loss": 1.1394, "step": 1854 }, { "epoch": 0.20783731548135906, "grad_norm": 1.985835313796997, "learning_rate": 0.00017931614349775784, "loss": 1.7918, "step": 1855 }, { "epoch": 0.20794935716086385, "grad_norm": 1.8488730192184448, "learning_rate": 0.000179304932735426, "loss": 1.5386, "step": 1856 }, { "epoch": 0.20806139884036862, "grad_norm": 1.8676483631134033, "learning_rate": 0.00017929372197309418, "loss": 1.3991, "step": 1857 }, { "epoch": 0.20817344051987338, "grad_norm": 1.3486284017562866, "learning_rate": 0.00017928251121076234, "loss": 1.0692, "step": 1858 }, { "epoch": 0.20828548219937817, "grad_norm": 1.5511566400527954, "learning_rate": 0.0001792713004484305, "loss": 1.2137, "step": 1859 }, { "epoch": 0.20839752387888294, "grad_norm": 1.4116778373718262, "learning_rate": 0.00017926008968609868, "loss": 0.9139, "step": 1860 }, { "epoch": 0.20850956555838773, "grad_norm": 2.0944299697875977, "learning_rate": 0.00017924887892376682, "loss": 1.4524, "step": 1861 }, { "epoch": 0.2086216072378925, "grad_norm": 3.6198740005493164, "learning_rate": 0.00017923766816143499, "loss": 1.8219, "step": 1862 }, { "epoch": 0.20873364891739726, "grad_norm": 1.4004775285720825, "learning_rate": 0.00017922645739910315, "loss": 1.1835, "step": 1863 }, { "epoch": 0.20884569059690206, "grad_norm": 2.395087480545044, "learning_rate": 0.00017921524663677132, "loss": 1.6477, "step": 1864 }, { "epoch": 0.20895773227640682, "grad_norm": 2.1491360664367676, "learning_rate": 0.0001792040358744395, "loss": 1.5505, "step": 1865 }, { "epoch": 0.2090697739559116, "grad_norm": 1.6590564250946045, "learning_rate": 0.00017919282511210765, "loss": 1.4863, "step": 1866 }, { "epoch": 0.20918181563541638, "grad_norm": 2.5951015949249268, "learning_rate": 0.0001791816143497758, "loss": 1.7263, "step": 1867 }, { "epoch": 0.20929385731492114, "grad_norm": 1.3630061149597168, "learning_rate": 0.00017917040358744394, "loss": 0.568, "step": 1868 }, { "epoch": 0.20940589899442594, "grad_norm": 1.3892757892608643, "learning_rate": 0.0001791591928251121, "loss": 1.4607, "step": 1869 }, { "epoch": 0.2095179406739307, "grad_norm": 1.2216769456863403, "learning_rate": 0.00017914798206278027, "loss": 0.7994, "step": 1870 }, { "epoch": 0.20962998235343547, "grad_norm": 1.6796531677246094, "learning_rate": 0.00017913677130044844, "loss": 1.9782, "step": 1871 }, { "epoch": 0.20974202403294026, "grad_norm": 2.282275438308716, "learning_rate": 0.0001791255605381166, "loss": 2.4604, "step": 1872 }, { "epoch": 0.20985406571244503, "grad_norm": 1.0631881952285767, "learning_rate": 0.00017911434977578477, "loss": 1.3314, "step": 1873 }, { "epoch": 0.20996610739194982, "grad_norm": 1.9222917556762695, "learning_rate": 0.0001791031390134529, "loss": 1.4757, "step": 1874 }, { "epoch": 0.21007814907145458, "grad_norm": 1.7482991218566895, "learning_rate": 0.00017909192825112108, "loss": 1.0303, "step": 1875 }, { "epoch": 0.21019019075095935, "grad_norm": 2.1185200214385986, "learning_rate": 0.00017908071748878925, "loss": 0.9507, "step": 1876 }, { "epoch": 0.21030223243046414, "grad_norm": 1.9360709190368652, "learning_rate": 0.00017906950672645741, "loss": 1.5922, "step": 1877 }, { "epoch": 0.2104142741099689, "grad_norm": 1.1815026998519897, "learning_rate": 0.00017905829596412558, "loss": 1.6317, "step": 1878 }, { "epoch": 0.21052631578947367, "grad_norm": 1.6381725072860718, "learning_rate": 0.00017904708520179375, "loss": 1.4791, "step": 1879 }, { "epoch": 0.21063835746897847, "grad_norm": 2.8969004154205322, "learning_rate": 0.0001790358744394619, "loss": 1.4685, "step": 1880 }, { "epoch": 0.21075039914848323, "grad_norm": 1.6768348217010498, "learning_rate": 0.00017902466367713006, "loss": 1.5281, "step": 1881 }, { "epoch": 0.21086244082798802, "grad_norm": 1.6819629669189453, "learning_rate": 0.0001790134529147982, "loss": 1.4296, "step": 1882 }, { "epoch": 0.2109744825074928, "grad_norm": 1.4003517627716064, "learning_rate": 0.00017900224215246636, "loss": 1.7223, "step": 1883 }, { "epoch": 0.21108652418699755, "grad_norm": 1.97770094871521, "learning_rate": 0.00017899103139013453, "loss": 1.8731, "step": 1884 }, { "epoch": 0.21119856586650235, "grad_norm": 1.1702316999435425, "learning_rate": 0.0001789798206278027, "loss": 1.9352, "step": 1885 }, { "epoch": 0.2113106075460071, "grad_norm": 2.4684929847717285, "learning_rate": 0.00017896860986547087, "loss": 1.6838, "step": 1886 }, { "epoch": 0.21142264922551188, "grad_norm": 2.462400197982788, "learning_rate": 0.00017895739910313903, "loss": 1.8949, "step": 1887 }, { "epoch": 0.21153469090501667, "grad_norm": 2.1928279399871826, "learning_rate": 0.00017894618834080717, "loss": 1.1196, "step": 1888 }, { "epoch": 0.21164673258452144, "grad_norm": 1.3780969381332397, "learning_rate": 0.00017893497757847534, "loss": 1.8488, "step": 1889 }, { "epoch": 0.21175877426402623, "grad_norm": 2.937269687652588, "learning_rate": 0.0001789237668161435, "loss": 1.4008, "step": 1890 }, { "epoch": 0.211870815943531, "grad_norm": 1.4109565019607544, "learning_rate": 0.00017891255605381168, "loss": 1.3241, "step": 1891 }, { "epoch": 0.21198285762303576, "grad_norm": 1.1407241821289062, "learning_rate": 0.00017890134529147984, "loss": 1.899, "step": 1892 }, { "epoch": 0.21209489930254055, "grad_norm": 1.2214478254318237, "learning_rate": 0.000178890134529148, "loss": 1.489, "step": 1893 }, { "epoch": 0.21220694098204532, "grad_norm": 2.1088483333587646, "learning_rate": 0.00017887892376681615, "loss": 2.0499, "step": 1894 }, { "epoch": 0.2123189826615501, "grad_norm": 1.8910596370697021, "learning_rate": 0.00017886771300448432, "loss": 1.4119, "step": 1895 }, { "epoch": 0.21243102434105487, "grad_norm": 1.4492192268371582, "learning_rate": 0.00017885650224215246, "loss": 1.0724, "step": 1896 }, { "epoch": 0.21254306602055964, "grad_norm": 1.8173985481262207, "learning_rate": 0.00017884529147982063, "loss": 1.2184, "step": 1897 }, { "epoch": 0.21265510770006443, "grad_norm": 1.9782499074935913, "learning_rate": 0.0001788340807174888, "loss": 1.6853, "step": 1898 }, { "epoch": 0.2127671493795692, "grad_norm": 2.2964465618133545, "learning_rate": 0.00017882286995515696, "loss": 1.7097, "step": 1899 }, { "epoch": 0.21287919105907396, "grad_norm": 1.6521002054214478, "learning_rate": 0.00017881165919282513, "loss": 1.8442, "step": 1900 }, { "epoch": 0.21299123273857876, "grad_norm": 0.7144984602928162, "learning_rate": 0.0001788004484304933, "loss": 0.8919, "step": 1901 }, { "epoch": 0.21310327441808352, "grad_norm": 1.3440473079681396, "learning_rate": 0.00017878923766816144, "loss": 1.0664, "step": 1902 }, { "epoch": 0.21321531609758831, "grad_norm": 1.833707571029663, "learning_rate": 0.0001787780269058296, "loss": 1.4648, "step": 1903 }, { "epoch": 0.21332735777709308, "grad_norm": 1.3096188306808472, "learning_rate": 0.00017876681614349777, "loss": 1.5839, "step": 1904 }, { "epoch": 0.21343939945659784, "grad_norm": 1.5853313207626343, "learning_rate": 0.00017875560538116594, "loss": 1.2128, "step": 1905 }, { "epoch": 0.21355144113610264, "grad_norm": 1.55018949508667, "learning_rate": 0.0001787443946188341, "loss": 0.9677, "step": 1906 }, { "epoch": 0.2136634828156074, "grad_norm": 1.4666664600372314, "learning_rate": 0.00017873318385650225, "loss": 1.6914, "step": 1907 }, { "epoch": 0.21377552449511217, "grad_norm": 1.9743741750717163, "learning_rate": 0.0001787219730941704, "loss": 1.614, "step": 1908 }, { "epoch": 0.21388756617461696, "grad_norm": 1.9478893280029297, "learning_rate": 0.00017871076233183855, "loss": 2.2264, "step": 1909 }, { "epoch": 0.21399960785412173, "grad_norm": 1.783950686454773, "learning_rate": 0.00017869955156950672, "loss": 0.9599, "step": 1910 }, { "epoch": 0.21411164953362652, "grad_norm": 1.640621542930603, "learning_rate": 0.0001786883408071749, "loss": 1.8358, "step": 1911 }, { "epoch": 0.21422369121313128, "grad_norm": 1.3592640161514282, "learning_rate": 0.00017867713004484306, "loss": 1.4056, "step": 1912 }, { "epoch": 0.21433573289263605, "grad_norm": 2.0158400535583496, "learning_rate": 0.00017866591928251122, "loss": 1.883, "step": 1913 }, { "epoch": 0.21444777457214084, "grad_norm": 1.4300200939178467, "learning_rate": 0.0001786547085201794, "loss": 1.4303, "step": 1914 }, { "epoch": 0.2145598162516456, "grad_norm": 0.90268474817276, "learning_rate": 0.00017864349775784756, "loss": 1.3399, "step": 1915 }, { "epoch": 0.2146718579311504, "grad_norm": 1.4666378498077393, "learning_rate": 0.0001786322869955157, "loss": 1.5026, "step": 1916 }, { "epoch": 0.21478389961065517, "grad_norm": 2.1818363666534424, "learning_rate": 0.00017862107623318386, "loss": 1.7552, "step": 1917 }, { "epoch": 0.21489594129015993, "grad_norm": 2.3176987171173096, "learning_rate": 0.00017860986547085203, "loss": 1.4719, "step": 1918 }, { "epoch": 0.21500798296966472, "grad_norm": 2.2218403816223145, "learning_rate": 0.0001785986547085202, "loss": 1.5222, "step": 1919 }, { "epoch": 0.2151200246491695, "grad_norm": 1.0938738584518433, "learning_rate": 0.00017858744394618837, "loss": 1.1828, "step": 1920 }, { "epoch": 0.21523206632867425, "grad_norm": 0.9817928671836853, "learning_rate": 0.0001785762331838565, "loss": 1.6546, "step": 1921 }, { "epoch": 0.21534410800817905, "grad_norm": 1.5904203653335571, "learning_rate": 0.00017856502242152467, "loss": 1.5326, "step": 1922 }, { "epoch": 0.2154561496876838, "grad_norm": 2.171383857727051, "learning_rate": 0.00017855381165919281, "loss": 1.756, "step": 1923 }, { "epoch": 0.2155681913671886, "grad_norm": 1.0565105676651, "learning_rate": 0.00017854260089686098, "loss": 1.3673, "step": 1924 }, { "epoch": 0.21568023304669337, "grad_norm": 1.040130376815796, "learning_rate": 0.00017853139013452915, "loss": 1.5153, "step": 1925 }, { "epoch": 0.21579227472619814, "grad_norm": 1.6916874647140503, "learning_rate": 0.00017852017937219732, "loss": 1.9662, "step": 1926 }, { "epoch": 0.21590431640570293, "grad_norm": 1.7368310689926147, "learning_rate": 0.00017850896860986548, "loss": 2.1472, "step": 1927 }, { "epoch": 0.2160163580852077, "grad_norm": 1.7405825853347778, "learning_rate": 0.00017849775784753365, "loss": 1.7461, "step": 1928 }, { "epoch": 0.2161283997647125, "grad_norm": 2.4322752952575684, "learning_rate": 0.0001784865470852018, "loss": 1.0433, "step": 1929 }, { "epoch": 0.21624044144421725, "grad_norm": 1.2156100273132324, "learning_rate": 0.00017847533632286996, "loss": 0.9149, "step": 1930 }, { "epoch": 0.21635248312372202, "grad_norm": 1.1841840744018555, "learning_rate": 0.00017846412556053813, "loss": 1.2399, "step": 1931 }, { "epoch": 0.2164645248032268, "grad_norm": 1.5805927515029907, "learning_rate": 0.0001784529147982063, "loss": 1.1444, "step": 1932 }, { "epoch": 0.21657656648273158, "grad_norm": 2.6840877532958984, "learning_rate": 0.00017844170403587446, "loss": 1.5807, "step": 1933 }, { "epoch": 0.21668860816223634, "grad_norm": 1.3034788370132446, "learning_rate": 0.00017843049327354263, "loss": 0.785, "step": 1934 }, { "epoch": 0.21680064984174113, "grad_norm": 1.5442471504211426, "learning_rate": 0.00017841928251121077, "loss": 1.844, "step": 1935 }, { "epoch": 0.2169126915212459, "grad_norm": 1.11683988571167, "learning_rate": 0.00017840807174887894, "loss": 1.3003, "step": 1936 }, { "epoch": 0.2170247332007507, "grad_norm": 2.777272939682007, "learning_rate": 0.00017839686098654708, "loss": 1.675, "step": 1937 }, { "epoch": 0.21713677488025546, "grad_norm": 1.1287521123886108, "learning_rate": 0.00017838565022421524, "loss": 0.8167, "step": 1938 }, { "epoch": 0.21724881655976022, "grad_norm": 2.2091634273529053, "learning_rate": 0.0001783744394618834, "loss": 1.5776, "step": 1939 }, { "epoch": 0.21736085823926501, "grad_norm": 1.7853686809539795, "learning_rate": 0.00017836322869955158, "loss": 1.4012, "step": 1940 }, { "epoch": 0.21747289991876978, "grad_norm": 1.5337635278701782, "learning_rate": 0.00017835201793721975, "loss": 2.1166, "step": 1941 }, { "epoch": 0.21758494159827454, "grad_norm": 1.8027456998825073, "learning_rate": 0.0001783408071748879, "loss": 1.3293, "step": 1942 }, { "epoch": 0.21769698327777934, "grad_norm": 1.5861016511917114, "learning_rate": 0.00017832959641255605, "loss": 1.5183, "step": 1943 }, { "epoch": 0.2178090249572841, "grad_norm": 1.728875994682312, "learning_rate": 0.00017831838565022422, "loss": 1.5032, "step": 1944 }, { "epoch": 0.2179210666367889, "grad_norm": 1.425559639930725, "learning_rate": 0.0001783071748878924, "loss": 1.7171, "step": 1945 }, { "epoch": 0.21803310831629366, "grad_norm": 2.279707193374634, "learning_rate": 0.00017829596412556056, "loss": 1.5723, "step": 1946 }, { "epoch": 0.21814514999579843, "grad_norm": 1.4842787981033325, "learning_rate": 0.00017828475336322872, "loss": 1.6372, "step": 1947 }, { "epoch": 0.21825719167530322, "grad_norm": 1.6392126083374023, "learning_rate": 0.00017827354260089686, "loss": 1.8428, "step": 1948 }, { "epoch": 0.21836923335480798, "grad_norm": 1.1719589233398438, "learning_rate": 0.00017826233183856503, "loss": 1.7531, "step": 1949 }, { "epoch": 0.21848127503431278, "grad_norm": 1.3626068830490112, "learning_rate": 0.0001782511210762332, "loss": 0.867, "step": 1950 }, { "epoch": 0.21859331671381754, "grad_norm": 1.6156997680664062, "learning_rate": 0.00017823991031390134, "loss": 2.0965, "step": 1951 }, { "epoch": 0.2187053583933223, "grad_norm": 2.4228835105895996, "learning_rate": 0.0001782286995515695, "loss": 1.4623, "step": 1952 }, { "epoch": 0.2188174000728271, "grad_norm": 2.0259640216827393, "learning_rate": 0.00017821748878923767, "loss": 1.509, "step": 1953 }, { "epoch": 0.21892944175233187, "grad_norm": 2.38797926902771, "learning_rate": 0.00017820627802690584, "loss": 1.9634, "step": 1954 }, { "epoch": 0.21904148343183663, "grad_norm": 1.7956459522247314, "learning_rate": 0.000178195067264574, "loss": 1.7674, "step": 1955 }, { "epoch": 0.21915352511134142, "grad_norm": 1.7120578289031982, "learning_rate": 0.00017818385650224218, "loss": 1.4677, "step": 1956 }, { "epoch": 0.2192655667908462, "grad_norm": 1.639975666999817, "learning_rate": 0.00017817264573991032, "loss": 1.9255, "step": 1957 }, { "epoch": 0.21937760847035098, "grad_norm": 2.72259521484375, "learning_rate": 0.00017816143497757848, "loss": 1.9043, "step": 1958 }, { "epoch": 0.21948965014985575, "grad_norm": 1.128398060798645, "learning_rate": 0.00017815022421524665, "loss": 0.8697, "step": 1959 }, { "epoch": 0.2196016918293605, "grad_norm": 1.987424612045288, "learning_rate": 0.00017813901345291482, "loss": 1.4794, "step": 1960 }, { "epoch": 0.2197137335088653, "grad_norm": 2.853107213973999, "learning_rate": 0.00017812780269058298, "loss": 1.5626, "step": 1961 }, { "epoch": 0.21982577518837007, "grad_norm": 1.3174608945846558, "learning_rate": 0.00017811659192825113, "loss": 1.5231, "step": 1962 }, { "epoch": 0.21993781686787484, "grad_norm": 2.0444326400756836, "learning_rate": 0.0001781053811659193, "loss": 1.2242, "step": 1963 }, { "epoch": 0.22004985854737963, "grad_norm": 1.1034706830978394, "learning_rate": 0.00017809417040358743, "loss": 0.7187, "step": 1964 }, { "epoch": 0.2201619002268844, "grad_norm": 0.8703362345695496, "learning_rate": 0.0001780829596412556, "loss": 1.1719, "step": 1965 }, { "epoch": 0.2202739419063892, "grad_norm": 1.2512787580490112, "learning_rate": 0.00017807174887892377, "loss": 1.2218, "step": 1966 }, { "epoch": 0.22038598358589395, "grad_norm": 1.44651460647583, "learning_rate": 0.00017806053811659193, "loss": 1.4128, "step": 1967 }, { "epoch": 0.22049802526539872, "grad_norm": 3.7442429065704346, "learning_rate": 0.0001780493273542601, "loss": 2.0432, "step": 1968 }, { "epoch": 0.2206100669449035, "grad_norm": 2.63895320892334, "learning_rate": 0.00017803811659192827, "loss": 1.449, "step": 1969 }, { "epoch": 0.22072210862440828, "grad_norm": 1.0601733922958374, "learning_rate": 0.00017802690582959644, "loss": 1.7487, "step": 1970 }, { "epoch": 0.22083415030391307, "grad_norm": 2.5111541748046875, "learning_rate": 0.00017801569506726458, "loss": 1.3407, "step": 1971 }, { "epoch": 0.22094619198341783, "grad_norm": 2.230844497680664, "learning_rate": 0.00017800448430493274, "loss": 1.7138, "step": 1972 }, { "epoch": 0.2210582336629226, "grad_norm": 1.9996036291122437, "learning_rate": 0.0001779932735426009, "loss": 1.6704, "step": 1973 }, { "epoch": 0.2211702753424274, "grad_norm": 1.2678370475769043, "learning_rate": 0.00017798206278026908, "loss": 1.982, "step": 1974 }, { "epoch": 0.22128231702193216, "grad_norm": 1.9111874103546143, "learning_rate": 0.00017797085201793722, "loss": 1.3163, "step": 1975 }, { "epoch": 0.22139435870143692, "grad_norm": 1.8089767694473267, "learning_rate": 0.0001779596412556054, "loss": 0.8731, "step": 1976 }, { "epoch": 0.22150640038094171, "grad_norm": 1.8151017427444458, "learning_rate": 0.00017794843049327355, "loss": 1.2462, "step": 1977 }, { "epoch": 0.22161844206044648, "grad_norm": 1.4595400094985962, "learning_rate": 0.0001779372197309417, "loss": 1.2994, "step": 1978 }, { "epoch": 0.22173048373995127, "grad_norm": 1.2989768981933594, "learning_rate": 0.00017792600896860986, "loss": 1.4777, "step": 1979 }, { "epoch": 0.22184252541945604, "grad_norm": 1.2181116342544556, "learning_rate": 0.00017791479820627803, "loss": 1.2459, "step": 1980 }, { "epoch": 0.2219545670989608, "grad_norm": 1.6123982667922974, "learning_rate": 0.0001779035874439462, "loss": 1.9541, "step": 1981 }, { "epoch": 0.2220666087784656, "grad_norm": 0.7435643076896667, "learning_rate": 0.00017789237668161436, "loss": 1.7353, "step": 1982 }, { "epoch": 0.22217865045797036, "grad_norm": 1.4552640914916992, "learning_rate": 0.00017788116591928253, "loss": 1.7617, "step": 1983 }, { "epoch": 0.22229069213747515, "grad_norm": 1.2474133968353271, "learning_rate": 0.00017786995515695067, "loss": 1.5308, "step": 1984 }, { "epoch": 0.22240273381697992, "grad_norm": 2.376652956008911, "learning_rate": 0.00017785874439461884, "loss": 2.1911, "step": 1985 }, { "epoch": 0.22251477549648468, "grad_norm": 1.5521727800369263, "learning_rate": 0.000177847533632287, "loss": 0.9824, "step": 1986 }, { "epoch": 0.22262681717598948, "grad_norm": 1.8883696794509888, "learning_rate": 0.00017783632286995517, "loss": 1.4666, "step": 1987 }, { "epoch": 0.22273885885549424, "grad_norm": 1.136358380317688, "learning_rate": 0.00017782511210762334, "loss": 2.0136, "step": 1988 }, { "epoch": 0.222850900534999, "grad_norm": 1.6227328777313232, "learning_rate": 0.00017781390134529148, "loss": 1.2455, "step": 1989 }, { "epoch": 0.2229629422145038, "grad_norm": 1.6556684970855713, "learning_rate": 0.00017780269058295965, "loss": 0.8911, "step": 1990 }, { "epoch": 0.22307498389400857, "grad_norm": 2.790158271789551, "learning_rate": 0.00017779147982062782, "loss": 1.476, "step": 1991 }, { "epoch": 0.22318702557351336, "grad_norm": 1.5780198574066162, "learning_rate": 0.00017778026905829596, "loss": 1.5427, "step": 1992 }, { "epoch": 0.22329906725301812, "grad_norm": 1.3716769218444824, "learning_rate": 0.00017776905829596412, "loss": 1.1249, "step": 1993 }, { "epoch": 0.2234111089325229, "grad_norm": 3.047131299972534, "learning_rate": 0.0001777578475336323, "loss": 2.4492, "step": 1994 }, { "epoch": 0.22352315061202768, "grad_norm": 1.130710482597351, "learning_rate": 0.00017774663677130046, "loss": 1.8119, "step": 1995 }, { "epoch": 0.22363519229153245, "grad_norm": 1.542148232460022, "learning_rate": 0.00017773542600896863, "loss": 1.2442, "step": 1996 }, { "epoch": 0.2237472339710372, "grad_norm": 1.307167410850525, "learning_rate": 0.0001777242152466368, "loss": 1.4577, "step": 1997 }, { "epoch": 0.223859275650542, "grad_norm": 2.2471230030059814, "learning_rate": 0.00017771300448430493, "loss": 1.4378, "step": 1998 }, { "epoch": 0.22397131733004677, "grad_norm": 1.6478996276855469, "learning_rate": 0.0001777017937219731, "loss": 1.697, "step": 1999 }, { "epoch": 0.22408335900955156, "grad_norm": 1.2611740827560425, "learning_rate": 0.00017769058295964127, "loss": 1.5123, "step": 2000 } ], "logging_steps": 1, "max_steps": 17850, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "total_flos": 5.634011722364191e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }