{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2502839833396441, "eval_steps": 661, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003786444528587656, "grad_norm": 10.434814453125, "learning_rate": 1e-05, "loss": 5.8683, "step": 1 }, { "epoch": 0.0003786444528587656, "eval_loss": 0.8921470642089844, "eval_runtime": 901.0053, "eval_samples_per_second": 4.937, "eval_steps_per_second": 1.234, "step": 1 }, { "epoch": 0.0007572889057175312, "grad_norm": 11.7908353805542, "learning_rate": 2e-05, "loss": 6.5457, "step": 2 }, { "epoch": 0.001135933358576297, "grad_norm": 12.84145736694336, "learning_rate": 3e-05, "loss": 6.2297, "step": 3 }, { "epoch": 0.0015145778114350624, "grad_norm": 12.087944984436035, "learning_rate": 4e-05, "loss": 5.6302, "step": 4 }, { "epoch": 0.001893222264293828, "grad_norm": 11.6513090133667, "learning_rate": 5e-05, "loss": 5.7574, "step": 5 }, { "epoch": 0.002271866717152594, "grad_norm": 11.966485977172852, "learning_rate": 6e-05, "loss": 6.4193, "step": 6 }, { "epoch": 0.0026505111700113595, "grad_norm": 14.711249351501465, "learning_rate": 7e-05, "loss": 5.9372, "step": 7 }, { "epoch": 0.003029155622870125, "grad_norm": 14.61629581451416, "learning_rate": 8e-05, "loss": 5.8455, "step": 8 }, { "epoch": 0.0034078000757288905, "grad_norm": 16.477096557617188, "learning_rate": 9e-05, "loss": 6.3195, "step": 9 }, { "epoch": 0.003786444528587656, "grad_norm": 17.702224731445312, "learning_rate": 0.0001, "loss": 6.2648, "step": 10 }, { "epoch": 0.0041650889814464215, "grad_norm": 18.931350708007812, "learning_rate": 0.00011000000000000002, "loss": 5.413, "step": 11 }, { "epoch": 0.004543733434305188, "grad_norm": 18.159408569335938, "learning_rate": 0.00012, "loss": 5.7632, "step": 12 }, { "epoch": 0.004922377887163953, "grad_norm": 15.117518424987793, "learning_rate": 0.00013000000000000002, "loss": 5.1309, "step": 13 }, { "epoch": 0.005301022340022719, "grad_norm": 13.553899765014648, "learning_rate": 0.00014, "loss": 5.5902, "step": 14 }, { "epoch": 0.005679666792881484, "grad_norm": 14.156839370727539, "learning_rate": 0.00015000000000000001, "loss": 5.8427, "step": 15 }, { "epoch": 0.00605831124574025, "grad_norm": 14.818575859069824, "learning_rate": 0.00016, "loss": 5.5943, "step": 16 }, { "epoch": 0.006436955698599016, "grad_norm": 14.836395263671875, "learning_rate": 0.00017, "loss": 5.6252, "step": 17 }, { "epoch": 0.006815600151457781, "grad_norm": 16.203628540039062, "learning_rate": 0.00018, "loss": 5.1571, "step": 18 }, { "epoch": 0.007194244604316547, "grad_norm": 16.7708797454834, "learning_rate": 0.00019, "loss": 5.1433, "step": 19 }, { "epoch": 0.007572889057175312, "grad_norm": 25.078933715820312, "learning_rate": 0.0002, "loss": 5.6986, "step": 20 }, { "epoch": 0.007951533510034078, "grad_norm": 29.939088821411133, "learning_rate": 0.00019999992816507284, "loss": 5.1664, "step": 21 }, { "epoch": 0.008330177962892843, "grad_norm": 18.42095947265625, "learning_rate": 0.0001999997126603945, "loss": 5.1388, "step": 22 }, { "epoch": 0.00870882241575161, "grad_norm": 25.811283111572266, "learning_rate": 0.00019999935348627464, "loss": 5.3499, "step": 23 }, { "epoch": 0.009087466868610375, "grad_norm": 33.81028747558594, "learning_rate": 0.00019999885064322928, "loss": 5.0324, "step": 24 }, { "epoch": 0.00946611132146914, "grad_norm": 40.47433853149414, "learning_rate": 0.00019999820413198083, "loss": 4.3909, "step": 25 }, { "epoch": 0.009844755774327906, "grad_norm": 19.015504837036133, "learning_rate": 0.00019999741395345812, "loss": 6.7567, "step": 26 }, { "epoch": 0.010223400227186671, "grad_norm": 12.075000762939453, "learning_rate": 0.00019999648010879647, "loss": 7.2374, "step": 27 }, { "epoch": 0.010602044680045438, "grad_norm": 12.168185234069824, "learning_rate": 0.00019999540259933745, "loss": 5.8711, "step": 28 }, { "epoch": 0.010980689132904203, "grad_norm": 9.592961311340332, "learning_rate": 0.00019999418142662917, "loss": 5.6626, "step": 29 }, { "epoch": 0.011359333585762969, "grad_norm": 8.446025848388672, "learning_rate": 0.00019999281659242608, "loss": 5.5054, "step": 30 }, { "epoch": 0.011737978038621734, "grad_norm": 8.979402542114258, "learning_rate": 0.000199991308098689, "loss": 5.268, "step": 31 }, { "epoch": 0.0121166224914805, "grad_norm": 11.317891120910645, "learning_rate": 0.00019998965594758523, "loss": 5.7779, "step": 32 }, { "epoch": 0.012495266944339266, "grad_norm": 9.691301345825195, "learning_rate": 0.00019998786014148838, "loss": 5.4439, "step": 33 }, { "epoch": 0.012873911397198031, "grad_norm": 9.59664249420166, "learning_rate": 0.0001999859206829785, "loss": 5.0315, "step": 34 }, { "epoch": 0.013252555850056797, "grad_norm": 12.453145980834961, "learning_rate": 0.000199983837574842, "loss": 5.7834, "step": 35 }, { "epoch": 0.013631200302915562, "grad_norm": 10.473176956176758, "learning_rate": 0.00019998161082007164, "loss": 5.4829, "step": 36 }, { "epoch": 0.014009844755774327, "grad_norm": 9.838224411010742, "learning_rate": 0.0001999792404218667, "loss": 5.0974, "step": 37 }, { "epoch": 0.014388489208633094, "grad_norm": 12.153512954711914, "learning_rate": 0.00019997672638363262, "loss": 5.4264, "step": 38 }, { "epoch": 0.01476713366149186, "grad_norm": 11.956207275390625, "learning_rate": 0.00019997406870898133, "loss": 5.5113, "step": 39 }, { "epoch": 0.015145778114350625, "grad_norm": 12.791664123535156, "learning_rate": 0.00019997126740173114, "loss": 5.2499, "step": 40 }, { "epoch": 0.01552442256720939, "grad_norm": 9.361381530761719, "learning_rate": 0.0001999683224659067, "loss": 4.8344, "step": 41 }, { "epoch": 0.015903067020068155, "grad_norm": 10.928776741027832, "learning_rate": 0.000199965233905739, "loss": 6.2701, "step": 42 }, { "epoch": 0.01628171147292692, "grad_norm": 12.90079402923584, "learning_rate": 0.00019996200172566527, "loss": 5.0639, "step": 43 }, { "epoch": 0.016660355925785686, "grad_norm": 13.444856643676758, "learning_rate": 0.0001999586259303293, "loss": 4.7613, "step": 44 }, { "epoch": 0.01703900037864445, "grad_norm": 13.767925262451172, "learning_rate": 0.00019995510652458105, "loss": 4.0273, "step": 45 }, { "epoch": 0.01741764483150322, "grad_norm": 15.626961708068848, "learning_rate": 0.00019995144351347678, "loss": 4.7746, "step": 46 }, { "epoch": 0.017796289284361985, "grad_norm": 26.331802368164062, "learning_rate": 0.00019994763690227925, "loss": 4.8851, "step": 47 }, { "epoch": 0.01817493373722075, "grad_norm": 22.59031867980957, "learning_rate": 0.0001999436866964573, "loss": 4.2097, "step": 48 }, { "epoch": 0.018553578190079516, "grad_norm": 17.976661682128906, "learning_rate": 0.00019993959290168627, "loss": 2.8742, "step": 49 }, { "epoch": 0.01893222264293828, "grad_norm": 41.0118408203125, "learning_rate": 0.00019993535552384766, "loss": 5.117, "step": 50 }, { "epoch": 0.019310867095797046, "grad_norm": 16.585399627685547, "learning_rate": 0.0001999309745690293, "loss": 6.8806, "step": 51 }, { "epoch": 0.01968951154865581, "grad_norm": 11.035297393798828, "learning_rate": 0.00019992645004352535, "loss": 6.3081, "step": 52 }, { "epoch": 0.020068156001514577, "grad_norm": 8.331199645996094, "learning_rate": 0.00019992178195383614, "loss": 5.6585, "step": 53 }, { "epoch": 0.020446800454373342, "grad_norm": 9.50080394744873, "learning_rate": 0.00019991697030666833, "loss": 6.0226, "step": 54 }, { "epoch": 0.020825444907232107, "grad_norm": 8.85689640045166, "learning_rate": 0.00019991201510893483, "loss": 6.2203, "step": 55 }, { "epoch": 0.021204089360090876, "grad_norm": 10.029090881347656, "learning_rate": 0.00019990691636775473, "loss": 6.0392, "step": 56 }, { "epoch": 0.02158273381294964, "grad_norm": 8.855071067810059, "learning_rate": 0.0001999016740904534, "loss": 5.3339, "step": 57 }, { "epoch": 0.021961378265808407, "grad_norm": 9.614381790161133, "learning_rate": 0.00019989628828456237, "loss": 4.4834, "step": 58 }, { "epoch": 0.022340022718667172, "grad_norm": 11.6102933883667, "learning_rate": 0.00019989075895781948, "loss": 5.65, "step": 59 }, { "epoch": 0.022718667171525937, "grad_norm": 10.178080558776855, "learning_rate": 0.00019988508611816868, "loss": 5.7066, "step": 60 }, { "epoch": 0.023097311624384703, "grad_norm": 10.321159362792969, "learning_rate": 0.00019987926977376014, "loss": 6.1788, "step": 61 }, { "epoch": 0.023475956077243468, "grad_norm": 9.083809852600098, "learning_rate": 0.00019987330993295014, "loss": 4.5461, "step": 62 }, { "epoch": 0.023854600530102233, "grad_norm": 10.712754249572754, "learning_rate": 0.00019986720660430124, "loss": 5.7984, "step": 63 }, { "epoch": 0.024233244982961, "grad_norm": 11.429099082946777, "learning_rate": 0.0001998609597965821, "loss": 5.7685, "step": 64 }, { "epoch": 0.024611889435819764, "grad_norm": 11.242938995361328, "learning_rate": 0.00019985456951876742, "loss": 4.7109, "step": 65 }, { "epoch": 0.024990533888678532, "grad_norm": 13.852066993713379, "learning_rate": 0.00019984803578003817, "loss": 5.1454, "step": 66 }, { "epoch": 0.025369178341537298, "grad_norm": 11.528448104858398, "learning_rate": 0.00019984135858978132, "loss": 4.6155, "step": 67 }, { "epoch": 0.025747822794396063, "grad_norm": 15.846125602722168, "learning_rate": 0.00019983453795759, "loss": 6.0121, "step": 68 }, { "epoch": 0.026126467247254828, "grad_norm": 14.896081924438477, "learning_rate": 0.00019982757389326342, "loss": 4.7377, "step": 69 }, { "epoch": 0.026505111700113593, "grad_norm": 19.171016693115234, "learning_rate": 0.0001998204664068068, "loss": 5.2492, "step": 70 }, { "epoch": 0.02688375615297236, "grad_norm": 17.842302322387695, "learning_rate": 0.0001998132155084315, "loss": 4.3207, "step": 71 }, { "epoch": 0.027262400605831124, "grad_norm": 15.214621543884277, "learning_rate": 0.00019980582120855483, "loss": 5.0446, "step": 72 }, { "epoch": 0.02764104505868989, "grad_norm": 18.836454391479492, "learning_rate": 0.0001997982835178002, "loss": 4.0261, "step": 73 }, { "epoch": 0.028019689511548655, "grad_norm": 37.965946197509766, "learning_rate": 0.00019979060244699698, "loss": 4.8663, "step": 74 }, { "epoch": 0.02839833396440742, "grad_norm": 56.08090591430664, "learning_rate": 0.00019978277800718054, "loss": 6.0144, "step": 75 }, { "epoch": 0.02877697841726619, "grad_norm": 10.698841094970703, "learning_rate": 0.0001997748102095923, "loss": 6.8784, "step": 76 }, { "epoch": 0.029155622870124954, "grad_norm": 9.394986152648926, "learning_rate": 0.00019976669906567954, "loss": 6.6554, "step": 77 }, { "epoch": 0.02953426732298372, "grad_norm": 7.796587944030762, "learning_rate": 0.00019975844458709557, "loss": 6.1919, "step": 78 }, { "epoch": 0.029912911775842484, "grad_norm": 9.376069068908691, "learning_rate": 0.0001997500467856995, "loss": 5.957, "step": 79 }, { "epoch": 0.03029155622870125, "grad_norm": 10.019186973571777, "learning_rate": 0.00019974150567355655, "loss": 6.4973, "step": 80 }, { "epoch": 0.030670200681560015, "grad_norm": 10.175936698913574, "learning_rate": 0.00019973282126293758, "loss": 6.0903, "step": 81 }, { "epoch": 0.03104884513441878, "grad_norm": 9.391570091247559, "learning_rate": 0.00019972399356631964, "loss": 5.2329, "step": 82 }, { "epoch": 0.03142748958727755, "grad_norm": 9.361041069030762, "learning_rate": 0.00019971502259638534, "loss": 6.0391, "step": 83 }, { "epoch": 0.03180613404013631, "grad_norm": 9.759671211242676, "learning_rate": 0.00019970590836602335, "loss": 6.0924, "step": 84 }, { "epoch": 0.03218477849299508, "grad_norm": 9.57455062866211, "learning_rate": 0.000199696650888328, "loss": 5.4275, "step": 85 }, { "epoch": 0.03256342294585384, "grad_norm": 9.738369941711426, "learning_rate": 0.00019968725017659953, "loss": 5.2149, "step": 86 }, { "epoch": 0.03294206739871261, "grad_norm": 10.45261287689209, "learning_rate": 0.00019967770624434387, "loss": 4.5745, "step": 87 }, { "epoch": 0.03332071185157137, "grad_norm": 11.853706359863281, "learning_rate": 0.00019966801910527288, "loss": 5.6621, "step": 88 }, { "epoch": 0.03369935630443014, "grad_norm": 9.945990562438965, "learning_rate": 0.000199658188773304, "loss": 4.975, "step": 89 }, { "epoch": 0.0340780007572889, "grad_norm": 9.144219398498535, "learning_rate": 0.00019964821526256043, "loss": 4.6526, "step": 90 }, { "epoch": 0.03445664521014767, "grad_norm": 10.96102237701416, "learning_rate": 0.00019963809858737115, "loss": 5.5929, "step": 91 }, { "epoch": 0.03483528966300644, "grad_norm": 12.377371788024902, "learning_rate": 0.0001996278387622707, "loss": 5.2634, "step": 92 }, { "epoch": 0.0352139341158652, "grad_norm": 12.394866943359375, "learning_rate": 0.00019961743580199946, "loss": 5.6475, "step": 93 }, { "epoch": 0.03559257856872397, "grad_norm": 15.493999481201172, "learning_rate": 0.00019960688972150327, "loss": 5.0573, "step": 94 }, { "epoch": 0.03597122302158273, "grad_norm": 14.89577865600586, "learning_rate": 0.00019959620053593366, "loss": 4.3286, "step": 95 }, { "epoch": 0.0363498674744415, "grad_norm": 17.289690017700195, "learning_rate": 0.00019958536826064784, "loss": 4.417, "step": 96 }, { "epoch": 0.03672851192730026, "grad_norm": 19.77994155883789, "learning_rate": 0.00019957439291120848, "loss": 5.0353, "step": 97 }, { "epoch": 0.03710715638015903, "grad_norm": 35.42997741699219, "learning_rate": 0.00019956327450338382, "loss": 4.8566, "step": 98 }, { "epoch": 0.03748580083301779, "grad_norm": 34.25278091430664, "learning_rate": 0.00019955201305314768, "loss": 5.0527, "step": 99 }, { "epoch": 0.03786444528587656, "grad_norm": 32.984031677246094, "learning_rate": 0.00019954060857667942, "loss": 4.1489, "step": 100 }, { "epoch": 0.03824308973873533, "grad_norm": 25.177045822143555, "learning_rate": 0.00019952906109036377, "loss": 7.8832, "step": 101 }, { "epoch": 0.03862173419159409, "grad_norm": 11.03814697265625, "learning_rate": 0.00019951737061079102, "loss": 6.4552, "step": 102 }, { "epoch": 0.03900037864445286, "grad_norm": 9.427952766418457, "learning_rate": 0.00019950553715475684, "loss": 6.2599, "step": 103 }, { "epoch": 0.03937902309731162, "grad_norm": 9.926714897155762, "learning_rate": 0.00019949356073926236, "loss": 5.2806, "step": 104 }, { "epoch": 0.03975766755017039, "grad_norm": 9.968575477600098, "learning_rate": 0.00019948144138151407, "loss": 5.6615, "step": 105 }, { "epoch": 0.040136312003029154, "grad_norm": 9.264948844909668, "learning_rate": 0.00019946917909892384, "loss": 5.1696, "step": 106 }, { "epoch": 0.04051495645588792, "grad_norm": 11.23089599609375, "learning_rate": 0.00019945677390910887, "loss": 5.7446, "step": 107 }, { "epoch": 0.040893600908746684, "grad_norm": 12.40099048614502, "learning_rate": 0.0001994442258298917, "loss": 5.274, "step": 108 }, { "epoch": 0.04127224536160545, "grad_norm": 10.000561714172363, "learning_rate": 0.00019943153487930005, "loss": 5.4166, "step": 109 }, { "epoch": 0.041650889814464215, "grad_norm": 9.609817504882812, "learning_rate": 0.00019941870107556713, "loss": 5.4101, "step": 110 }, { "epoch": 0.042029534267322984, "grad_norm": 11.121451377868652, "learning_rate": 0.00019940572443713115, "loss": 5.1717, "step": 111 }, { "epoch": 0.04240817872018175, "grad_norm": 9.745224952697754, "learning_rate": 0.0001993926049826356, "loss": 5.4372, "step": 112 }, { "epoch": 0.042786823173040514, "grad_norm": 10.347518920898438, "learning_rate": 0.00019937934273092932, "loss": 5.8101, "step": 113 }, { "epoch": 0.04316546762589928, "grad_norm": 12.633049011230469, "learning_rate": 0.00019936593770106603, "loss": 5.6172, "step": 114 }, { "epoch": 0.043544112078758045, "grad_norm": 11.7897310256958, "learning_rate": 0.00019935238991230473, "loss": 5.5913, "step": 115 }, { "epoch": 0.04392275653161681, "grad_norm": 11.529661178588867, "learning_rate": 0.0001993386993841096, "loss": 4.7866, "step": 116 }, { "epoch": 0.044301400984475575, "grad_norm": 9.104240417480469, "learning_rate": 0.00019932486613614972, "loss": 4.1991, "step": 117 }, { "epoch": 0.044680045437334344, "grad_norm": 12.118027687072754, "learning_rate": 0.00019931089018829934, "loss": 4.7862, "step": 118 }, { "epoch": 0.045058689890193106, "grad_norm": 12.452719688415527, "learning_rate": 0.00019929677156063766, "loss": 4.1519, "step": 119 }, { "epoch": 0.045437334343051874, "grad_norm": 15.220785140991211, "learning_rate": 0.00019928251027344888, "loss": 4.8224, "step": 120 }, { "epoch": 0.04581597879591064, "grad_norm": 19.72614097595215, "learning_rate": 0.0001992681063472222, "loss": 5.4116, "step": 121 }, { "epoch": 0.046194623248769405, "grad_norm": 15.22668170928955, "learning_rate": 0.00019925355980265176, "loss": 4.1883, "step": 122 }, { "epoch": 0.046573267701628174, "grad_norm": 22.6198673248291, "learning_rate": 0.00019923887066063643, "loss": 4.0129, "step": 123 }, { "epoch": 0.046951912154486936, "grad_norm": 38.672752380371094, "learning_rate": 0.0001992240389422802, "loss": 3.4216, "step": 124 }, { "epoch": 0.047330556607345704, "grad_norm": 32.17597198486328, "learning_rate": 0.00019920906466889174, "loss": 4.9508, "step": 125 }, { "epoch": 0.047709201060204466, "grad_norm": 14.039003372192383, "learning_rate": 0.00019919394786198453, "loss": 6.7088, "step": 126 }, { "epoch": 0.048087845513063235, "grad_norm": 9.275696754455566, "learning_rate": 0.00019917868854327692, "loss": 5.7713, "step": 127 }, { "epoch": 0.048466489965922, "grad_norm": 9.453801155090332, "learning_rate": 0.00019916328673469193, "loss": 5.5684, "step": 128 }, { "epoch": 0.048845134418780765, "grad_norm": 9.081092834472656, "learning_rate": 0.0001991477424583573, "loss": 6.0058, "step": 129 }, { "epoch": 0.04922377887163953, "grad_norm": 7.833642482757568, "learning_rate": 0.00019913205573660552, "loss": 5.4775, "step": 130 }, { "epoch": 0.049602423324498296, "grad_norm": 8.797674179077148, "learning_rate": 0.0001991162265919736, "loss": 6.0334, "step": 131 }, { "epoch": 0.049981067777357065, "grad_norm": 8.712818145751953, "learning_rate": 0.00019910025504720332, "loss": 5.0432, "step": 132 }, { "epoch": 0.050359712230215826, "grad_norm": 10.346916198730469, "learning_rate": 0.00019908414112524092, "loss": 5.2967, "step": 133 }, { "epoch": 0.050738356683074595, "grad_norm": 9.813155174255371, "learning_rate": 0.0001990678848492373, "loss": 5.2965, "step": 134 }, { "epoch": 0.05111700113593336, "grad_norm": 9.53530216217041, "learning_rate": 0.0001990514862425478, "loss": 5.3226, "step": 135 }, { "epoch": 0.051495645588792126, "grad_norm": 9.706903457641602, "learning_rate": 0.00019903494532873226, "loss": 5.1397, "step": 136 }, { "epoch": 0.05187429004165089, "grad_norm": 9.555363655090332, "learning_rate": 0.00019901826213155504, "loss": 4.7094, "step": 137 }, { "epoch": 0.052252934494509656, "grad_norm": 9.73580265045166, "learning_rate": 0.00019900143667498477, "loss": 4.8708, "step": 138 }, { "epoch": 0.05263157894736842, "grad_norm": 12.988117218017578, "learning_rate": 0.0001989844689831947, "loss": 5.4945, "step": 139 }, { "epoch": 0.05301022340022719, "grad_norm": 11.392786979675293, "learning_rate": 0.00019896735908056217, "loss": 4.9868, "step": 140 }, { "epoch": 0.053388867853085956, "grad_norm": 11.049524307250977, "learning_rate": 0.00019895010699166895, "loss": 5.6386, "step": 141 }, { "epoch": 0.05376751230594472, "grad_norm": 15.501945495605469, "learning_rate": 0.0001989327127413012, "loss": 4.9812, "step": 142 }, { "epoch": 0.054146156758803486, "grad_norm": 12.038402557373047, "learning_rate": 0.00019891517635444909, "loss": 4.5501, "step": 143 }, { "epoch": 0.05452480121166225, "grad_norm": 14.716442108154297, "learning_rate": 0.00019889749785630722, "loss": 5.4678, "step": 144 }, { "epoch": 0.05490344566452102, "grad_norm": 14.685711860656738, "learning_rate": 0.00019887967727227418, "loss": 4.0556, "step": 145 }, { "epoch": 0.05528209011737978, "grad_norm": 19.5123291015625, "learning_rate": 0.00019886171462795283, "loss": 4.4198, "step": 146 }, { "epoch": 0.05566073457023855, "grad_norm": 20.309396743774414, "learning_rate": 0.00019884360994915006, "loss": 5.0207, "step": 147 }, { "epoch": 0.05603937902309731, "grad_norm": 18.461915969848633, "learning_rate": 0.00019882536326187685, "loss": 4.3499, "step": 148 }, { "epoch": 0.05641802347595608, "grad_norm": 28.44086265563965, "learning_rate": 0.00019880697459234817, "loss": 3.1848, "step": 149 }, { "epoch": 0.05679666792881484, "grad_norm": 58.63621520996094, "learning_rate": 0.00019878844396698298, "loss": 5.8651, "step": 150 }, { "epoch": 0.05717531238167361, "grad_norm": 13.71030044555664, "learning_rate": 0.00019876977141240426, "loss": 6.241, "step": 151 }, { "epoch": 0.05755395683453238, "grad_norm": 10.938446044921875, "learning_rate": 0.00019875095695543875, "loss": 5.6771, "step": 152 }, { "epoch": 0.05793260128739114, "grad_norm": 10.96714973449707, "learning_rate": 0.00019873200062311725, "loss": 5.2314, "step": 153 }, { "epoch": 0.05831124574024991, "grad_norm": 7.606492519378662, "learning_rate": 0.00019871290244267425, "loss": 5.7249, "step": 154 }, { "epoch": 0.05868989019310867, "grad_norm": 8.784101486206055, "learning_rate": 0.00019869366244154804, "loss": 4.9694, "step": 155 }, { "epoch": 0.05906853464596744, "grad_norm": 11.263976097106934, "learning_rate": 0.00019867428064738077, "loss": 5.5875, "step": 156 }, { "epoch": 0.0594471790988262, "grad_norm": 9.343450546264648, "learning_rate": 0.0001986547570880182, "loss": 6.221, "step": 157 }, { "epoch": 0.05982582355168497, "grad_norm": 9.731782913208008, "learning_rate": 0.00019863509179150984, "loss": 6.2793, "step": 158 }, { "epoch": 0.06020446800454373, "grad_norm": 10.603925704956055, "learning_rate": 0.00019861528478610873, "loss": 5.226, "step": 159 }, { "epoch": 0.0605831124574025, "grad_norm": 8.70156192779541, "learning_rate": 0.00019859533610027162, "loss": 5.7189, "step": 160 }, { "epoch": 0.06096175691026127, "grad_norm": 11.445813179016113, "learning_rate": 0.00019857524576265872, "loss": 5.772, "step": 161 }, { "epoch": 0.06134040136312003, "grad_norm": 9.810565948486328, "learning_rate": 0.0001985550138021338, "loss": 5.2862, "step": 162 }, { "epoch": 0.0617190458159788, "grad_norm": 9.25048828125, "learning_rate": 0.00019853464024776406, "loss": 4.5556, "step": 163 }, { "epoch": 0.06209769026883756, "grad_norm": 9.317825317382812, "learning_rate": 0.00019851412512882023, "loss": 5.3411, "step": 164 }, { "epoch": 0.06247633472169633, "grad_norm": 11.587838172912598, "learning_rate": 0.0001984934684747763, "loss": 5.739, "step": 165 }, { "epoch": 0.0628549791745551, "grad_norm": 12.702302932739258, "learning_rate": 0.00019847267031530965, "loss": 4.9714, "step": 166 }, { "epoch": 0.06323362362741386, "grad_norm": 14.249470710754395, "learning_rate": 0.00019845173068030097, "loss": 4.5709, "step": 167 }, { "epoch": 0.06361226808027262, "grad_norm": 14.03624439239502, "learning_rate": 0.00019843064959983422, "loss": 4.487, "step": 168 }, { "epoch": 0.06399091253313138, "grad_norm": 12.434381484985352, "learning_rate": 0.00019840942710419658, "loss": 4.296, "step": 169 }, { "epoch": 0.06436955698599016, "grad_norm": 15.566539764404297, "learning_rate": 0.00019838806322387828, "loss": 4.1964, "step": 170 }, { "epoch": 0.06474820143884892, "grad_norm": 12.214476585388184, "learning_rate": 0.0001983665579895729, "loss": 3.9038, "step": 171 }, { "epoch": 0.06512684589170768, "grad_norm": 16.57448387145996, "learning_rate": 0.0001983449114321769, "loss": 4.5337, "step": 172 }, { "epoch": 0.06550549034456646, "grad_norm": 18.46966552734375, "learning_rate": 0.0001983231235827899, "loss": 4.4065, "step": 173 }, { "epoch": 0.06588413479742522, "grad_norm": 24.38216781616211, "learning_rate": 0.00019830119447271442, "loss": 2.9628, "step": 174 }, { "epoch": 0.06626277925028398, "grad_norm": 23.528114318847656, "learning_rate": 0.00019827912413345603, "loss": 3.3465, "step": 175 }, { "epoch": 0.06664142370314274, "grad_norm": 10.8902587890625, "learning_rate": 0.00019825691259672313, "loss": 6.1824, "step": 176 }, { "epoch": 0.06702006815600152, "grad_norm": 10.006114959716797, "learning_rate": 0.000198234559894427, "loss": 5.6762, "step": 177 }, { "epoch": 0.06739871260886028, "grad_norm": 9.918802261352539, "learning_rate": 0.00019821206605868174, "loss": 5.5663, "step": 178 }, { "epoch": 0.06777735706171904, "grad_norm": 8.497994422912598, "learning_rate": 0.00019818943112180423, "loss": 5.8234, "step": 179 }, { "epoch": 0.0681560015145778, "grad_norm": 9.795154571533203, "learning_rate": 0.00019816665511631403, "loss": 5.3252, "step": 180 }, { "epoch": 0.06853464596743658, "grad_norm": 11.03689193725586, "learning_rate": 0.0001981437380749334, "loss": 6.0853, "step": 181 }, { "epoch": 0.06891329042029534, "grad_norm": 9.795255661010742, "learning_rate": 0.00019812068003058721, "loss": 5.0421, "step": 182 }, { "epoch": 0.0692919348731541, "grad_norm": 10.504554748535156, "learning_rate": 0.00019809748101640295, "loss": 5.2529, "step": 183 }, { "epoch": 0.06967057932601288, "grad_norm": 9.605035781860352, "learning_rate": 0.0001980741410657106, "loss": 5.0307, "step": 184 }, { "epoch": 0.07004922377887164, "grad_norm": 10.972379684448242, "learning_rate": 0.00019805066021204258, "loss": 5.13, "step": 185 }, { "epoch": 0.0704278682317304, "grad_norm": 10.463446617126465, "learning_rate": 0.00019802703848913384, "loss": 4.6112, "step": 186 }, { "epoch": 0.07080651268458917, "grad_norm": 11.090287208557129, "learning_rate": 0.0001980032759309217, "loss": 5.1514, "step": 187 }, { "epoch": 0.07118515713744794, "grad_norm": 11.830557823181152, "learning_rate": 0.00019797937257154573, "loss": 5.6081, "step": 188 }, { "epoch": 0.0715638015903067, "grad_norm": 10.591259002685547, "learning_rate": 0.00019795532844534792, "loss": 4.729, "step": 189 }, { "epoch": 0.07194244604316546, "grad_norm": 10.960124015808105, "learning_rate": 0.00019793114358687236, "loss": 4.6169, "step": 190 }, { "epoch": 0.07232109049602424, "grad_norm": 11.412923812866211, "learning_rate": 0.00019790681803086548, "loss": 4.6233, "step": 191 }, { "epoch": 0.072699734948883, "grad_norm": 11.271405220031738, "learning_rate": 0.00019788235181227574, "loss": 4.5077, "step": 192 }, { "epoch": 0.07307837940174176, "grad_norm": 11.715191841125488, "learning_rate": 0.00019785774496625366, "loss": 4.5266, "step": 193 }, { "epoch": 0.07345702385460053, "grad_norm": 14.390351295471191, "learning_rate": 0.00019783299752815196, "loss": 5.2515, "step": 194 }, { "epoch": 0.0738356683074593, "grad_norm": 11.806098937988281, "learning_rate": 0.00019780810953352518, "loss": 3.7989, "step": 195 }, { "epoch": 0.07421431276031806, "grad_norm": 13.76820182800293, "learning_rate": 0.00019778308101812988, "loss": 3.8526, "step": 196 }, { "epoch": 0.07459295721317682, "grad_norm": 16.82176399230957, "learning_rate": 0.0001977579120179245, "loss": 4.024, "step": 197 }, { "epoch": 0.07497160166603559, "grad_norm": 27.145509719848633, "learning_rate": 0.0001977326025690693, "loss": 4.9692, "step": 198 }, { "epoch": 0.07535024611889436, "grad_norm": 17.646276473999023, "learning_rate": 0.00019770715270792634, "loss": 2.3489, "step": 199 }, { "epoch": 0.07572889057175312, "grad_norm": 56.70100021362305, "learning_rate": 0.00019768156247105937, "loss": 3.9912, "step": 200 }, { "epoch": 0.07610753502461189, "grad_norm": 9.720958709716797, "learning_rate": 0.0001976558318952339, "loss": 6.4383, "step": 201 }, { "epoch": 0.07648617947747066, "grad_norm": 10.620763778686523, "learning_rate": 0.00019762996101741696, "loss": 7.2243, "step": 202 }, { "epoch": 0.07686482393032942, "grad_norm": 8.535510063171387, "learning_rate": 0.00019760394987477722, "loss": 5.143, "step": 203 }, { "epoch": 0.07724346838318819, "grad_norm": 9.765297889709473, "learning_rate": 0.00019757779850468484, "loss": 5.1503, "step": 204 }, { "epoch": 0.07762211283604695, "grad_norm": 9.695032119750977, "learning_rate": 0.00019755150694471146, "loss": 6.0913, "step": 205 }, { "epoch": 0.07800075728890572, "grad_norm": 8.690482139587402, "learning_rate": 0.00019752507523263015, "loss": 5.1187, "step": 206 }, { "epoch": 0.07837940174176448, "grad_norm": 8.73969554901123, "learning_rate": 0.0001974985034064153, "loss": 5.0969, "step": 207 }, { "epoch": 0.07875804619462325, "grad_norm": 9.594573020935059, "learning_rate": 0.0001974717915042426, "loss": 4.5138, "step": 208 }, { "epoch": 0.07913669064748201, "grad_norm": 10.64561653137207, "learning_rate": 0.00019744493956448897, "loss": 6.0733, "step": 209 }, { "epoch": 0.07951533510034078, "grad_norm": 10.740833282470703, "learning_rate": 0.00019741794762573266, "loss": 4.8035, "step": 210 }, { "epoch": 0.07989397955319955, "grad_norm": 11.910998344421387, "learning_rate": 0.0001973908157267528, "loss": 4.9078, "step": 211 }, { "epoch": 0.08027262400605831, "grad_norm": 10.62619400024414, "learning_rate": 0.00019736354390652988, "loss": 4.7867, "step": 212 }, { "epoch": 0.08065126845891708, "grad_norm": 12.65106201171875, "learning_rate": 0.00019733613220424524, "loss": 4.7825, "step": 213 }, { "epoch": 0.08102991291177584, "grad_norm": 10.566100120544434, "learning_rate": 0.0001973085806592812, "loss": 4.808, "step": 214 }, { "epoch": 0.0814085573646346, "grad_norm": 14.50074291229248, "learning_rate": 0.00019728088931122105, "loss": 5.8235, "step": 215 }, { "epoch": 0.08178720181749337, "grad_norm": 11.592037200927734, "learning_rate": 0.00019725305819984893, "loss": 4.4702, "step": 216 }, { "epoch": 0.08216584627035214, "grad_norm": 11.895447731018066, "learning_rate": 0.00019722508736514974, "loss": 4.6943, "step": 217 }, { "epoch": 0.0825444907232109, "grad_norm": 13.651464462280273, "learning_rate": 0.00019719697684730914, "loss": 4.6499, "step": 218 }, { "epoch": 0.08292313517606967, "grad_norm": 14.508546829223633, "learning_rate": 0.00019716872668671344, "loss": 4.4073, "step": 219 }, { "epoch": 0.08330177962892843, "grad_norm": 12.980317115783691, "learning_rate": 0.00019714033692394965, "loss": 4.389, "step": 220 }, { "epoch": 0.0836804240817872, "grad_norm": 17.773025512695312, "learning_rate": 0.00019711180759980529, "loss": 3.8144, "step": 221 }, { "epoch": 0.08405906853464597, "grad_norm": 16.80002784729004, "learning_rate": 0.00019708313875526834, "loss": 4.2691, "step": 222 }, { "epoch": 0.08443771298750473, "grad_norm": 16.477399826049805, "learning_rate": 0.00019705433043152736, "loss": 3.5554, "step": 223 }, { "epoch": 0.0848163574403635, "grad_norm": 26.655338287353516, "learning_rate": 0.00019702538266997124, "loss": 3.5923, "step": 224 }, { "epoch": 0.08519500189322227, "grad_norm": 28.16261863708496, "learning_rate": 0.0001969962955121891, "loss": 3.247, "step": 225 }, { "epoch": 0.08557364634608103, "grad_norm": 10.550549507141113, "learning_rate": 0.00019696706899997052, "loss": 6.8701, "step": 226 }, { "epoch": 0.08595229079893979, "grad_norm": 8.839621543884277, "learning_rate": 0.0001969377031753051, "loss": 5.5944, "step": 227 }, { "epoch": 0.08633093525179857, "grad_norm": 8.947502136230469, "learning_rate": 0.00019690819808038272, "loss": 5.7622, "step": 228 }, { "epoch": 0.08670957970465733, "grad_norm": 9.0411376953125, "learning_rate": 0.00019687855375759327, "loss": 4.649, "step": 229 }, { "epoch": 0.08708822415751609, "grad_norm": 8.791936874389648, "learning_rate": 0.0001968487702495268, "loss": 5.1811, "step": 230 }, { "epoch": 0.08746686861037486, "grad_norm": 9.556682586669922, "learning_rate": 0.00019681884759897308, "loss": 5.9121, "step": 231 }, { "epoch": 0.08784551306323363, "grad_norm": 10.459571838378906, "learning_rate": 0.00019678878584892208, "loss": 5.6164, "step": 232 }, { "epoch": 0.08822415751609239, "grad_norm": 11.417348861694336, "learning_rate": 0.00019675858504256344, "loss": 4.8234, "step": 233 }, { "epoch": 0.08860280196895115, "grad_norm": 8.151843070983887, "learning_rate": 0.00019672824522328655, "loss": 4.9158, "step": 234 }, { "epoch": 0.08898144642180993, "grad_norm": 10.862298965454102, "learning_rate": 0.00019669776643468066, "loss": 5.3044, "step": 235 }, { "epoch": 0.08936009087466869, "grad_norm": 11.182097434997559, "learning_rate": 0.00019666714872053454, "loss": 5.8071, "step": 236 }, { "epoch": 0.08973873532752745, "grad_norm": 10.572265625, "learning_rate": 0.00019663639212483665, "loss": 4.8596, "step": 237 }, { "epoch": 0.09011737978038621, "grad_norm": 9.833358764648438, "learning_rate": 0.00019660549669177495, "loss": 4.7743, "step": 238 }, { "epoch": 0.09049602423324499, "grad_norm": 10.828356742858887, "learning_rate": 0.00019657446246573685, "loss": 5.5859, "step": 239 }, { "epoch": 0.09087466868610375, "grad_norm": 9.41773796081543, "learning_rate": 0.00019654328949130916, "loss": 4.6524, "step": 240 }, { "epoch": 0.09125331313896251, "grad_norm": 10.468668937683105, "learning_rate": 0.0001965119778132781, "loss": 3.8107, "step": 241 }, { "epoch": 0.09163195759182129, "grad_norm": 13.526198387145996, "learning_rate": 0.00019648052747662907, "loss": 4.77, "step": 242 }, { "epoch": 0.09201060204468005, "grad_norm": 14.636107444763184, "learning_rate": 0.0001964489385265467, "loss": 5.4413, "step": 243 }, { "epoch": 0.09238924649753881, "grad_norm": 13.989765167236328, "learning_rate": 0.00019641721100841487, "loss": 4.2013, "step": 244 }, { "epoch": 0.09276789095039757, "grad_norm": 17.5861873626709, "learning_rate": 0.0001963853449678164, "loss": 3.5504, "step": 245 }, { "epoch": 0.09314653540325635, "grad_norm": 16.095678329467773, "learning_rate": 0.00019635334045053318, "loss": 4.5176, "step": 246 }, { "epoch": 0.09352517985611511, "grad_norm": 19.897119522094727, "learning_rate": 0.00019632119750254606, "loss": 4.4155, "step": 247 }, { "epoch": 0.09390382430897387, "grad_norm": 21.077598571777344, "learning_rate": 0.0001962889161700348, "loss": 4.0351, "step": 248 }, { "epoch": 0.09428246876183263, "grad_norm": 28.60135841369629, "learning_rate": 0.00019625649649937792, "loss": 4.0419, "step": 249 }, { "epoch": 0.09466111321469141, "grad_norm": 48.00245666503906, "learning_rate": 0.00019622393853715265, "loss": 4.1211, "step": 250 }, { "epoch": 0.09503975766755017, "grad_norm": 12.560955047607422, "learning_rate": 0.00019619124233013512, "loss": 6.4683, "step": 251 }, { "epoch": 0.09541840212040893, "grad_norm": 11.030938148498535, "learning_rate": 0.00019615840792529978, "loss": 6.5968, "step": 252 }, { "epoch": 0.09579704657326771, "grad_norm": 8.982504844665527, "learning_rate": 0.00019612543536981982, "loss": 5.2818, "step": 253 }, { "epoch": 0.09617569102612647, "grad_norm": 7.904403209686279, "learning_rate": 0.00019609232471106688, "loss": 5.9209, "step": 254 }, { "epoch": 0.09655433547898523, "grad_norm": 9.775873184204102, "learning_rate": 0.00019605907599661097, "loss": 5.3489, "step": 255 }, { "epoch": 0.096932979931844, "grad_norm": 8.759675979614258, "learning_rate": 0.0001960256892742205, "loss": 4.6493, "step": 256 }, { "epoch": 0.09731162438470277, "grad_norm": 11.45134449005127, "learning_rate": 0.0001959921645918621, "loss": 4.4718, "step": 257 }, { "epoch": 0.09769026883756153, "grad_norm": 10.730209350585938, "learning_rate": 0.0001959585019977006, "loss": 5.1965, "step": 258 }, { "epoch": 0.09806891329042029, "grad_norm": 10.355484962463379, "learning_rate": 0.0001959247015400991, "loss": 4.0992, "step": 259 }, { "epoch": 0.09844755774327905, "grad_norm": 11.505188941955566, "learning_rate": 0.00019589076326761854, "loss": 5.201, "step": 260 }, { "epoch": 0.09882620219613783, "grad_norm": 13.447498321533203, "learning_rate": 0.00019585668722901808, "loss": 6.0457, "step": 261 }, { "epoch": 0.09920484664899659, "grad_norm": 10.8496732711792, "learning_rate": 0.00019582247347325473, "loss": 4.9541, "step": 262 }, { "epoch": 0.09958349110185535, "grad_norm": 10.681647300720215, "learning_rate": 0.00019578812204948328, "loss": 4.8772, "step": 263 }, { "epoch": 0.09996213555471413, "grad_norm": 11.055303573608398, "learning_rate": 0.00019575363300705637, "loss": 4.7443, "step": 264 }, { "epoch": 0.10034078000757289, "grad_norm": 11.89393424987793, "learning_rate": 0.00019571900639552437, "loss": 4.4099, "step": 265 }, { "epoch": 0.10071942446043165, "grad_norm": 11.34334659576416, "learning_rate": 0.0001956842422646353, "loss": 4.9191, "step": 266 }, { "epoch": 0.10109806891329041, "grad_norm": 9.913498878479004, "learning_rate": 0.00019564934066433476, "loss": 3.6103, "step": 267 }, { "epoch": 0.10147671336614919, "grad_norm": 12.267012596130371, "learning_rate": 0.00019561430164476574, "loss": 4.3453, "step": 268 }, { "epoch": 0.10185535781900795, "grad_norm": 10.8731050491333, "learning_rate": 0.00019557912525626885, "loss": 3.7477, "step": 269 }, { "epoch": 0.10223400227186671, "grad_norm": 15.239813804626465, "learning_rate": 0.0001955438115493819, "loss": 4.467, "step": 270 }, { "epoch": 0.10261264672472548, "grad_norm": 17.561635971069336, "learning_rate": 0.00019550836057484003, "loss": 3.9279, "step": 271 }, { "epoch": 0.10299129117758425, "grad_norm": 14.543050765991211, "learning_rate": 0.00019547277238357564, "loss": 3.559, "step": 272 }, { "epoch": 0.10336993563044301, "grad_norm": 14.89653205871582, "learning_rate": 0.0001954370470267182, "loss": 2.3824, "step": 273 }, { "epoch": 0.10374858008330178, "grad_norm": 23.81206703186035, "learning_rate": 0.00019540118455559435, "loss": 3.3979, "step": 274 }, { "epoch": 0.10412722453616055, "grad_norm": 26.980783462524414, "learning_rate": 0.00019536518502172756, "loss": 4.0859, "step": 275 }, { "epoch": 0.10450586898901931, "grad_norm": 13.720477104187012, "learning_rate": 0.00019532904847683832, "loss": 6.7626, "step": 276 }, { "epoch": 0.10488451344187807, "grad_norm": 11.518758773803711, "learning_rate": 0.00019529277497284402, "loss": 5.9555, "step": 277 }, { "epoch": 0.10526315789473684, "grad_norm": 10.95704174041748, "learning_rate": 0.00019525636456185866, "loss": 6.4592, "step": 278 }, { "epoch": 0.10564180234759561, "grad_norm": 10.139583587646484, "learning_rate": 0.0001952198172961931, "loss": 4.9463, "step": 279 }, { "epoch": 0.10602044680045437, "grad_norm": 8.685033798217773, "learning_rate": 0.00019518313322835468, "loss": 4.8444, "step": 280 }, { "epoch": 0.10639909125331314, "grad_norm": 8.559419631958008, "learning_rate": 0.00019514631241104744, "loss": 5.7126, "step": 281 }, { "epoch": 0.10677773570617191, "grad_norm": 9.680383682250977, "learning_rate": 0.0001951093548971717, "loss": 5.0644, "step": 282 }, { "epoch": 0.10715638015903067, "grad_norm": 10.89194393157959, "learning_rate": 0.00019507226073982428, "loss": 4.7752, "step": 283 }, { "epoch": 0.10753502461188943, "grad_norm": 10.400115013122559, "learning_rate": 0.00019503502999229834, "loss": 4.8316, "step": 284 }, { "epoch": 0.1079136690647482, "grad_norm": 11.178241729736328, "learning_rate": 0.0001949976627080832, "loss": 5.167, "step": 285 }, { "epoch": 0.10829231351760697, "grad_norm": 11.463688850402832, "learning_rate": 0.00019496015894086445, "loss": 5.0064, "step": 286 }, { "epoch": 0.10867095797046573, "grad_norm": 12.74399471282959, "learning_rate": 0.00019492251874452364, "loss": 5.8686, "step": 287 }, { "epoch": 0.1090496024233245, "grad_norm": 10.723747253417969, "learning_rate": 0.0001948847421731384, "loss": 4.0739, "step": 288 }, { "epoch": 0.10942824687618326, "grad_norm": 10.196253776550293, "learning_rate": 0.00019484682928098225, "loss": 5.0363, "step": 289 }, { "epoch": 0.10980689132904203, "grad_norm": 14.19273853302002, "learning_rate": 0.00019480878012252464, "loss": 4.8781, "step": 290 }, { "epoch": 0.1101855357819008, "grad_norm": 12.241954803466797, "learning_rate": 0.00019477059475243072, "loss": 5.3741, "step": 291 }, { "epoch": 0.11056418023475956, "grad_norm": 8.766860008239746, "learning_rate": 0.00019473227322556132, "loss": 3.1036, "step": 292 }, { "epoch": 0.11094282468761833, "grad_norm": 13.060121536254883, "learning_rate": 0.00019469381559697295, "loss": 4.9652, "step": 293 }, { "epoch": 0.1113214691404771, "grad_norm": 12.351059913635254, "learning_rate": 0.00019465522192191762, "loss": 4.3625, "step": 294 }, { "epoch": 0.11170011359333586, "grad_norm": 13.481490135192871, "learning_rate": 0.00019461649225584285, "loss": 3.8786, "step": 295 }, { "epoch": 0.11207875804619462, "grad_norm": 14.277658462524414, "learning_rate": 0.00019457762665439144, "loss": 3.3642, "step": 296 }, { "epoch": 0.1124574024990534, "grad_norm": 15.829534530639648, "learning_rate": 0.00019453862517340156, "loss": 3.4833, "step": 297 }, { "epoch": 0.11283604695191216, "grad_norm": 18.78076934814453, "learning_rate": 0.00019449948786890656, "loss": 4.5821, "step": 298 }, { "epoch": 0.11321469140477092, "grad_norm": 19.851030349731445, "learning_rate": 0.000194460214797135, "loss": 3.411, "step": 299 }, { "epoch": 0.11359333585762968, "grad_norm": 38.37712478637695, "learning_rate": 0.00019442080601451042, "loss": 4.709, "step": 300 }, { "epoch": 0.11397198031048845, "grad_norm": 12.775144577026367, "learning_rate": 0.00019438126157765137, "loss": 6.2073, "step": 301 }, { "epoch": 0.11435062476334722, "grad_norm": 12.176517486572266, "learning_rate": 0.00019434158154337127, "loss": 5.3956, "step": 302 }, { "epoch": 0.11472926921620598, "grad_norm": 8.858514785766602, "learning_rate": 0.00019430176596867832, "loss": 4.7154, "step": 303 }, { "epoch": 0.11510791366906475, "grad_norm": 10.479050636291504, "learning_rate": 0.0001942618149107756, "loss": 4.7401, "step": 304 }, { "epoch": 0.11548655812192352, "grad_norm": 8.652934074401855, "learning_rate": 0.00019422172842706065, "loss": 5.2193, "step": 305 }, { "epoch": 0.11586520257478228, "grad_norm": 9.481331825256348, "learning_rate": 0.00019418150657512574, "loss": 4.7876, "step": 306 }, { "epoch": 0.11624384702764104, "grad_norm": 11.53617000579834, "learning_rate": 0.00019414114941275745, "loss": 4.7514, "step": 307 }, { "epoch": 0.11662249148049982, "grad_norm": 9.821549415588379, "learning_rate": 0.00019410065699793693, "loss": 4.9545, "step": 308 }, { "epoch": 0.11700113593335858, "grad_norm": 10.33404541015625, "learning_rate": 0.00019406002938883958, "loss": 4.8945, "step": 309 }, { "epoch": 0.11737978038621734, "grad_norm": 12.466033935546875, "learning_rate": 0.000194019266643835, "loss": 5.2858, "step": 310 }, { "epoch": 0.1177584248390761, "grad_norm": 11.954521179199219, "learning_rate": 0.00019397836882148695, "loss": 5.3408, "step": 311 }, { "epoch": 0.11813706929193488, "grad_norm": 12.428413391113281, "learning_rate": 0.00019393733598055328, "loss": 5.1357, "step": 312 }, { "epoch": 0.11851571374479364, "grad_norm": 11.977699279785156, "learning_rate": 0.00019389616817998582, "loss": 4.8637, "step": 313 }, { "epoch": 0.1188943581976524, "grad_norm": 12.119193077087402, "learning_rate": 0.00019385486547893028, "loss": 4.5933, "step": 314 }, { "epoch": 0.11927300265051118, "grad_norm": 11.337264060974121, "learning_rate": 0.00019381342793672624, "loss": 4.2781, "step": 315 }, { "epoch": 0.11965164710336994, "grad_norm": 10.534137725830078, "learning_rate": 0.00019377185561290689, "loss": 4.0069, "step": 316 }, { "epoch": 0.1200302915562287, "grad_norm": 11.264338493347168, "learning_rate": 0.00019373014856719918, "loss": 4.7428, "step": 317 }, { "epoch": 0.12040893600908746, "grad_norm": 11.423133850097656, "learning_rate": 0.0001936883068595235, "loss": 4.2359, "step": 318 }, { "epoch": 0.12078758046194624, "grad_norm": 14.29877758026123, "learning_rate": 0.00019364633054999383, "loss": 4.4332, "step": 319 }, { "epoch": 0.121166224914805, "grad_norm": 17.930879592895508, "learning_rate": 0.00019360421969891745, "loss": 5.1328, "step": 320 }, { "epoch": 0.12154486936766376, "grad_norm": 14.796630859375, "learning_rate": 0.00019356197436679496, "loss": 2.7526, "step": 321 }, { "epoch": 0.12192351382052254, "grad_norm": 21.729623794555664, "learning_rate": 0.00019351959461432015, "loss": 3.8724, "step": 322 }, { "epoch": 0.1223021582733813, "grad_norm": 28.3909912109375, "learning_rate": 0.00019347708050237997, "loss": 4.0754, "step": 323 }, { "epoch": 0.12268080272624006, "grad_norm": 23.190147399902344, "learning_rate": 0.00019343443209205436, "loss": 3.1569, "step": 324 }, { "epoch": 0.12305944717909882, "grad_norm": 26.0140380859375, "learning_rate": 0.00019339164944461628, "loss": 4.3824, "step": 325 }, { "epoch": 0.1234380916319576, "grad_norm": 9.705833435058594, "learning_rate": 0.00019334873262153143, "loss": 5.5853, "step": 326 }, { "epoch": 0.12381673608481636, "grad_norm": 9.483610153198242, "learning_rate": 0.0001933056816844584, "loss": 6.2104, "step": 327 }, { "epoch": 0.12419538053767512, "grad_norm": 9.369464874267578, "learning_rate": 0.00019326249669524836, "loss": 5.1741, "step": 328 }, { "epoch": 0.12457402499053388, "grad_norm": 8.173629760742188, "learning_rate": 0.0001932191777159452, "loss": 5.8212, "step": 329 }, { "epoch": 0.12495266944339266, "grad_norm": 8.955101013183594, "learning_rate": 0.00019317572480878514, "loss": 4.8004, "step": 330 }, { "epoch": 0.1253313138962514, "grad_norm": 9.041908264160156, "learning_rate": 0.00019313213803619697, "loss": 5.6992, "step": 331 }, { "epoch": 0.1257099583491102, "grad_norm": 10.43190860748291, "learning_rate": 0.00019308841746080172, "loss": 4.9818, "step": 332 }, { "epoch": 0.12608860280196896, "grad_norm": 10.404027938842773, "learning_rate": 0.0001930445631454127, "loss": 5.1713, "step": 333 }, { "epoch": 0.12646724725482772, "grad_norm": 11.429192543029785, "learning_rate": 0.0001930005751530353, "loss": 4.9948, "step": 334 }, { "epoch": 0.12684589170768648, "grad_norm": 11.61748218536377, "learning_rate": 0.00019295645354686704, "loss": 4.8311, "step": 335 }, { "epoch": 0.12722453616054524, "grad_norm": 12.844706535339355, "learning_rate": 0.00019291219839029735, "loss": 5.2656, "step": 336 }, { "epoch": 0.127603180613404, "grad_norm": 11.768765449523926, "learning_rate": 0.00019286780974690754, "loss": 5.8559, "step": 337 }, { "epoch": 0.12798182506626277, "grad_norm": 11.951184272766113, "learning_rate": 0.00019282328768047076, "loss": 5.5301, "step": 338 }, { "epoch": 0.12836046951912156, "grad_norm": 11.188042640686035, "learning_rate": 0.0001927786322549517, "loss": 5.3409, "step": 339 }, { "epoch": 0.12873911397198032, "grad_norm": 12.103419303894043, "learning_rate": 0.00019273384353450687, "loss": 5.1385, "step": 340 }, { "epoch": 0.12911775842483908, "grad_norm": 13.991079330444336, "learning_rate": 0.00019268892158348408, "loss": 4.6209, "step": 341 }, { "epoch": 0.12949640287769784, "grad_norm": 31.928539276123047, "learning_rate": 0.00019264386646642266, "loss": 5.2701, "step": 342 }, { "epoch": 0.1298750473305566, "grad_norm": 12.100227355957031, "learning_rate": 0.00019259867824805317, "loss": 4.0977, "step": 343 }, { "epoch": 0.13025369178341537, "grad_norm": 26.673351287841797, "learning_rate": 0.00019255335699329754, "loss": 4.4181, "step": 344 }, { "epoch": 0.13063233623627413, "grad_norm": 12.389229774475098, "learning_rate": 0.0001925079027672687, "loss": 4.243, "step": 345 }, { "epoch": 0.13101098068913292, "grad_norm": 12.767072677612305, "learning_rate": 0.0001924623156352707, "loss": 4.1363, "step": 346 }, { "epoch": 0.13138962514199168, "grad_norm": 13.129895210266113, "learning_rate": 0.00019241659566279851, "loss": 3.5315, "step": 347 }, { "epoch": 0.13176826959485044, "grad_norm": 19.601654052734375, "learning_rate": 0.00019237074291553793, "loss": 3.4685, "step": 348 }, { "epoch": 0.1321469140477092, "grad_norm": 29.587953567504883, "learning_rate": 0.00019232475745936548, "loss": 3.344, "step": 349 }, { "epoch": 0.13252555850056796, "grad_norm": 19.867443084716797, "learning_rate": 0.00019227863936034848, "loss": 2.5789, "step": 350 }, { "epoch": 0.13290420295342673, "grad_norm": 11.518388748168945, "learning_rate": 0.00019223238868474476, "loss": 5.9315, "step": 351 }, { "epoch": 0.1332828474062855, "grad_norm": 10.65963363647461, "learning_rate": 0.0001921860054990025, "loss": 6.3699, "step": 352 }, { "epoch": 0.13366149185914428, "grad_norm": 11.396893501281738, "learning_rate": 0.0001921394898697604, "loss": 5.3195, "step": 353 }, { "epoch": 0.13404013631200304, "grad_norm": 9.930930137634277, "learning_rate": 0.00019209284186384742, "loss": 5.6726, "step": 354 }, { "epoch": 0.1344187807648618, "grad_norm": 8.916218757629395, "learning_rate": 0.00019204606154828264, "loss": 4.7663, "step": 355 }, { "epoch": 0.13479742521772056, "grad_norm": 9.720449447631836, "learning_rate": 0.00019199914899027532, "loss": 4.8931, "step": 356 }, { "epoch": 0.13517606967057932, "grad_norm": 9.971704483032227, "learning_rate": 0.00019195210425722463, "loss": 5.7539, "step": 357 }, { "epoch": 0.13555471412343809, "grad_norm": 11.575483322143555, "learning_rate": 0.00019190492741671968, "loss": 4.9698, "step": 358 }, { "epoch": 0.13593335857629685, "grad_norm": 10.350571632385254, "learning_rate": 0.00019185761853653935, "loss": 5.9123, "step": 359 }, { "epoch": 0.1363120030291556, "grad_norm": 12.966808319091797, "learning_rate": 0.00019181017768465225, "loss": 4.678, "step": 360 }, { "epoch": 0.1366906474820144, "grad_norm": 11.714643478393555, "learning_rate": 0.0001917626049292166, "loss": 5.3699, "step": 361 }, { "epoch": 0.13706929193487316, "grad_norm": 14.007036209106445, "learning_rate": 0.00019171490033858009, "loss": 5.6013, "step": 362 }, { "epoch": 0.13744793638773192, "grad_norm": 14.195833206176758, "learning_rate": 0.00019166706398127985, "loss": 5.4985, "step": 363 }, { "epoch": 0.13782658084059068, "grad_norm": 11.887788772583008, "learning_rate": 0.0001916190959260423, "loss": 4.0214, "step": 364 }, { "epoch": 0.13820522529344945, "grad_norm": 12.19540023803711, "learning_rate": 0.00019157099624178306, "loss": 4.935, "step": 365 }, { "epoch": 0.1385838697463082, "grad_norm": 12.359858512878418, "learning_rate": 0.0001915227649976069, "loss": 3.9815, "step": 366 }, { "epoch": 0.13896251419916697, "grad_norm": 11.449932098388672, "learning_rate": 0.00019147440226280753, "loss": 4.8552, "step": 367 }, { "epoch": 0.13934115865202576, "grad_norm": 14.259176254272461, "learning_rate": 0.0001914259081068677, "loss": 4.586, "step": 368 }, { "epoch": 0.13971980310488452, "grad_norm": 12.771395683288574, "learning_rate": 0.00019137728259945882, "loss": 4.048, "step": 369 }, { "epoch": 0.14009844755774328, "grad_norm": 15.282382011413574, "learning_rate": 0.00019132852581044114, "loss": 4.755, "step": 370 }, { "epoch": 0.14047709201060204, "grad_norm": 17.148212432861328, "learning_rate": 0.0001912796378098634, "loss": 4.2456, "step": 371 }, { "epoch": 0.1408557364634608, "grad_norm": 16.571382522583008, "learning_rate": 0.00019123061866796302, "loss": 3.6083, "step": 372 }, { "epoch": 0.14123438091631957, "grad_norm": 26.597492218017578, "learning_rate": 0.00019118146845516562, "loss": 2.7945, "step": 373 }, { "epoch": 0.14161302536917833, "grad_norm": 22.75865936279297, "learning_rate": 0.00019113218724208533, "loss": 3.9925, "step": 374 }, { "epoch": 0.14199166982203712, "grad_norm": 30.639041900634766, "learning_rate": 0.00019108277509952433, "loss": 4.4992, "step": 375 }, { "epoch": 0.14237031427489588, "grad_norm": 12.82249927520752, "learning_rate": 0.00019103323209847305, "loss": 5.3655, "step": 376 }, { "epoch": 0.14274895872775464, "grad_norm": 15.458394050598145, "learning_rate": 0.00019098355831010974, "loss": 5.8707, "step": 377 }, { "epoch": 0.1431276031806134, "grad_norm": 13.276158332824707, "learning_rate": 0.00019093375380580075, "loss": 5.574, "step": 378 }, { "epoch": 0.14350624763347217, "grad_norm": 12.430033683776855, "learning_rate": 0.00019088381865710007, "loss": 4.9323, "step": 379 }, { "epoch": 0.14388489208633093, "grad_norm": 10.998027801513672, "learning_rate": 0.0001908337529357495, "loss": 5.8475, "step": 380 }, { "epoch": 0.1442635365391897, "grad_norm": 11.086531639099121, "learning_rate": 0.00019078355671367842, "loss": 5.1857, "step": 381 }, { "epoch": 0.14464218099204848, "grad_norm": 10.455646514892578, "learning_rate": 0.00019073323006300362, "loss": 5.0082, "step": 382 }, { "epoch": 0.14502082544490724, "grad_norm": 13.326669692993164, "learning_rate": 0.00019068277305602936, "loss": 5.4143, "step": 383 }, { "epoch": 0.145399469897766, "grad_norm": 10.433615684509277, "learning_rate": 0.00019063218576524706, "loss": 4.5149, "step": 384 }, { "epoch": 0.14577811435062477, "grad_norm": 11.804021835327148, "learning_rate": 0.00019058146826333552, "loss": 4.7288, "step": 385 }, { "epoch": 0.14615675880348353, "grad_norm": 12.542786598205566, "learning_rate": 0.00019053062062316043, "loss": 5.7741, "step": 386 }, { "epoch": 0.1465354032563423, "grad_norm": 10.54061222076416, "learning_rate": 0.00019047964291777456, "loss": 4.7578, "step": 387 }, { "epoch": 0.14691404770920105, "grad_norm": 12.497821807861328, "learning_rate": 0.0001904285352204175, "loss": 4.6442, "step": 388 }, { "epoch": 0.1472926921620598, "grad_norm": 11.979016304016113, "learning_rate": 0.00019037729760451556, "loss": 3.495, "step": 389 }, { "epoch": 0.1476713366149186, "grad_norm": 13.17095947265625, "learning_rate": 0.0001903259301436818, "loss": 4.0682, "step": 390 }, { "epoch": 0.14804998106777736, "grad_norm": 16.20962905883789, "learning_rate": 0.00019027443291171574, "loss": 5.0638, "step": 391 }, { "epoch": 0.14842862552063613, "grad_norm": 12.081317901611328, "learning_rate": 0.0001902228059826034, "loss": 4.3665, "step": 392 }, { "epoch": 0.1488072699734949, "grad_norm": 15.122051239013672, "learning_rate": 0.0001901710494305171, "loss": 5.2251, "step": 393 }, { "epoch": 0.14918591442635365, "grad_norm": 12.867919921875, "learning_rate": 0.00019011916332981548, "loss": 3.2506, "step": 394 }, { "epoch": 0.1495645588792124, "grad_norm": 14.751724243164062, "learning_rate": 0.00019006714775504307, "loss": 4.2651, "step": 395 }, { "epoch": 0.14994320333207117, "grad_norm": 15.070958137512207, "learning_rate": 0.0001900150027809307, "loss": 3.4987, "step": 396 }, { "epoch": 0.15032184778492996, "grad_norm": 17.090017318725586, "learning_rate": 0.00018996272848239494, "loss": 4.3325, "step": 397 }, { "epoch": 0.15070049223778872, "grad_norm": 21.952957153320312, "learning_rate": 0.0001899103249345382, "loss": 3.5858, "step": 398 }, { "epoch": 0.1510791366906475, "grad_norm": 26.79568099975586, "learning_rate": 0.00018985779221264854, "loss": 4.2193, "step": 399 }, { "epoch": 0.15145778114350625, "grad_norm": 49.78997039794922, "learning_rate": 0.00018980513039219973, "loss": 5.5597, "step": 400 }, { "epoch": 0.151836425596365, "grad_norm": 8.66366195678711, "learning_rate": 0.00018975233954885082, "loss": 5.6595, "step": 401 }, { "epoch": 0.15221507004922377, "grad_norm": 9.13800048828125, "learning_rate": 0.00018969941975844644, "loss": 5.9827, "step": 402 }, { "epoch": 0.15259371450208253, "grad_norm": 9.065213203430176, "learning_rate": 0.00018964637109701636, "loss": 4.9153, "step": 403 }, { "epoch": 0.15297235895494132, "grad_norm": 9.054080963134766, "learning_rate": 0.00018959319364077545, "loss": 5.0867, "step": 404 }, { "epoch": 0.15335100340780008, "grad_norm": 9.656022071838379, "learning_rate": 0.00018953988746612372, "loss": 4.3978, "step": 405 }, { "epoch": 0.15372964786065885, "grad_norm": 9.716662406921387, "learning_rate": 0.00018948645264964609, "loss": 5.8155, "step": 406 }, { "epoch": 0.1541082923135176, "grad_norm": 10.407852172851562, "learning_rate": 0.00018943288926811226, "loss": 4.8687, "step": 407 }, { "epoch": 0.15448693676637637, "grad_norm": 11.507822036743164, "learning_rate": 0.0001893791973984767, "loss": 5.4475, "step": 408 }, { "epoch": 0.15486558121923513, "grad_norm": 10.88329029083252, "learning_rate": 0.0001893253771178784, "loss": 4.3342, "step": 409 }, { "epoch": 0.1552442256720939, "grad_norm": 10.991971015930176, "learning_rate": 0.00018927142850364088, "loss": 5.6984, "step": 410 }, { "epoch": 0.15562287012495266, "grad_norm": 11.597615242004395, "learning_rate": 0.00018921735163327205, "loss": 4.9601, "step": 411 }, { "epoch": 0.15600151457781145, "grad_norm": 9.658888816833496, "learning_rate": 0.0001891631465844641, "loss": 3.6239, "step": 412 }, { "epoch": 0.1563801590306702, "grad_norm": 14.736695289611816, "learning_rate": 0.00018910881343509327, "loss": 5.292, "step": 413 }, { "epoch": 0.15675880348352897, "grad_norm": 12.756696701049805, "learning_rate": 0.00018905435226322, "loss": 4.8917, "step": 414 }, { "epoch": 0.15713744793638773, "grad_norm": 11.116317749023438, "learning_rate": 0.0001889997631470885, "loss": 4.5305, "step": 415 }, { "epoch": 0.1575160923892465, "grad_norm": 14.593806266784668, "learning_rate": 0.0001889450461651269, "loss": 5.3616, "step": 416 }, { "epoch": 0.15789473684210525, "grad_norm": 12.08388614654541, "learning_rate": 0.00018889020139594705, "loss": 3.9476, "step": 417 }, { "epoch": 0.15827338129496402, "grad_norm": 13.409226417541504, "learning_rate": 0.00018883522891834434, "loss": 4.7586, "step": 418 }, { "epoch": 0.1586520257478228, "grad_norm": 15.655117988586426, "learning_rate": 0.00018878012881129758, "loss": 5.0882, "step": 419 }, { "epoch": 0.15903067020068157, "grad_norm": 13.614988327026367, "learning_rate": 0.00018872490115396908, "loss": 4.4696, "step": 420 }, { "epoch": 0.15940931465354033, "grad_norm": 13.35642147064209, "learning_rate": 0.0001886695460257043, "loss": 3.1605, "step": 421 }, { "epoch": 0.1597879591063991, "grad_norm": 21.85063934326172, "learning_rate": 0.0001886140635060319, "loss": 5.235, "step": 422 }, { "epoch": 0.16016660355925785, "grad_norm": 21.174924850463867, "learning_rate": 0.00018855845367466353, "loss": 4.3507, "step": 423 }, { "epoch": 0.16054524801211661, "grad_norm": 17.3688907623291, "learning_rate": 0.00018850271661149376, "loss": 2.2297, "step": 424 }, { "epoch": 0.16092389246497538, "grad_norm": 39.54337692260742, "learning_rate": 0.00018844685239659988, "loss": 2.7965, "step": 425 }, { "epoch": 0.16130253691783417, "grad_norm": 6.966166973114014, "learning_rate": 0.00018839086111024204, "loss": 4.994, "step": 426 }, { "epoch": 0.16168118137069293, "grad_norm": 8.462141036987305, "learning_rate": 0.00018833474283286273, "loss": 5.645, "step": 427 }, { "epoch": 0.1620598258235517, "grad_norm": 9.555349349975586, "learning_rate": 0.00018827849764508706, "loss": 4.6212, "step": 428 }, { "epoch": 0.16243847027641045, "grad_norm": 9.305408477783203, "learning_rate": 0.00018822212562772238, "loss": 4.4289, "step": 429 }, { "epoch": 0.1628171147292692, "grad_norm": 9.897664070129395, "learning_rate": 0.00018816562686175834, "loss": 5.1709, "step": 430 }, { "epoch": 0.16319575918212798, "grad_norm": 10.320230484008789, "learning_rate": 0.0001881090014283666, "loss": 5.0989, "step": 431 }, { "epoch": 0.16357440363498674, "grad_norm": 10.196443557739258, "learning_rate": 0.0001880522494089008, "loss": 5.5656, "step": 432 }, { "epoch": 0.16395304808784553, "grad_norm": 9.999099731445312, "learning_rate": 0.00018799537088489654, "loss": 5.2194, "step": 433 }, { "epoch": 0.1643316925407043, "grad_norm": 10.33040714263916, "learning_rate": 0.0001879383659380711, "loss": 5.2323, "step": 434 }, { "epoch": 0.16471033699356305, "grad_norm": 11.10696792602539, "learning_rate": 0.00018788123465032335, "loss": 4.6551, "step": 435 }, { "epoch": 0.1650889814464218, "grad_norm": 11.029682159423828, "learning_rate": 0.00018782397710373377, "loss": 5.0993, "step": 436 }, { "epoch": 0.16546762589928057, "grad_norm": 9.93604850769043, "learning_rate": 0.00018776659338056427, "loss": 3.7054, "step": 437 }, { "epoch": 0.16584627035213934, "grad_norm": 12.357624053955078, "learning_rate": 0.00018770908356325784, "loss": 4.4637, "step": 438 }, { "epoch": 0.1662249148049981, "grad_norm": 13.088432312011719, "learning_rate": 0.00018765144773443877, "loss": 4.3871, "step": 439 }, { "epoch": 0.16660355925785686, "grad_norm": 11.933065414428711, "learning_rate": 0.00018759368597691243, "loss": 4.3666, "step": 440 }, { "epoch": 0.16698220371071565, "grad_norm": 13.432201385498047, "learning_rate": 0.00018753579837366502, "loss": 3.8948, "step": 441 }, { "epoch": 0.1673608481635744, "grad_norm": 15.856587409973145, "learning_rate": 0.00018747778500786358, "loss": 4.3036, "step": 442 }, { "epoch": 0.16773949261643317, "grad_norm": 16.909637451171875, "learning_rate": 0.00018741964596285583, "loss": 4.5476, "step": 443 }, { "epoch": 0.16811813706929193, "grad_norm": 17.030946731567383, "learning_rate": 0.00018736138132217003, "loss": 3.0895, "step": 444 }, { "epoch": 0.1684967815221507, "grad_norm": 17.348941802978516, "learning_rate": 0.00018730299116951493, "loss": 4.9647, "step": 445 }, { "epoch": 0.16887542597500946, "grad_norm": 14.687129020690918, "learning_rate": 0.00018724447558877958, "loss": 2.7539, "step": 446 }, { "epoch": 0.16925407042786822, "grad_norm": 17.83780288696289, "learning_rate": 0.0001871858346640332, "loss": 3.5633, "step": 447 }, { "epoch": 0.169632714880727, "grad_norm": 30.752113342285156, "learning_rate": 0.00018712706847952515, "loss": 4.545, "step": 448 }, { "epoch": 0.17001135933358577, "grad_norm": 16.965770721435547, "learning_rate": 0.00018706817711968473, "loss": 2.9911, "step": 449 }, { "epoch": 0.17039000378644453, "grad_norm": 32.481101989746094, "learning_rate": 0.00018700916066912102, "loss": 2.4111, "step": 450 }, { "epoch": 0.1707686482393033, "grad_norm": 8.298652648925781, "learning_rate": 0.00018695001921262288, "loss": 6.4388, "step": 451 }, { "epoch": 0.17114729269216206, "grad_norm": 8.251225471496582, "learning_rate": 0.00018689075283515882, "loss": 5.6452, "step": 452 }, { "epoch": 0.17152593714502082, "grad_norm": 8.718252182006836, "learning_rate": 0.0001868313616218767, "loss": 5.2732, "step": 453 }, { "epoch": 0.17190458159787958, "grad_norm": 9.202814102172852, "learning_rate": 0.00018677184565810378, "loss": 5.1873, "step": 454 }, { "epoch": 0.17228322605073837, "grad_norm": 10.298333168029785, "learning_rate": 0.00018671220502934662, "loss": 5.4461, "step": 455 }, { "epoch": 0.17266187050359713, "grad_norm": 9.323832511901855, "learning_rate": 0.00018665243982129076, "loss": 5.1539, "step": 456 }, { "epoch": 0.1730405149564559, "grad_norm": 9.709290504455566, "learning_rate": 0.00018659255011980083, "loss": 4.5545, "step": 457 }, { "epoch": 0.17341915940931465, "grad_norm": 12.013558387756348, "learning_rate": 0.00018653253601092027, "loss": 4.8025, "step": 458 }, { "epoch": 0.17379780386217342, "grad_norm": 10.444851875305176, "learning_rate": 0.00018647239758087122, "loss": 4.3912, "step": 459 }, { "epoch": 0.17417644831503218, "grad_norm": 11.680195808410645, "learning_rate": 0.00018641213491605454, "loss": 4.0869, "step": 460 }, { "epoch": 0.17455509276789094, "grad_norm": 13.132681846618652, "learning_rate": 0.00018635174810304944, "loss": 3.7436, "step": 461 }, { "epoch": 0.17493373722074973, "grad_norm": 11.139749526977539, "learning_rate": 0.00018629123722861365, "loss": 3.2608, "step": 462 }, { "epoch": 0.1753123816736085, "grad_norm": 14.833144187927246, "learning_rate": 0.00018623060237968298, "loss": 4.9499, "step": 463 }, { "epoch": 0.17569102612646725, "grad_norm": 12.821535110473633, "learning_rate": 0.00018616984364337147, "loss": 4.1431, "step": 464 }, { "epoch": 0.17606967057932602, "grad_norm": 12.597731590270996, "learning_rate": 0.00018610896110697112, "loss": 4.4357, "step": 465 }, { "epoch": 0.17644831503218478, "grad_norm": 10.909318923950195, "learning_rate": 0.00018604795485795174, "loss": 4.1517, "step": 466 }, { "epoch": 0.17682695948504354, "grad_norm": 13.418599128723145, "learning_rate": 0.00018598682498396096, "loss": 4.525, "step": 467 }, { "epoch": 0.1772056039379023, "grad_norm": 14.079731941223145, "learning_rate": 0.00018592557157282393, "loss": 4.8267, "step": 468 }, { "epoch": 0.17758424839076106, "grad_norm": 13.757750511169434, "learning_rate": 0.00018586419471254337, "loss": 3.4362, "step": 469 }, { "epoch": 0.17796289284361985, "grad_norm": 14.358112335205078, "learning_rate": 0.00018580269449129934, "loss": 3.3671, "step": 470 }, { "epoch": 0.1783415372964786, "grad_norm": 15.010443687438965, "learning_rate": 0.0001857410709974491, "loss": 2.8819, "step": 471 }, { "epoch": 0.17872018174933738, "grad_norm": 16.135650634765625, "learning_rate": 0.00018567932431952703, "loss": 3.2814, "step": 472 }, { "epoch": 0.17909882620219614, "grad_norm": 17.758377075195312, "learning_rate": 0.00018561745454624448, "loss": 3.9894, "step": 473 }, { "epoch": 0.1794774706550549, "grad_norm": 18.282501220703125, "learning_rate": 0.00018555546176648972, "loss": 2.9159, "step": 474 }, { "epoch": 0.17985611510791366, "grad_norm": 59.88269805908203, "learning_rate": 0.00018549334606932763, "loss": 3.7333, "step": 475 }, { "epoch": 0.18023475956077242, "grad_norm": 8.664223670959473, "learning_rate": 0.00018543110754399975, "loss": 5.3577, "step": 476 }, { "epoch": 0.1806134040136312, "grad_norm": 9.118324279785156, "learning_rate": 0.00018536874627992408, "loss": 5.3931, "step": 477 }, { "epoch": 0.18099204846648997, "grad_norm": 8.87546157836914, "learning_rate": 0.00018530626236669498, "loss": 4.8429, "step": 478 }, { "epoch": 0.18137069291934874, "grad_norm": 9.68464183807373, "learning_rate": 0.00018524365589408297, "loss": 5.464, "step": 479 }, { "epoch": 0.1817493373722075, "grad_norm": 10.263278007507324, "learning_rate": 0.0001851809269520347, "loss": 4.618, "step": 480 }, { "epoch": 0.18212798182506626, "grad_norm": 11.374741554260254, "learning_rate": 0.00018511807563067274, "loss": 5.1748, "step": 481 }, { "epoch": 0.18250662627792502, "grad_norm": 8.904349327087402, "learning_rate": 0.00018505510202029547, "loss": 5.1542, "step": 482 }, { "epoch": 0.18288527073078378, "grad_norm": 9.953553199768066, "learning_rate": 0.00018499200621137701, "loss": 4.3502, "step": 483 }, { "epoch": 0.18326391518364257, "grad_norm": 12.080293655395508, "learning_rate": 0.00018492878829456702, "loss": 4.9385, "step": 484 }, { "epoch": 0.18364255963650133, "grad_norm": 15.62290096282959, "learning_rate": 0.00018486544836069063, "loss": 4.9754, "step": 485 }, { "epoch": 0.1840212040893601, "grad_norm": 15.642521858215332, "learning_rate": 0.00018480198650074812, "loss": 5.3047, "step": 486 }, { "epoch": 0.18439984854221886, "grad_norm": 11.466191291809082, "learning_rate": 0.00018473840280591513, "loss": 3.8782, "step": 487 }, { "epoch": 0.18477849299507762, "grad_norm": 13.736114501953125, "learning_rate": 0.00018467469736754225, "loss": 4.5983, "step": 488 }, { "epoch": 0.18515713744793638, "grad_norm": 13.423456192016602, "learning_rate": 0.00018461087027715498, "loss": 5.3427, "step": 489 }, { "epoch": 0.18553578190079514, "grad_norm": 12.900038719177246, "learning_rate": 0.00018454692162645363, "loss": 4.3188, "step": 490 }, { "epoch": 0.1859144263536539, "grad_norm": 10.984456062316895, "learning_rate": 0.0001844828515073131, "loss": 3.9218, "step": 491 }, { "epoch": 0.1862930708065127, "grad_norm": 14.656373977661133, "learning_rate": 0.00018441866001178285, "loss": 4.7434, "step": 492 }, { "epoch": 0.18667171525937146, "grad_norm": 12.752568244934082, "learning_rate": 0.00018435434723208674, "loss": 4.8496, "step": 493 }, { "epoch": 0.18705035971223022, "grad_norm": 11.184649467468262, "learning_rate": 0.0001842899132606228, "loss": 2.916, "step": 494 }, { "epoch": 0.18742900416508898, "grad_norm": 14.828624725341797, "learning_rate": 0.0001842253581899632, "loss": 3.9765, "step": 495 }, { "epoch": 0.18780764861794774, "grad_norm": 20.33783721923828, "learning_rate": 0.0001841606821128542, "loss": 4.2373, "step": 496 }, { "epoch": 0.1881862930708065, "grad_norm": 15.583966255187988, "learning_rate": 0.0001840958851222158, "loss": 4.2481, "step": 497 }, { "epoch": 0.18856493752366527, "grad_norm": 17.68903160095215, "learning_rate": 0.0001840309673111417, "loss": 2.7633, "step": 498 }, { "epoch": 0.18894358197652406, "grad_norm": 13.663451194763184, "learning_rate": 0.00018396592877289926, "loss": 1.3758, "step": 499 }, { "epoch": 0.18932222642938282, "grad_norm": 20.513288497924805, "learning_rate": 0.00018390076960092926, "loss": 2.9705, "step": 500 }, { "epoch": 0.18970087088224158, "grad_norm": 8.191598892211914, "learning_rate": 0.00018383548988884575, "loss": 5.3532, "step": 501 }, { "epoch": 0.19007951533510034, "grad_norm": 8.629252433776855, "learning_rate": 0.000183770089730436, "loss": 4.9063, "step": 502 }, { "epoch": 0.1904581597879591, "grad_norm": 10.426155090332031, "learning_rate": 0.0001837045692196604, "loss": 5.3042, "step": 503 }, { "epoch": 0.19083680424081786, "grad_norm": 9.645724296569824, "learning_rate": 0.00018363892845065207, "loss": 5.0665, "step": 504 }, { "epoch": 0.19121544869367663, "grad_norm": 13.083929061889648, "learning_rate": 0.00018357316751771704, "loss": 5.6006, "step": 505 }, { "epoch": 0.19159409314653542, "grad_norm": 11.490813255310059, "learning_rate": 0.00018350728651533396, "loss": 5.5563, "step": 506 }, { "epoch": 0.19197273759939418, "grad_norm": 9.692631721496582, "learning_rate": 0.00018344128553815397, "loss": 4.7903, "step": 507 }, { "epoch": 0.19235138205225294, "grad_norm": 10.534561157226562, "learning_rate": 0.0001833751646810006, "loss": 4.777, "step": 508 }, { "epoch": 0.1927300265051117, "grad_norm": 10.0263090133667, "learning_rate": 0.00018330892403886954, "loss": 4.7177, "step": 509 }, { "epoch": 0.19310867095797046, "grad_norm": 10.845623970031738, "learning_rate": 0.00018324256370692867, "loss": 4.4374, "step": 510 }, { "epoch": 0.19348731541082922, "grad_norm": 11.177064895629883, "learning_rate": 0.00018317608378051774, "loss": 4.9647, "step": 511 }, { "epoch": 0.193865959863688, "grad_norm": 13.082832336425781, "learning_rate": 0.00018310948435514842, "loss": 4.6881, "step": 512 }, { "epoch": 0.19424460431654678, "grad_norm": 13.74007797241211, "learning_rate": 0.00018304276552650394, "loss": 4.1824, "step": 513 }, { "epoch": 0.19462324876940554, "grad_norm": 12.836389541625977, "learning_rate": 0.00018297592739043917, "loss": 4.5745, "step": 514 }, { "epoch": 0.1950018932222643, "grad_norm": 13.968857765197754, "learning_rate": 0.00018290897004298037, "loss": 3.7023, "step": 515 }, { "epoch": 0.19538053767512306, "grad_norm": 16.355371475219727, "learning_rate": 0.00018284189358032507, "loss": 4.8914, "step": 516 }, { "epoch": 0.19575918212798182, "grad_norm": 12.298672676086426, "learning_rate": 0.0001827746980988419, "loss": 4.1704, "step": 517 }, { "epoch": 0.19613782658084059, "grad_norm": 12.946645736694336, "learning_rate": 0.00018270738369507056, "loss": 3.8664, "step": 518 }, { "epoch": 0.19651647103369935, "grad_norm": 12.944162368774414, "learning_rate": 0.00018263995046572152, "loss": 4.1455, "step": 519 }, { "epoch": 0.1968951154865581, "grad_norm": 13.202437400817871, "learning_rate": 0.00018257239850767598, "loss": 3.9128, "step": 520 }, { "epoch": 0.1972737599394169, "grad_norm": 15.04906940460205, "learning_rate": 0.00018250472791798576, "loss": 3.9912, "step": 521 }, { "epoch": 0.19765240439227566, "grad_norm": 15.138761520385742, "learning_rate": 0.00018243693879387314, "loss": 2.8543, "step": 522 }, { "epoch": 0.19803104884513442, "grad_norm": 20.136871337890625, "learning_rate": 0.00018236903123273058, "loss": 3.1619, "step": 523 }, { "epoch": 0.19840969329799318, "grad_norm": 17.752416610717773, "learning_rate": 0.00018230100533212084, "loss": 2.2304, "step": 524 }, { "epoch": 0.19878833775085195, "grad_norm": 40.383480072021484, "learning_rate": 0.00018223286118977664, "loss": 3.2921, "step": 525 }, { "epoch": 0.1991669822037107, "grad_norm": 7.091005802154541, "learning_rate": 0.0001821645989036005, "loss": 4.928, "step": 526 }, { "epoch": 0.19954562665656947, "grad_norm": 7.8941569328308105, "learning_rate": 0.00018209621857166475, "loss": 5.0736, "step": 527 }, { "epoch": 0.19992427110942826, "grad_norm": 8.998451232910156, "learning_rate": 0.0001820277202922114, "loss": 5.0461, "step": 528 }, { "epoch": 0.20030291556228702, "grad_norm": 9.947563171386719, "learning_rate": 0.00018195910416365173, "loss": 5.4385, "step": 529 }, { "epoch": 0.20068156001514578, "grad_norm": 9.720036506652832, "learning_rate": 0.00018189037028456653, "loss": 5.0625, "step": 530 }, { "epoch": 0.20106020446800454, "grad_norm": 10.58145523071289, "learning_rate": 0.00018182151875370558, "loss": 4.9718, "step": 531 }, { "epoch": 0.2014388489208633, "grad_norm": 11.047203063964844, "learning_rate": 0.0001817525496699878, "loss": 4.5654, "step": 532 }, { "epoch": 0.20181749337372207, "grad_norm": 10.208518981933594, "learning_rate": 0.00018168346313250097, "loss": 4.5213, "step": 533 }, { "epoch": 0.20219613782658083, "grad_norm": 11.760814666748047, "learning_rate": 0.00018161425924050165, "loss": 4.5826, "step": 534 }, { "epoch": 0.20257478227943962, "grad_norm": 9.820549011230469, "learning_rate": 0.00018154493809341494, "loss": 4.1414, "step": 535 }, { "epoch": 0.20295342673229838, "grad_norm": 9.911032676696777, "learning_rate": 0.00018147549979083443, "loss": 4.395, "step": 536 }, { "epoch": 0.20333207118515714, "grad_norm": 11.928678512573242, "learning_rate": 0.00018140594443252203, "loss": 4.5902, "step": 537 }, { "epoch": 0.2037107156380159, "grad_norm": 11.273340225219727, "learning_rate": 0.00018133627211840784, "loss": 4.099, "step": 538 }, { "epoch": 0.20408936009087467, "grad_norm": 12.365876197814941, "learning_rate": 0.00018126648294858994, "loss": 4.9772, "step": 539 }, { "epoch": 0.20446800454373343, "grad_norm": 13.742144584655762, "learning_rate": 0.00018119657702333436, "loss": 4.0028, "step": 540 }, { "epoch": 0.2048466489965922, "grad_norm": 12.48265266418457, "learning_rate": 0.00018112655444307485, "loss": 4.1452, "step": 541 }, { "epoch": 0.20522529344945095, "grad_norm": 14.862174034118652, "learning_rate": 0.0001810564153084127, "loss": 3.7215, "step": 542 }, { "epoch": 0.20560393790230974, "grad_norm": 14.677619934082031, "learning_rate": 0.00018098615972011675, "loss": 3.5887, "step": 543 }, { "epoch": 0.2059825823551685, "grad_norm": 12.201497077941895, "learning_rate": 0.00018091578777912307, "loss": 2.6873, "step": 544 }, { "epoch": 0.20636122680802726, "grad_norm": 14.59697151184082, "learning_rate": 0.00018084529958653492, "loss": 3.5864, "step": 545 }, { "epoch": 0.20673987126088603, "grad_norm": 26.02948760986328, "learning_rate": 0.00018077469524362263, "loss": 3.8935, "step": 546 }, { "epoch": 0.2071185157137448, "grad_norm": 18.998506546020508, "learning_rate": 0.0001807039748518233, "loss": 2.6402, "step": 547 }, { "epoch": 0.20749716016660355, "grad_norm": 25.073461532592773, "learning_rate": 0.00018063313851274089, "loss": 2.7662, "step": 548 }, { "epoch": 0.2078758046194623, "grad_norm": 23.39455223083496, "learning_rate": 0.00018056218632814575, "loss": 3.1726, "step": 549 }, { "epoch": 0.2082544490723211, "grad_norm": 27.59321403503418, "learning_rate": 0.0001804911183999749, "loss": 2.3217, "step": 550 }, { "epoch": 0.20863309352517986, "grad_norm": 8.587485313415527, "learning_rate": 0.00018041993483033144, "loss": 4.8921, "step": 551 }, { "epoch": 0.20901173797803863, "grad_norm": 10.552020072937012, "learning_rate": 0.00018034863572148475, "loss": 5.781, "step": 552 }, { "epoch": 0.2093903824308974, "grad_norm": 9.02157974243164, "learning_rate": 0.00018027722117587016, "loss": 5.0234, "step": 553 }, { "epoch": 0.20976902688375615, "grad_norm": 10.152100563049316, "learning_rate": 0.00018020569129608883, "loss": 4.6749, "step": 554 }, { "epoch": 0.2101476713366149, "grad_norm": 9.074097633361816, "learning_rate": 0.0001801340461849076, "loss": 4.1354, "step": 555 }, { "epoch": 0.21052631578947367, "grad_norm": 9.53470516204834, "learning_rate": 0.00018006228594525894, "loss": 4.0305, "step": 556 }, { "epoch": 0.21090496024233246, "grad_norm": 11.014595985412598, "learning_rate": 0.00017999041068024064, "loss": 5.5452, "step": 557 }, { "epoch": 0.21128360469519122, "grad_norm": 10.290766716003418, "learning_rate": 0.00017991842049311585, "loss": 4.8177, "step": 558 }, { "epoch": 0.21166224914804999, "grad_norm": 10.907533645629883, "learning_rate": 0.00017984631548731273, "loss": 5.1779, "step": 559 }, { "epoch": 0.21204089360090875, "grad_norm": 10.802809715270996, "learning_rate": 0.00017977409576642444, "loss": 4.1945, "step": 560 }, { "epoch": 0.2124195380537675, "grad_norm": 10.898703575134277, "learning_rate": 0.00017970176143420894, "loss": 4.2533, "step": 561 }, { "epoch": 0.21279818250662627, "grad_norm": 13.079732894897461, "learning_rate": 0.00017962931259458888, "loss": 4.8476, "step": 562 }, { "epoch": 0.21317682695948503, "grad_norm": 12.071998596191406, "learning_rate": 0.00017955674935165138, "loss": 4.632, "step": 563 }, { "epoch": 0.21355547141234382, "grad_norm": 12.636747360229492, "learning_rate": 0.00017948407180964798, "loss": 4.8321, "step": 564 }, { "epoch": 0.21393411586520258, "grad_norm": 14.457389831542969, "learning_rate": 0.00017941128007299434, "loss": 4.3604, "step": 565 }, { "epoch": 0.21431276031806135, "grad_norm": 12.480835914611816, "learning_rate": 0.00017933837424627028, "loss": 3.7171, "step": 566 }, { "epoch": 0.2146914047709201, "grad_norm": 13.800310134887695, "learning_rate": 0.00017926535443421954, "loss": 4.7733, "step": 567 }, { "epoch": 0.21507004922377887, "grad_norm": 14.182160377502441, "learning_rate": 0.00017919222074174948, "loss": 3.6579, "step": 568 }, { "epoch": 0.21544869367663763, "grad_norm": 12.221685409545898, "learning_rate": 0.00017911897327393126, "loss": 3.3661, "step": 569 }, { "epoch": 0.2158273381294964, "grad_norm": 14.561601638793945, "learning_rate": 0.00017904561213599932, "loss": 4.3849, "step": 570 }, { "epoch": 0.21620598258235516, "grad_norm": 15.966811180114746, "learning_rate": 0.0001789721374333516, "loss": 3.6633, "step": 571 }, { "epoch": 0.21658462703521394, "grad_norm": 16.546342849731445, "learning_rate": 0.00017889854927154901, "loss": 2.8848, "step": 572 }, { "epoch": 0.2169632714880727, "grad_norm": 21.277877807617188, "learning_rate": 0.0001788248477563156, "loss": 3.3454, "step": 573 }, { "epoch": 0.21734191594093147, "grad_norm": 30.5257568359375, "learning_rate": 0.00017875103299353824, "loss": 3.1811, "step": 574 }, { "epoch": 0.21772056039379023, "grad_norm": 53.368526458740234, "learning_rate": 0.00017867710508926647, "loss": 3.2357, "step": 575 }, { "epoch": 0.218099204846649, "grad_norm": 8.367820739746094, "learning_rate": 0.0001786030641497124, "loss": 5.0559, "step": 576 }, { "epoch": 0.21847784929950775, "grad_norm": 9.639812469482422, "learning_rate": 0.00017852891028125053, "loss": 5.0726, "step": 577 }, { "epoch": 0.21885649375236652, "grad_norm": 9.26480484008789, "learning_rate": 0.00017845464359041765, "loss": 4.88, "step": 578 }, { "epoch": 0.2192351382052253, "grad_norm": 9.129890441894531, "learning_rate": 0.0001783802641839126, "loss": 3.9438, "step": 579 }, { "epoch": 0.21961378265808407, "grad_norm": 9.979665756225586, "learning_rate": 0.00017830577216859615, "loss": 3.7425, "step": 580 }, { "epoch": 0.21999242711094283, "grad_norm": 9.814806938171387, "learning_rate": 0.00017823116765149086, "loss": 5.3337, "step": 581 }, { "epoch": 0.2203710715638016, "grad_norm": 9.500343322753906, "learning_rate": 0.00017815645073978096, "loss": 4.5891, "step": 582 }, { "epoch": 0.22074971601666035, "grad_norm": 11.922303199768066, "learning_rate": 0.00017808162154081208, "loss": 4.1409, "step": 583 }, { "epoch": 0.22112836046951911, "grad_norm": 13.801411628723145, "learning_rate": 0.00017800668016209128, "loss": 4.7769, "step": 584 }, { "epoch": 0.22150700492237788, "grad_norm": 11.385126113891602, "learning_rate": 0.00017793162671128672, "loss": 3.7034, "step": 585 }, { "epoch": 0.22188564937523667, "grad_norm": 11.122262954711914, "learning_rate": 0.00017785646129622756, "loss": 5.1685, "step": 586 }, { "epoch": 0.22226429382809543, "grad_norm": 10.478163719177246, "learning_rate": 0.00017778118402490383, "loss": 3.5963, "step": 587 }, { "epoch": 0.2226429382809542, "grad_norm": 11.611848831176758, "learning_rate": 0.00017770579500546628, "loss": 4.1824, "step": 588 }, { "epoch": 0.22302158273381295, "grad_norm": 14.5952787399292, "learning_rate": 0.00017763029434622626, "loss": 3.979, "step": 589 }, { "epoch": 0.2234002271866717, "grad_norm": 11.742944717407227, "learning_rate": 0.00017755468215565538, "loss": 4.4411, "step": 590 }, { "epoch": 0.22377887163953047, "grad_norm": 14.67832088470459, "learning_rate": 0.00017747895854238564, "loss": 4.5713, "step": 591 }, { "epoch": 0.22415751609238924, "grad_norm": 12.356891632080078, "learning_rate": 0.00017740312361520897, "loss": 3.7381, "step": 592 }, { "epoch": 0.22453616054524803, "grad_norm": 12.271285057067871, "learning_rate": 0.00017732717748307735, "loss": 3.2774, "step": 593 }, { "epoch": 0.2249148049981068, "grad_norm": 13.871350288391113, "learning_rate": 0.00017725112025510247, "loss": 2.9768, "step": 594 }, { "epoch": 0.22529344945096555, "grad_norm": 16.23955726623535, "learning_rate": 0.0001771749520405556, "loss": 2.9301, "step": 595 }, { "epoch": 0.2256720939038243, "grad_norm": 15.869973182678223, "learning_rate": 0.00017709867294886757, "loss": 3.1639, "step": 596 }, { "epoch": 0.22605073835668307, "grad_norm": 17.27775001525879, "learning_rate": 0.0001770222830896284, "loss": 4.2995, "step": 597 }, { "epoch": 0.22642938280954183, "grad_norm": 18.232769012451172, "learning_rate": 0.00017694578257258727, "loss": 3.2705, "step": 598 }, { "epoch": 0.2268080272624006, "grad_norm": 27.304903030395508, "learning_rate": 0.00017686917150765244, "loss": 3.3116, "step": 599 }, { "epoch": 0.22718667171525936, "grad_norm": 64.07026672363281, "learning_rate": 0.0001767924500048908, "loss": 3.8853, "step": 600 }, { "epoch": 0.22756531616811815, "grad_norm": 7.719545841217041, "learning_rate": 0.00017671561817452812, "loss": 4.9895, "step": 601 }, { "epoch": 0.2279439606209769, "grad_norm": 9.30318546295166, "learning_rate": 0.00017663867612694852, "loss": 6.2789, "step": 602 }, { "epoch": 0.22832260507383567, "grad_norm": 8.493380546569824, "learning_rate": 0.00017656162397269455, "loss": 4.0822, "step": 603 }, { "epoch": 0.22870124952669443, "grad_norm": 10.322513580322266, "learning_rate": 0.0001764844618224669, "loss": 4.2392, "step": 604 }, { "epoch": 0.2290798939795532, "grad_norm": 10.969583511352539, "learning_rate": 0.00017640718978712442, "loss": 4.9037, "step": 605 }, { "epoch": 0.22945853843241196, "grad_norm": 10.155062675476074, "learning_rate": 0.0001763298079776836, "loss": 3.9269, "step": 606 }, { "epoch": 0.22983718288527072, "grad_norm": 11.783178329467773, "learning_rate": 0.00017625231650531884, "loss": 5.0255, "step": 607 }, { "epoch": 0.2302158273381295, "grad_norm": 13.353561401367188, "learning_rate": 0.000176174715481362, "loss": 5.0201, "step": 608 }, { "epoch": 0.23059447179098827, "grad_norm": 14.431788444519043, "learning_rate": 0.0001760970050173024, "loss": 4.1539, "step": 609 }, { "epoch": 0.23097311624384703, "grad_norm": 11.670136451721191, "learning_rate": 0.00017601918522478651, "loss": 4.2111, "step": 610 }, { "epoch": 0.2313517606967058, "grad_norm": 12.365769386291504, "learning_rate": 0.0001759412562156179, "loss": 4.7332, "step": 611 }, { "epoch": 0.23173040514956456, "grad_norm": 11.194862365722656, "learning_rate": 0.00017586321810175712, "loss": 4.0395, "step": 612 }, { "epoch": 0.23210904960242332, "grad_norm": 12.092097282409668, "learning_rate": 0.00017578507099532138, "loss": 3.2322, "step": 613 }, { "epoch": 0.23248769405528208, "grad_norm": 12.20335578918457, "learning_rate": 0.0001757068150085845, "loss": 3.9589, "step": 614 }, { "epoch": 0.23286633850814087, "grad_norm": 14.438755989074707, "learning_rate": 0.00017562845025397678, "loss": 4.2131, "step": 615 }, { "epoch": 0.23324498296099963, "grad_norm": 14.180761337280273, "learning_rate": 0.00017554997684408473, "loss": 3.6403, "step": 616 }, { "epoch": 0.2336236274138584, "grad_norm": 12.253854751586914, "learning_rate": 0.00017547139489165097, "loss": 3.2084, "step": 617 }, { "epoch": 0.23400227186671715, "grad_norm": 13.61539363861084, "learning_rate": 0.0001753927045095741, "loss": 3.9068, "step": 618 }, { "epoch": 0.23438091631957592, "grad_norm": 14.397199630737305, "learning_rate": 0.00017531390581090845, "loss": 3.4509, "step": 619 }, { "epoch": 0.23475956077243468, "grad_norm": 14.57890796661377, "learning_rate": 0.00017523499890886401, "loss": 3.025, "step": 620 }, { "epoch": 0.23513820522529344, "grad_norm": 15.363025665283203, "learning_rate": 0.00017515598391680626, "loss": 3.3226, "step": 621 }, { "epoch": 0.2355168496781522, "grad_norm": 14.477925300598145, "learning_rate": 0.0001750768609482558, "loss": 3.4475, "step": 622 }, { "epoch": 0.235895494131011, "grad_norm": 19.390026092529297, "learning_rate": 0.00017499763011688863, "loss": 4.1527, "step": 623 }, { "epoch": 0.23627413858386975, "grad_norm": 18.3753662109375, "learning_rate": 0.0001749182915365355, "loss": 2.1395, "step": 624 }, { "epoch": 0.23665278303672851, "grad_norm": 25.714628219604492, "learning_rate": 0.000174838845321182, "loss": 3.1295, "step": 625 }, { "epoch": 0.23703142748958728, "grad_norm": 8.490006446838379, "learning_rate": 0.0001747592915849684, "loss": 4.353, "step": 626 }, { "epoch": 0.23741007194244604, "grad_norm": 9.353875160217285, "learning_rate": 0.00017467963044218951, "loss": 4.8076, "step": 627 }, { "epoch": 0.2377887163953048, "grad_norm": 11.424484252929688, "learning_rate": 0.00017459986200729432, "loss": 4.6193, "step": 628 }, { "epoch": 0.23816736084816356, "grad_norm": 9.105878829956055, "learning_rate": 0.00017451998639488606, "loss": 4.1965, "step": 629 }, { "epoch": 0.23854600530102235, "grad_norm": 9.515813827514648, "learning_rate": 0.0001744400037197218, "loss": 4.4786, "step": 630 }, { "epoch": 0.2389246497538811, "grad_norm": 9.369331359863281, "learning_rate": 0.0001743599140967127, "loss": 3.6103, "step": 631 }, { "epoch": 0.23930329420673987, "grad_norm": 11.47270393371582, "learning_rate": 0.00017427971764092328, "loss": 4.4817, "step": 632 }, { "epoch": 0.23968193865959864, "grad_norm": 12.192450523376465, "learning_rate": 0.00017419941446757174, "loss": 4.4893, "step": 633 }, { "epoch": 0.2400605831124574, "grad_norm": 12.567253112792969, "learning_rate": 0.00017411900469202943, "loss": 4.5299, "step": 634 }, { "epoch": 0.24043922756531616, "grad_norm": 14.549039840698242, "learning_rate": 0.0001740384884298211, "loss": 4.4132, "step": 635 }, { "epoch": 0.24081787201817492, "grad_norm": 12.618002891540527, "learning_rate": 0.00017395786579662423, "loss": 3.7163, "step": 636 }, { "epoch": 0.2411965164710337, "grad_norm": 10.437346458435059, "learning_rate": 0.00017387713690826932, "loss": 3.2317, "step": 637 }, { "epoch": 0.24157516092389247, "grad_norm": 13.972203254699707, "learning_rate": 0.00017379630188073941, "loss": 3.9498, "step": 638 }, { "epoch": 0.24195380537675124, "grad_norm": 13.032102584838867, "learning_rate": 0.00017371536083017004, "loss": 3.5371, "step": 639 }, { "epoch": 0.24233244982961, "grad_norm": 14.505037307739258, "learning_rate": 0.00017363431387284914, "loss": 4.754, "step": 640 }, { "epoch": 0.24271109428246876, "grad_norm": 10.701925277709961, "learning_rate": 0.00017355316112521675, "loss": 2.8655, "step": 641 }, { "epoch": 0.24308973873532752, "grad_norm": 12.83195686340332, "learning_rate": 0.00017347190270386488, "loss": 3.7197, "step": 642 }, { "epoch": 0.24346838318818628, "grad_norm": 12.69071102142334, "learning_rate": 0.00017339053872553742, "loss": 2.9367, "step": 643 }, { "epoch": 0.24384702764104507, "grad_norm": 14.964492797851562, "learning_rate": 0.00017330906930712988, "loss": 3.9672, "step": 644 }, { "epoch": 0.24422567209390383, "grad_norm": 14.813679695129395, "learning_rate": 0.0001732274945656892, "loss": 2.1732, "step": 645 }, { "epoch": 0.2446043165467626, "grad_norm": 14.024624824523926, "learning_rate": 0.00017314581461841378, "loss": 3.4523, "step": 646 }, { "epoch": 0.24498296099962136, "grad_norm": 18.821441650390625, "learning_rate": 0.00017306402958265299, "loss": 3.0951, "step": 647 }, { "epoch": 0.24536160545248012, "grad_norm": 17.203479766845703, "learning_rate": 0.0001729821395759073, "loss": 2.7061, "step": 648 }, { "epoch": 0.24574024990533888, "grad_norm": 21.973928451538086, "learning_rate": 0.000172900144715828, "loss": 1.9662, "step": 649 }, { "epoch": 0.24611889435819764, "grad_norm": 24.348388671875, "learning_rate": 0.00017281804512021695, "loss": 2.4137, "step": 650 }, { "epoch": 0.2464975388110564, "grad_norm": 8.82381820678711, "learning_rate": 0.00017273584090702655, "loss": 5.5763, "step": 651 }, { "epoch": 0.2468761832639152, "grad_norm": 8.850475311279297, "learning_rate": 0.00017265353219435943, "loss": 5.0723, "step": 652 }, { "epoch": 0.24725482771677396, "grad_norm": 8.894290924072266, "learning_rate": 0.00017257111910046842, "loss": 4.3985, "step": 653 }, { "epoch": 0.24763347216963272, "grad_norm": 10.235475540161133, "learning_rate": 0.00017248860174375632, "loss": 4.657, "step": 654 }, { "epoch": 0.24801211662249148, "grad_norm": 9.949228286743164, "learning_rate": 0.00017240598024277566, "loss": 3.5698, "step": 655 }, { "epoch": 0.24839076107535024, "grad_norm": 10.317509651184082, "learning_rate": 0.00017232325471622863, "loss": 3.9962, "step": 656 }, { "epoch": 0.248769405528209, "grad_norm": 9.790380477905273, "learning_rate": 0.0001722404252829669, "loss": 3.9819, "step": 657 }, { "epoch": 0.24914804998106777, "grad_norm": 12.6632661819458, "learning_rate": 0.00017215749206199137, "loss": 4.5065, "step": 658 }, { "epoch": 0.24952669443392655, "grad_norm": 12.218596458435059, "learning_rate": 0.00017207445517245212, "loss": 4.2112, "step": 659 }, { "epoch": 0.24990533888678532, "grad_norm": 13.204305648803711, "learning_rate": 0.00017199131473364805, "loss": 4.5837, "step": 660 }, { "epoch": 0.2502839833396441, "grad_norm": 11.75940990447998, "learning_rate": 0.00017190807086502695, "loss": 4.6862, "step": 661 }, { "epoch": 0.2502839833396441, "eval_loss": 0.4858866035938263, "eval_runtime": 899.2669, "eval_samples_per_second": 4.946, "eval_steps_per_second": 1.237, "step": 661 } ], "logging_steps": 1, "max_steps": 2641, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 661, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.718153047487021e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }