{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.916590284142988, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018331805682859762, "grad_norm": 0.07204689830541611, "learning_rate": 4e-05, "loss": 0.02, "step": 1 }, { "epoch": 0.0036663611365719525, "grad_norm": 0.0971730649471283, "learning_rate": 8e-05, "loss": 0.037, "step": 2 }, { "epoch": 0.005499541704857928, "grad_norm": 0.0824146643280983, "learning_rate": 0.00012, "loss": 0.0258, "step": 3 }, { "epoch": 0.007332722273143905, "grad_norm": 0.08816614001989365, "learning_rate": 0.00016, "loss": 0.0266, "step": 4 }, { "epoch": 0.00916590284142988, "grad_norm": 0.09286794066429138, "learning_rate": 0.0002, "loss": 0.0329, "step": 5 }, { "epoch": 0.010999083409715857, "grad_norm": 0.10284189879894257, "learning_rate": 0.00019981566820276498, "loss": 0.0338, "step": 6 }, { "epoch": 0.012832263978001834, "grad_norm": 0.07772455364465714, "learning_rate": 0.00019963133640552995, "loss": 0.0238, "step": 7 }, { "epoch": 0.01466544454628781, "grad_norm": 0.10870955139398575, "learning_rate": 0.00019944700460829492, "loss": 0.0302, "step": 8 }, { "epoch": 0.016498625114573784, "grad_norm": 0.10848188400268555, "learning_rate": 0.00019926267281105992, "loss": 0.0314, "step": 9 }, { "epoch": 0.01833180568285976, "grad_norm": 0.10071466863155365, "learning_rate": 0.0001990783410138249, "loss": 0.0281, "step": 10 }, { "epoch": 0.02016498625114574, "grad_norm": 0.08322206884622574, "learning_rate": 0.00019889400921658986, "loss": 0.0236, "step": 11 }, { "epoch": 0.021998166819431713, "grad_norm": 0.12429114431142807, "learning_rate": 0.00019870967741935483, "loss": 0.0488, "step": 12 }, { "epoch": 0.02383134738771769, "grad_norm": 0.08638562262058258, "learning_rate": 0.0001985253456221198, "loss": 0.0217, "step": 13 }, { "epoch": 0.025664527956003668, "grad_norm": 0.07736796885728836, "learning_rate": 0.0001983410138248848, "loss": 0.0223, "step": 14 }, { "epoch": 0.027497708524289642, "grad_norm": 0.08814552426338196, "learning_rate": 0.00019815668202764977, "loss": 0.0314, "step": 15 }, { "epoch": 0.02933088909257562, "grad_norm": 0.09797844290733337, "learning_rate": 0.00019797235023041477, "loss": 0.0341, "step": 16 }, { "epoch": 0.031164069660861594, "grad_norm": 0.11390747874975204, "learning_rate": 0.00019778801843317974, "loss": 0.0329, "step": 17 }, { "epoch": 0.03299725022914757, "grad_norm": 0.12596647441387177, "learning_rate": 0.00019760368663594472, "loss": 0.0479, "step": 18 }, { "epoch": 0.034830430797433545, "grad_norm": 0.08672292530536652, "learning_rate": 0.00019741935483870969, "loss": 0.0242, "step": 19 }, { "epoch": 0.03666361136571952, "grad_norm": 0.07240811735391617, "learning_rate": 0.00019723502304147466, "loss": 0.023, "step": 20 }, { "epoch": 0.0384967919340055, "grad_norm": 0.09740838408470154, "learning_rate": 0.00019705069124423966, "loss": 0.0325, "step": 21 }, { "epoch": 0.04032997250229148, "grad_norm": 0.10614212602376938, "learning_rate": 0.00019686635944700463, "loss": 0.0415, "step": 22 }, { "epoch": 0.04216315307057745, "grad_norm": 0.06703123450279236, "learning_rate": 0.0001966820276497696, "loss": 0.0201, "step": 23 }, { "epoch": 0.043996333638863426, "grad_norm": 0.11590456962585449, "learning_rate": 0.00019649769585253457, "loss": 0.0444, "step": 24 }, { "epoch": 0.045829514207149404, "grad_norm": 0.08663053065538406, "learning_rate": 0.00019631336405529954, "loss": 0.0331, "step": 25 }, { "epoch": 0.04766269477543538, "grad_norm": 0.0977555587887764, "learning_rate": 0.0001961290322580645, "loss": 0.0262, "step": 26 }, { "epoch": 0.04949587534372136, "grad_norm": 0.07877817749977112, "learning_rate": 0.0001959447004608295, "loss": 0.0309, "step": 27 }, { "epoch": 0.051329055912007336, "grad_norm": 0.09205110371112823, "learning_rate": 0.00019576036866359448, "loss": 0.0363, "step": 28 }, { "epoch": 0.05316223648029331, "grad_norm": 0.07161393016576767, "learning_rate": 0.00019557603686635945, "loss": 0.0231, "step": 29 }, { "epoch": 0.054995417048579284, "grad_norm": 0.1110185980796814, "learning_rate": 0.00019539170506912442, "loss": 0.0414, "step": 30 }, { "epoch": 0.05682859761686526, "grad_norm": 0.08546338975429535, "learning_rate": 0.0001952073732718894, "loss": 0.0297, "step": 31 }, { "epoch": 0.05866177818515124, "grad_norm": 0.08454198390245438, "learning_rate": 0.00019502304147465436, "loss": 0.0244, "step": 32 }, { "epoch": 0.06049495875343722, "grad_norm": 0.09889410436153412, "learning_rate": 0.00019483870967741936, "loss": 0.0334, "step": 33 }, { "epoch": 0.06232813932172319, "grad_norm": 0.10116388648748398, "learning_rate": 0.00019465437788018433, "loss": 0.0309, "step": 34 }, { "epoch": 0.06416131989000917, "grad_norm": 0.12611277401447296, "learning_rate": 0.00019447004608294933, "loss": 0.0511, "step": 35 }, { "epoch": 0.06599450045829514, "grad_norm": 0.11593794077634811, "learning_rate": 0.0001942857142857143, "loss": 0.046, "step": 36 }, { "epoch": 0.06782768102658111, "grad_norm": 0.09269952028989792, "learning_rate": 0.00019410138248847927, "loss": 0.0269, "step": 37 }, { "epoch": 0.06966086159486709, "grad_norm": 0.08128319680690765, "learning_rate": 0.00019391705069124425, "loss": 0.0267, "step": 38 }, { "epoch": 0.07149404216315307, "grad_norm": 0.16776973009109497, "learning_rate": 0.00019373271889400924, "loss": 0.0333, "step": 39 }, { "epoch": 0.07332722273143905, "grad_norm": 0.09216058254241943, "learning_rate": 0.00019354838709677422, "loss": 0.03, "step": 40 }, { "epoch": 0.07516040329972502, "grad_norm": 0.0869499146938324, "learning_rate": 0.00019336405529953919, "loss": 0.0227, "step": 41 }, { "epoch": 0.076993583868011, "grad_norm": 0.08851771056652069, "learning_rate": 0.00019317972350230416, "loss": 0.0269, "step": 42 }, { "epoch": 0.07882676443629698, "grad_norm": 0.07213577628135681, "learning_rate": 0.00019299539170506913, "loss": 0.0271, "step": 43 }, { "epoch": 0.08065994500458296, "grad_norm": 0.08532075583934784, "learning_rate": 0.0001928110599078341, "loss": 0.0343, "step": 44 }, { "epoch": 0.08249312557286893, "grad_norm": 0.1002303957939148, "learning_rate": 0.0001926267281105991, "loss": 0.0421, "step": 45 }, { "epoch": 0.0843263061411549, "grad_norm": 0.10179378092288971, "learning_rate": 0.00019244239631336407, "loss": 0.0472, "step": 46 }, { "epoch": 0.08615948670944087, "grad_norm": 0.08771562576293945, "learning_rate": 0.00019225806451612904, "loss": 0.0329, "step": 47 }, { "epoch": 0.08799266727772685, "grad_norm": 0.0852821096777916, "learning_rate": 0.000192073732718894, "loss": 0.0271, "step": 48 }, { "epoch": 0.08982584784601283, "grad_norm": 0.0855298638343811, "learning_rate": 0.00019188940092165898, "loss": 0.0366, "step": 49 }, { "epoch": 0.09165902841429881, "grad_norm": 0.10242363810539246, "learning_rate": 0.00019170506912442395, "loss": 0.0409, "step": 50 }, { "epoch": 0.09349220898258478, "grad_norm": 0.07894504070281982, "learning_rate": 0.00019152073732718895, "loss": 0.0261, "step": 51 }, { "epoch": 0.09532538955087076, "grad_norm": 0.0891820639371872, "learning_rate": 0.00019133640552995392, "loss": 0.0371, "step": 52 }, { "epoch": 0.09715857011915674, "grad_norm": 0.09067510813474655, "learning_rate": 0.0001911520737327189, "loss": 0.0369, "step": 53 }, { "epoch": 0.09899175068744272, "grad_norm": 0.07762423902750015, "learning_rate": 0.0001909677419354839, "loss": 0.0267, "step": 54 }, { "epoch": 0.1008249312557287, "grad_norm": 0.09637603908777237, "learning_rate": 0.00019078341013824886, "loss": 0.0384, "step": 55 }, { "epoch": 0.10265811182401467, "grad_norm": 0.08040502667427063, "learning_rate": 0.00019059907834101383, "loss": 0.0291, "step": 56 }, { "epoch": 0.10449129239230064, "grad_norm": 0.07859490811824799, "learning_rate": 0.00019041474654377883, "loss": 0.0307, "step": 57 }, { "epoch": 0.10632447296058661, "grad_norm": 0.08587613701820374, "learning_rate": 0.0001902304147465438, "loss": 0.0327, "step": 58 }, { "epoch": 0.10815765352887259, "grad_norm": 0.09821908921003342, "learning_rate": 0.00019004608294930877, "loss": 0.0302, "step": 59 }, { "epoch": 0.10999083409715857, "grad_norm": 0.09804884344339371, "learning_rate": 0.00018986175115207375, "loss": 0.0397, "step": 60 }, { "epoch": 0.11182401466544455, "grad_norm": 0.1171504408121109, "learning_rate": 0.00018967741935483872, "loss": 0.0406, "step": 61 }, { "epoch": 0.11365719523373052, "grad_norm": 0.09860570728778839, "learning_rate": 0.0001894930875576037, "loss": 0.0381, "step": 62 }, { "epoch": 0.1154903758020165, "grad_norm": 0.10370708256959915, "learning_rate": 0.00018930875576036869, "loss": 0.0419, "step": 63 }, { "epoch": 0.11732355637030248, "grad_norm": 0.06204281374812126, "learning_rate": 0.00018912442396313366, "loss": 0.0219, "step": 64 }, { "epoch": 0.11915673693858846, "grad_norm": 0.09292633086442947, "learning_rate": 0.00018894009216589863, "loss": 0.0293, "step": 65 }, { "epoch": 0.12098991750687443, "grad_norm": 0.10509534925222397, "learning_rate": 0.0001887557603686636, "loss": 0.04, "step": 66 }, { "epoch": 0.1228230980751604, "grad_norm": 0.08757118135690689, "learning_rate": 0.00018857142857142857, "loss": 0.0295, "step": 67 }, { "epoch": 0.12465627864344637, "grad_norm": 0.08099905401468277, "learning_rate": 0.00018838709677419354, "loss": 0.027, "step": 68 }, { "epoch": 0.12648945921173235, "grad_norm": 0.0924796611070633, "learning_rate": 0.0001882027649769585, "loss": 0.0289, "step": 69 }, { "epoch": 0.12832263978001834, "grad_norm": 0.09552083164453506, "learning_rate": 0.0001880184331797235, "loss": 0.0322, "step": 70 }, { "epoch": 0.1301558203483043, "grad_norm": 0.09000714868307114, "learning_rate": 0.00018783410138248848, "loss": 0.0304, "step": 71 }, { "epoch": 0.13198900091659027, "grad_norm": 0.08881785720586777, "learning_rate": 0.00018764976958525345, "loss": 0.0305, "step": 72 }, { "epoch": 0.13382218148487626, "grad_norm": 0.10535095632076263, "learning_rate": 0.00018746543778801845, "loss": 0.0435, "step": 73 }, { "epoch": 0.13565536205316223, "grad_norm": 0.097115658223629, "learning_rate": 0.00018728110599078342, "loss": 0.0501, "step": 74 }, { "epoch": 0.13748854262144822, "grad_norm": 0.10467157512903214, "learning_rate": 0.0001870967741935484, "loss": 0.0523, "step": 75 }, { "epoch": 0.13932172318973418, "grad_norm": 0.08117009699344635, "learning_rate": 0.0001869124423963134, "loss": 0.0272, "step": 76 }, { "epoch": 0.14115490375802017, "grad_norm": 0.09257414937019348, "learning_rate": 0.00018672811059907836, "loss": 0.0375, "step": 77 }, { "epoch": 0.14298808432630614, "grad_norm": 0.07603290677070618, "learning_rate": 0.00018654377880184333, "loss": 0.0245, "step": 78 }, { "epoch": 0.14482126489459213, "grad_norm": 0.08395121246576309, "learning_rate": 0.0001863594470046083, "loss": 0.0379, "step": 79 }, { "epoch": 0.1466544454628781, "grad_norm": 0.09991391003131866, "learning_rate": 0.00018617511520737328, "loss": 0.0326, "step": 80 }, { "epoch": 0.14848762603116408, "grad_norm": 0.0780223086476326, "learning_rate": 0.00018599078341013825, "loss": 0.0332, "step": 81 }, { "epoch": 0.15032080659945005, "grad_norm": 0.07637890428304672, "learning_rate": 0.00018580645161290325, "loss": 0.0271, "step": 82 }, { "epoch": 0.152153987167736, "grad_norm": 0.0815526694059372, "learning_rate": 0.00018562211981566822, "loss": 0.0289, "step": 83 }, { "epoch": 0.153987167736022, "grad_norm": 0.07798799872398376, "learning_rate": 0.0001854377880184332, "loss": 0.032, "step": 84 }, { "epoch": 0.15582034830430797, "grad_norm": 0.06180557608604431, "learning_rate": 0.00018525345622119816, "loss": 0.0207, "step": 85 }, { "epoch": 0.15765352887259396, "grad_norm": 0.07907485961914062, "learning_rate": 0.00018506912442396313, "loss": 0.0281, "step": 86 }, { "epoch": 0.15948670944087992, "grad_norm": 0.0909823551774025, "learning_rate": 0.0001848847926267281, "loss": 0.032, "step": 87 }, { "epoch": 0.1613198900091659, "grad_norm": 0.09836460649967194, "learning_rate": 0.0001847004608294931, "loss": 0.0409, "step": 88 }, { "epoch": 0.16315307057745188, "grad_norm": 0.05404837429523468, "learning_rate": 0.00018451612903225807, "loss": 0.02, "step": 89 }, { "epoch": 0.16498625114573787, "grad_norm": 0.09892542660236359, "learning_rate": 0.00018433179723502304, "loss": 0.0321, "step": 90 }, { "epoch": 0.16681943171402383, "grad_norm": 0.08493707329034805, "learning_rate": 0.000184147465437788, "loss": 0.0343, "step": 91 }, { "epoch": 0.1686526122823098, "grad_norm": 0.10508857667446136, "learning_rate": 0.000183963133640553, "loss": 0.0397, "step": 92 }, { "epoch": 0.17048579285059579, "grad_norm": 0.08235018700361252, "learning_rate": 0.00018377880184331798, "loss": 0.0262, "step": 93 }, { "epoch": 0.17231897341888175, "grad_norm": 0.06784114986658096, "learning_rate": 0.00018359447004608298, "loss": 0.0247, "step": 94 }, { "epoch": 0.17415215398716774, "grad_norm": 0.10621548444032669, "learning_rate": 0.00018341013824884795, "loss": 0.035, "step": 95 }, { "epoch": 0.1759853345554537, "grad_norm": 0.10951874405145645, "learning_rate": 0.00018322580645161292, "loss": 0.0409, "step": 96 }, { "epoch": 0.1778185151237397, "grad_norm": 0.08758855611085892, "learning_rate": 0.0001830414746543779, "loss": 0.0389, "step": 97 }, { "epoch": 0.17965169569202566, "grad_norm": 0.10536627471446991, "learning_rate": 0.00018285714285714286, "loss": 0.0372, "step": 98 }, { "epoch": 0.18148487626031165, "grad_norm": 0.10224272310733795, "learning_rate": 0.00018267281105990784, "loss": 0.0394, "step": 99 }, { "epoch": 0.18331805682859761, "grad_norm": 0.07795912027359009, "learning_rate": 0.00018248847926267283, "loss": 0.0271, "step": 100 }, { "epoch": 0.1851512373968836, "grad_norm": 0.0965060293674469, "learning_rate": 0.0001823041474654378, "loss": 0.0357, "step": 101 }, { "epoch": 0.18698441796516957, "grad_norm": 0.061280328780412674, "learning_rate": 0.00018211981566820278, "loss": 0.0263, "step": 102 }, { "epoch": 0.18881759853345553, "grad_norm": 0.08007590472698212, "learning_rate": 0.00018193548387096775, "loss": 0.0324, "step": 103 }, { "epoch": 0.19065077910174152, "grad_norm": 0.08586332201957703, "learning_rate": 0.00018175115207373272, "loss": 0.0362, "step": 104 }, { "epoch": 0.1924839596700275, "grad_norm": 0.08350996673107147, "learning_rate": 0.0001815668202764977, "loss": 0.029, "step": 105 }, { "epoch": 0.19431714023831348, "grad_norm": 0.07932569086551666, "learning_rate": 0.0001813824884792627, "loss": 0.0262, "step": 106 }, { "epoch": 0.19615032080659944, "grad_norm": 0.08850853145122528, "learning_rate": 0.00018119815668202766, "loss": 0.0376, "step": 107 }, { "epoch": 0.19798350137488543, "grad_norm": 0.10090523213148117, "learning_rate": 0.00018101382488479263, "loss": 0.0437, "step": 108 }, { "epoch": 0.1998166819431714, "grad_norm": 0.09374283999204636, "learning_rate": 0.0001808294930875576, "loss": 0.0348, "step": 109 }, { "epoch": 0.2016498625114574, "grad_norm": 0.09550321102142334, "learning_rate": 0.00018064516129032257, "loss": 0.0415, "step": 110 }, { "epoch": 0.20348304307974335, "grad_norm": 0.09793347865343094, "learning_rate": 0.00018046082949308757, "loss": 0.0363, "step": 111 }, { "epoch": 0.20531622364802934, "grad_norm": 0.05917959660291672, "learning_rate": 0.00018027649769585254, "loss": 0.0233, "step": 112 }, { "epoch": 0.2071494042163153, "grad_norm": 0.08095124363899231, "learning_rate": 0.00018009216589861754, "loss": 0.0273, "step": 113 }, { "epoch": 0.20898258478460127, "grad_norm": 0.09870299696922302, "learning_rate": 0.0001799078341013825, "loss": 0.0341, "step": 114 }, { "epoch": 0.21081576535288726, "grad_norm": 0.06982927024364471, "learning_rate": 0.00017972350230414748, "loss": 0.0263, "step": 115 }, { "epoch": 0.21264894592117323, "grad_norm": 0.10992158949375153, "learning_rate": 0.00017953917050691245, "loss": 0.0444, "step": 116 }, { "epoch": 0.21448212648945922, "grad_norm": 0.11806368082761765, "learning_rate": 0.00017935483870967742, "loss": 0.045, "step": 117 }, { "epoch": 0.21631530705774518, "grad_norm": 0.061528291553258896, "learning_rate": 0.00017917050691244242, "loss": 0.0194, "step": 118 }, { "epoch": 0.21814848762603117, "grad_norm": 0.08814897388219833, "learning_rate": 0.0001789861751152074, "loss": 0.0335, "step": 119 }, { "epoch": 0.21998166819431714, "grad_norm": 0.07923831045627594, "learning_rate": 0.00017880184331797236, "loss": 0.0304, "step": 120 }, { "epoch": 0.22181484876260313, "grad_norm": 0.06676612794399261, "learning_rate": 0.00017861751152073734, "loss": 0.0261, "step": 121 }, { "epoch": 0.2236480293308891, "grad_norm": 0.10044591873884201, "learning_rate": 0.0001784331797235023, "loss": 0.0352, "step": 122 }, { "epoch": 0.22548120989917506, "grad_norm": 0.09440556168556213, "learning_rate": 0.00017824884792626728, "loss": 0.0394, "step": 123 }, { "epoch": 0.22731439046746105, "grad_norm": 0.07944708317518234, "learning_rate": 0.00017806451612903228, "loss": 0.0292, "step": 124 }, { "epoch": 0.229147571035747, "grad_norm": 0.11380550265312195, "learning_rate": 0.00017788018433179725, "loss": 0.0479, "step": 125 }, { "epoch": 0.230980751604033, "grad_norm": 0.09983845055103302, "learning_rate": 0.00017769585253456222, "loss": 0.043, "step": 126 }, { "epoch": 0.23281393217231897, "grad_norm": 0.09731481224298477, "learning_rate": 0.0001775115207373272, "loss": 0.0346, "step": 127 }, { "epoch": 0.23464711274060496, "grad_norm": 0.08189846575260162, "learning_rate": 0.00017732718894009216, "loss": 0.0314, "step": 128 }, { "epoch": 0.23648029330889092, "grad_norm": 0.11007854342460632, "learning_rate": 0.00017714285714285713, "loss": 0.0395, "step": 129 }, { "epoch": 0.2383134738771769, "grad_norm": 0.06934443861246109, "learning_rate": 0.00017695852534562213, "loss": 0.0289, "step": 130 }, { "epoch": 0.24014665444546288, "grad_norm": 0.0965190976858139, "learning_rate": 0.0001767741935483871, "loss": 0.0341, "step": 131 }, { "epoch": 0.24197983501374887, "grad_norm": 0.11201060563325882, "learning_rate": 0.0001765898617511521, "loss": 0.0404, "step": 132 }, { "epoch": 0.24381301558203483, "grad_norm": 0.07356410473585129, "learning_rate": 0.00017640552995391707, "loss": 0.0225, "step": 133 }, { "epoch": 0.2456461961503208, "grad_norm": 0.09507370740175247, "learning_rate": 0.00017622119815668204, "loss": 0.0409, "step": 134 }, { "epoch": 0.24747937671860679, "grad_norm": 0.09385097771883011, "learning_rate": 0.000176036866359447, "loss": 0.0314, "step": 135 }, { "epoch": 0.24931255728689275, "grad_norm": 0.08933474868535995, "learning_rate": 0.00017585253456221198, "loss": 0.0308, "step": 136 }, { "epoch": 0.25114573785517874, "grad_norm": 0.07607486099004745, "learning_rate": 0.00017566820276497698, "loss": 0.0272, "step": 137 }, { "epoch": 0.2529789184234647, "grad_norm": 0.060726869851350784, "learning_rate": 0.00017548387096774195, "loss": 0.0254, "step": 138 }, { "epoch": 0.25481209899175067, "grad_norm": 0.08471496403217316, "learning_rate": 0.00017529953917050692, "loss": 0.0299, "step": 139 }, { "epoch": 0.2566452795600367, "grad_norm": 0.06967601180076599, "learning_rate": 0.0001751152073732719, "loss": 0.0234, "step": 140 }, { "epoch": 0.25847846012832265, "grad_norm": 0.08925054222345352, "learning_rate": 0.00017493087557603687, "loss": 0.0375, "step": 141 }, { "epoch": 0.2603116406966086, "grad_norm": 0.07857096195220947, "learning_rate": 0.00017474654377880184, "loss": 0.0294, "step": 142 }, { "epoch": 0.2621448212648946, "grad_norm": 0.10110893100500107, "learning_rate": 0.00017456221198156684, "loss": 0.0388, "step": 143 }, { "epoch": 0.26397800183318054, "grad_norm": 0.10537184774875641, "learning_rate": 0.0001743778801843318, "loss": 0.0426, "step": 144 }, { "epoch": 0.26581118240146656, "grad_norm": 0.10398052632808685, "learning_rate": 0.00017419354838709678, "loss": 0.0362, "step": 145 }, { "epoch": 0.2676443629697525, "grad_norm": 0.06848938763141632, "learning_rate": 0.00017400921658986175, "loss": 0.028, "step": 146 }, { "epoch": 0.2694775435380385, "grad_norm": 0.08947031199932098, "learning_rate": 0.00017382488479262672, "loss": 0.0272, "step": 147 }, { "epoch": 0.27131072410632445, "grad_norm": 0.09318867325782776, "learning_rate": 0.0001736405529953917, "loss": 0.0328, "step": 148 }, { "epoch": 0.27314390467461047, "grad_norm": 0.0872950330376625, "learning_rate": 0.0001734562211981567, "loss": 0.0304, "step": 149 }, { "epoch": 0.27497708524289644, "grad_norm": 0.12327979505062103, "learning_rate": 0.00017327188940092166, "loss": 0.0542, "step": 150 }, { "epoch": 0.2768102658111824, "grad_norm": 0.08264505118131638, "learning_rate": 0.00017308755760368666, "loss": 0.0262, "step": 151 }, { "epoch": 0.27864344637946836, "grad_norm": 0.09241585433483124, "learning_rate": 0.00017290322580645163, "loss": 0.0323, "step": 152 }, { "epoch": 0.2804766269477543, "grad_norm": 0.09120775014162064, "learning_rate": 0.0001727188940092166, "loss": 0.031, "step": 153 }, { "epoch": 0.28230980751604035, "grad_norm": 0.05967549607157707, "learning_rate": 0.00017253456221198157, "loss": 0.02, "step": 154 }, { "epoch": 0.2841429880843263, "grad_norm": 0.08845420181751251, "learning_rate": 0.00017235023041474657, "loss": 0.0359, "step": 155 }, { "epoch": 0.2859761686526123, "grad_norm": 0.10303748399019241, "learning_rate": 0.00017216589861751154, "loss": 0.0336, "step": 156 }, { "epoch": 0.28780934922089824, "grad_norm": 0.11286526173353195, "learning_rate": 0.0001719815668202765, "loss": 0.0433, "step": 157 }, { "epoch": 0.28964252978918426, "grad_norm": 0.05800803378224373, "learning_rate": 0.00017179723502304148, "loss": 0.0191, "step": 158 }, { "epoch": 0.2914757103574702, "grad_norm": 0.06295135617256165, "learning_rate": 0.00017161290322580645, "loss": 0.0211, "step": 159 }, { "epoch": 0.2933088909257562, "grad_norm": 0.061198145151138306, "learning_rate": 0.00017142857142857143, "loss": 0.0222, "step": 160 }, { "epoch": 0.29514207149404215, "grad_norm": 0.08423091471195221, "learning_rate": 0.00017124423963133642, "loss": 0.029, "step": 161 }, { "epoch": 0.29697525206232817, "grad_norm": 0.07596798241138458, "learning_rate": 0.0001710599078341014, "loss": 0.0257, "step": 162 }, { "epoch": 0.29880843263061413, "grad_norm": 0.12243133038282394, "learning_rate": 0.00017087557603686637, "loss": 0.0488, "step": 163 }, { "epoch": 0.3006416131989001, "grad_norm": 0.10156381130218506, "learning_rate": 0.00017069124423963134, "loss": 0.0309, "step": 164 }, { "epoch": 0.30247479376718606, "grad_norm": 0.05903761461377144, "learning_rate": 0.0001705069124423963, "loss": 0.0179, "step": 165 }, { "epoch": 0.304307974335472, "grad_norm": 0.08366727083921432, "learning_rate": 0.00017032258064516128, "loss": 0.0297, "step": 166 }, { "epoch": 0.30614115490375804, "grad_norm": 0.09768462926149368, "learning_rate": 0.00017013824884792628, "loss": 0.0388, "step": 167 }, { "epoch": 0.307974335472044, "grad_norm": 0.07999719679355621, "learning_rate": 0.00016995391705069125, "loss": 0.0285, "step": 168 }, { "epoch": 0.30980751604032997, "grad_norm": 0.10129693150520325, "learning_rate": 0.00016976958525345622, "loss": 0.0437, "step": 169 }, { "epoch": 0.31164069660861593, "grad_norm": 0.07942084223031998, "learning_rate": 0.00016958525345622122, "loss": 0.028, "step": 170 }, { "epoch": 0.31347387717690195, "grad_norm": 0.09509172290563583, "learning_rate": 0.0001694009216589862, "loss": 0.035, "step": 171 }, { "epoch": 0.3153070577451879, "grad_norm": 0.10119883716106415, "learning_rate": 0.00016921658986175116, "loss": 0.0407, "step": 172 }, { "epoch": 0.3171402383134739, "grad_norm": 0.10474774241447449, "learning_rate": 0.00016903225806451616, "loss": 0.0382, "step": 173 }, { "epoch": 0.31897341888175984, "grad_norm": 0.08802273869514465, "learning_rate": 0.00016884792626728113, "loss": 0.0311, "step": 174 }, { "epoch": 0.3208065994500458, "grad_norm": 0.06499020010232925, "learning_rate": 0.0001686635944700461, "loss": 0.0247, "step": 175 }, { "epoch": 0.3226397800183318, "grad_norm": 0.09561455249786377, "learning_rate": 0.00016847926267281107, "loss": 0.0376, "step": 176 }, { "epoch": 0.3244729605866178, "grad_norm": 0.051068369299173355, "learning_rate": 0.00016829493087557604, "loss": 0.0194, "step": 177 }, { "epoch": 0.32630614115490375, "grad_norm": 0.09048140794038773, "learning_rate": 0.000168110599078341, "loss": 0.0344, "step": 178 }, { "epoch": 0.3281393217231897, "grad_norm": 0.08035707473754883, "learning_rate": 0.000167926267281106, "loss": 0.0323, "step": 179 }, { "epoch": 0.32997250229147573, "grad_norm": 0.10091862827539444, "learning_rate": 0.00016774193548387098, "loss": 0.0406, "step": 180 }, { "epoch": 0.3318056828597617, "grad_norm": 0.09862423688173294, "learning_rate": 0.00016755760368663595, "loss": 0.0475, "step": 181 }, { "epoch": 0.33363886342804766, "grad_norm": 0.10344900190830231, "learning_rate": 0.00016737327188940092, "loss": 0.0389, "step": 182 }, { "epoch": 0.3354720439963336, "grad_norm": 0.10084162652492523, "learning_rate": 0.0001671889400921659, "loss": 0.0387, "step": 183 }, { "epoch": 0.3373052245646196, "grad_norm": 0.08419749140739441, "learning_rate": 0.00016700460829493087, "loss": 0.0298, "step": 184 }, { "epoch": 0.3391384051329056, "grad_norm": 0.09623179584741592, "learning_rate": 0.00016682027649769587, "loss": 0.0291, "step": 185 }, { "epoch": 0.34097158570119157, "grad_norm": 0.0974535197019577, "learning_rate": 0.00016663594470046084, "loss": 0.0337, "step": 186 }, { "epoch": 0.34280476626947753, "grad_norm": 0.10164261609315872, "learning_rate": 0.0001664516129032258, "loss": 0.0466, "step": 187 }, { "epoch": 0.3446379468377635, "grad_norm": 0.0877864882349968, "learning_rate": 0.00016626728110599078, "loss": 0.0244, "step": 188 }, { "epoch": 0.3464711274060495, "grad_norm": 0.08111972361803055, "learning_rate": 0.00016608294930875578, "loss": 0.0285, "step": 189 }, { "epoch": 0.3483043079743355, "grad_norm": 0.0656951442360878, "learning_rate": 0.00016589861751152075, "loss": 0.0235, "step": 190 }, { "epoch": 0.35013748854262144, "grad_norm": 0.08031731843948364, "learning_rate": 0.00016571428571428575, "loss": 0.0321, "step": 191 }, { "epoch": 0.3519706691109074, "grad_norm": 0.1022307500243187, "learning_rate": 0.00016552995391705072, "loss": 0.0437, "step": 192 }, { "epoch": 0.3538038496791934, "grad_norm": 0.07790978997945786, "learning_rate": 0.0001653456221198157, "loss": 0.0305, "step": 193 }, { "epoch": 0.3556370302474794, "grad_norm": 0.06656166166067123, "learning_rate": 0.00016516129032258066, "loss": 0.027, "step": 194 }, { "epoch": 0.35747021081576535, "grad_norm": 0.08255946636199951, "learning_rate": 0.00016497695852534563, "loss": 0.0294, "step": 195 }, { "epoch": 0.3593033913840513, "grad_norm": 0.0771537646651268, "learning_rate": 0.0001647926267281106, "loss": 0.027, "step": 196 }, { "epoch": 0.3611365719523373, "grad_norm": 0.08929789811372757, "learning_rate": 0.00016460829493087557, "loss": 0.0435, "step": 197 }, { "epoch": 0.3629697525206233, "grad_norm": 0.07332134991884232, "learning_rate": 0.00016442396313364057, "loss": 0.0254, "step": 198 }, { "epoch": 0.36480293308890926, "grad_norm": 0.09681219607591629, "learning_rate": 0.00016423963133640554, "loss": 0.0404, "step": 199 }, { "epoch": 0.36663611365719523, "grad_norm": 0.0849548950791359, "learning_rate": 0.0001640552995391705, "loss": 0.0337, "step": 200 }, { "epoch": 0.3684692942254812, "grad_norm": 0.10003779083490372, "learning_rate": 0.00016387096774193548, "loss": 0.034, "step": 201 }, { "epoch": 0.3703024747937672, "grad_norm": 0.09228444844484329, "learning_rate": 0.00016368663594470046, "loss": 0.0362, "step": 202 }, { "epoch": 0.3721356553620532, "grad_norm": 0.06095033884048462, "learning_rate": 0.00016350230414746543, "loss": 0.0197, "step": 203 }, { "epoch": 0.37396883593033914, "grad_norm": 0.09504009038209915, "learning_rate": 0.00016331797235023042, "loss": 0.0356, "step": 204 }, { "epoch": 0.3758020164986251, "grad_norm": 0.07681944966316223, "learning_rate": 0.0001631336405529954, "loss": 0.0329, "step": 205 }, { "epoch": 0.37763519706691107, "grad_norm": 0.0894550010561943, "learning_rate": 0.00016294930875576037, "loss": 0.0327, "step": 206 }, { "epoch": 0.3794683776351971, "grad_norm": 0.09541459381580353, "learning_rate": 0.00016276497695852534, "loss": 0.0368, "step": 207 }, { "epoch": 0.38130155820348305, "grad_norm": 0.07463543862104416, "learning_rate": 0.00016258064516129034, "loss": 0.0272, "step": 208 }, { "epoch": 0.383134738771769, "grad_norm": 0.08396443724632263, "learning_rate": 0.0001623963133640553, "loss": 0.0316, "step": 209 }, { "epoch": 0.384967919340055, "grad_norm": 0.07742145657539368, "learning_rate": 0.0001622119815668203, "loss": 0.0276, "step": 210 }, { "epoch": 0.386801099908341, "grad_norm": 0.07310166209936142, "learning_rate": 0.00016202764976958528, "loss": 0.0241, "step": 211 }, { "epoch": 0.38863428047662696, "grad_norm": 0.09384534507989883, "learning_rate": 0.00016184331797235025, "loss": 0.0386, "step": 212 }, { "epoch": 0.3904674610449129, "grad_norm": 0.09084580093622208, "learning_rate": 0.00016165898617511522, "loss": 0.0293, "step": 213 }, { "epoch": 0.3923006416131989, "grad_norm": 0.10701391100883484, "learning_rate": 0.0001614746543778802, "loss": 0.046, "step": 214 }, { "epoch": 0.39413382218148485, "grad_norm": 0.07608213275671005, "learning_rate": 0.00016129032258064516, "loss": 0.031, "step": 215 }, { "epoch": 0.39596700274977087, "grad_norm": 0.09281232208013535, "learning_rate": 0.00016110599078341016, "loss": 0.0311, "step": 216 }, { "epoch": 0.39780018331805683, "grad_norm": 0.07548707723617554, "learning_rate": 0.00016092165898617513, "loss": 0.0244, "step": 217 }, { "epoch": 0.3996333638863428, "grad_norm": 0.08597145974636078, "learning_rate": 0.0001607373271889401, "loss": 0.0296, "step": 218 }, { "epoch": 0.40146654445462876, "grad_norm": 0.090579092502594, "learning_rate": 0.00016055299539170507, "loss": 0.0298, "step": 219 }, { "epoch": 0.4032997250229148, "grad_norm": 0.07005604356527328, "learning_rate": 0.00016036866359447004, "loss": 0.0254, "step": 220 }, { "epoch": 0.40513290559120074, "grad_norm": 0.09800952672958374, "learning_rate": 0.00016018433179723501, "loss": 0.0353, "step": 221 }, { "epoch": 0.4069660861594867, "grad_norm": 0.09682459384202957, "learning_rate": 0.00016, "loss": 0.0342, "step": 222 }, { "epoch": 0.40879926672777267, "grad_norm": 0.08852767199277878, "learning_rate": 0.00015981566820276498, "loss": 0.0356, "step": 223 }, { "epoch": 0.4106324472960587, "grad_norm": 0.07314983755350113, "learning_rate": 0.00015963133640552996, "loss": 0.0266, "step": 224 }, { "epoch": 0.41246562786434465, "grad_norm": 0.08612879365682602, "learning_rate": 0.00015944700460829493, "loss": 0.0314, "step": 225 }, { "epoch": 0.4142988084326306, "grad_norm": 0.09286980330944061, "learning_rate": 0.0001592626728110599, "loss": 0.0359, "step": 226 }, { "epoch": 0.4161319890009166, "grad_norm": 0.0787709653377533, "learning_rate": 0.0001590783410138249, "loss": 0.0299, "step": 227 }, { "epoch": 0.41796516956920254, "grad_norm": 0.08612968027591705, "learning_rate": 0.00015889400921658987, "loss": 0.0299, "step": 228 }, { "epoch": 0.41979835013748856, "grad_norm": 0.06926131248474121, "learning_rate": 0.00015870967741935487, "loss": 0.0257, "step": 229 }, { "epoch": 0.4216315307057745, "grad_norm": 0.09236182272434235, "learning_rate": 0.00015852534562211984, "loss": 0.0284, "step": 230 }, { "epoch": 0.4234647112740605, "grad_norm": 0.07634787261486053, "learning_rate": 0.0001583410138248848, "loss": 0.0335, "step": 231 }, { "epoch": 0.42529789184234645, "grad_norm": 0.10165869444608688, "learning_rate": 0.00015815668202764978, "loss": 0.0407, "step": 232 }, { "epoch": 0.4271310724106325, "grad_norm": 0.09464540332555771, "learning_rate": 0.00015797235023041475, "loss": 0.0389, "step": 233 }, { "epoch": 0.42896425297891844, "grad_norm": 0.0773908719420433, "learning_rate": 0.00015778801843317975, "loss": 0.0329, "step": 234 }, { "epoch": 0.4307974335472044, "grad_norm": 0.08985087275505066, "learning_rate": 0.00015760368663594472, "loss": 0.0328, "step": 235 }, { "epoch": 0.43263061411549036, "grad_norm": 0.06766209751367569, "learning_rate": 0.0001574193548387097, "loss": 0.0227, "step": 236 }, { "epoch": 0.4344637946837763, "grad_norm": 0.07593253999948502, "learning_rate": 0.00015723502304147466, "loss": 0.0252, "step": 237 }, { "epoch": 0.43629697525206235, "grad_norm": 0.0670701190829277, "learning_rate": 0.00015705069124423963, "loss": 0.022, "step": 238 }, { "epoch": 0.4381301558203483, "grad_norm": 0.08073533326387405, "learning_rate": 0.0001568663594470046, "loss": 0.0314, "step": 239 }, { "epoch": 0.4399633363886343, "grad_norm": 0.09111002087593079, "learning_rate": 0.0001566820276497696, "loss": 0.0298, "step": 240 }, { "epoch": 0.44179651695692024, "grad_norm": 0.07969270646572113, "learning_rate": 0.00015649769585253457, "loss": 0.0295, "step": 241 }, { "epoch": 0.44362969752520626, "grad_norm": 0.05996888503432274, "learning_rate": 0.00015631336405529954, "loss": 0.0233, "step": 242 }, { "epoch": 0.4454628780934922, "grad_norm": 0.0899052619934082, "learning_rate": 0.00015612903225806451, "loss": 0.0328, "step": 243 }, { "epoch": 0.4472960586617782, "grad_norm": 0.11184020340442657, "learning_rate": 0.00015594470046082949, "loss": 0.0456, "step": 244 }, { "epoch": 0.44912923923006415, "grad_norm": 0.08015838265419006, "learning_rate": 0.00015576036866359446, "loss": 0.0244, "step": 245 }, { "epoch": 0.4509624197983501, "grad_norm": 0.10306031256914139, "learning_rate": 0.00015557603686635946, "loss": 0.0397, "step": 246 }, { "epoch": 0.45279560036663613, "grad_norm": 0.08525974303483963, "learning_rate": 0.00015539170506912443, "loss": 0.0317, "step": 247 }, { "epoch": 0.4546287809349221, "grad_norm": 0.10056892782449722, "learning_rate": 0.00015520737327188942, "loss": 0.0456, "step": 248 }, { "epoch": 0.45646196150320806, "grad_norm": 0.09121023863554001, "learning_rate": 0.0001550230414746544, "loss": 0.0308, "step": 249 }, { "epoch": 0.458295142071494, "grad_norm": 0.06088346242904663, "learning_rate": 0.00015483870967741937, "loss": 0.0235, "step": 250 }, { "epoch": 0.46012832263978004, "grad_norm": 0.07533244043588638, "learning_rate": 0.00015465437788018434, "loss": 0.0309, "step": 251 }, { "epoch": 0.461961503208066, "grad_norm": 0.08711180835962296, "learning_rate": 0.00015447004608294934, "loss": 0.038, "step": 252 }, { "epoch": 0.46379468377635197, "grad_norm": 0.08756791800260544, "learning_rate": 0.0001542857142857143, "loss": 0.0332, "step": 253 }, { "epoch": 0.46562786434463793, "grad_norm": 0.09424729645252228, "learning_rate": 0.00015410138248847928, "loss": 0.0282, "step": 254 }, { "epoch": 0.4674610449129239, "grad_norm": 0.10178559273481369, "learning_rate": 0.00015391705069124425, "loss": 0.0319, "step": 255 }, { "epoch": 0.4692942254812099, "grad_norm": 0.09317290782928467, "learning_rate": 0.00015373271889400922, "loss": 0.0294, "step": 256 }, { "epoch": 0.4711274060494959, "grad_norm": 0.11380328983068466, "learning_rate": 0.0001535483870967742, "loss": 0.0474, "step": 257 }, { "epoch": 0.47296058661778184, "grad_norm": 0.0751282274723053, "learning_rate": 0.0001533640552995392, "loss": 0.0328, "step": 258 }, { "epoch": 0.4747937671860678, "grad_norm": 0.07097076624631882, "learning_rate": 0.00015317972350230416, "loss": 0.0314, "step": 259 }, { "epoch": 0.4766269477543538, "grad_norm": 0.0817055031657219, "learning_rate": 0.00015299539170506913, "loss": 0.0348, "step": 260 }, { "epoch": 0.4784601283226398, "grad_norm": 0.08542328327894211, "learning_rate": 0.0001528110599078341, "loss": 0.0314, "step": 261 }, { "epoch": 0.48029330889092575, "grad_norm": 0.08346550911664963, "learning_rate": 0.00015262672811059907, "loss": 0.0342, "step": 262 }, { "epoch": 0.4821264894592117, "grad_norm": 0.06232753023505211, "learning_rate": 0.00015244239631336405, "loss": 0.024, "step": 263 }, { "epoch": 0.48395967002749773, "grad_norm": 0.08413577824831009, "learning_rate": 0.00015225806451612902, "loss": 0.0289, "step": 264 }, { "epoch": 0.4857928505957837, "grad_norm": 0.1072312667965889, "learning_rate": 0.00015207373271889401, "loss": 0.0436, "step": 265 }, { "epoch": 0.48762603116406966, "grad_norm": 0.09056718647480011, "learning_rate": 0.00015188940092165899, "loss": 0.0299, "step": 266 }, { "epoch": 0.4894592117323556, "grad_norm": 0.07195229828357697, "learning_rate": 0.00015170506912442398, "loss": 0.0243, "step": 267 }, { "epoch": 0.4912923923006416, "grad_norm": 0.09091556072235107, "learning_rate": 0.00015152073732718895, "loss": 0.0371, "step": 268 }, { "epoch": 0.4931255728689276, "grad_norm": 0.08215435594320297, "learning_rate": 0.00015133640552995393, "loss": 0.0269, "step": 269 }, { "epoch": 0.49495875343721357, "grad_norm": 0.07215309143066406, "learning_rate": 0.0001511520737327189, "loss": 0.0253, "step": 270 }, { "epoch": 0.49679193400549954, "grad_norm": 0.08347906917333603, "learning_rate": 0.0001509677419354839, "loss": 0.0273, "step": 271 }, { "epoch": 0.4986251145737855, "grad_norm": 0.061136987060308456, "learning_rate": 0.00015078341013824887, "loss": 0.0231, "step": 272 }, { "epoch": 0.5004582951420715, "grad_norm": 0.10640832036733627, "learning_rate": 0.00015059907834101384, "loss": 0.0455, "step": 273 }, { "epoch": 0.5022914757103575, "grad_norm": 0.08627432584762573, "learning_rate": 0.0001504147465437788, "loss": 0.0243, "step": 274 }, { "epoch": 0.5041246562786434, "grad_norm": 0.08695763349533081, "learning_rate": 0.00015023041474654378, "loss": 0.034, "step": 275 }, { "epoch": 0.5059578368469294, "grad_norm": 0.07717976719141006, "learning_rate": 0.00015004608294930875, "loss": 0.0275, "step": 276 }, { "epoch": 0.5077910174152154, "grad_norm": 0.0763443112373352, "learning_rate": 0.00014986175115207375, "loss": 0.0274, "step": 277 }, { "epoch": 0.5096241979835013, "grad_norm": 0.06165534630417824, "learning_rate": 0.00014967741935483872, "loss": 0.0234, "step": 278 }, { "epoch": 0.5114573785517873, "grad_norm": 0.08029788732528687, "learning_rate": 0.0001494930875576037, "loss": 0.0272, "step": 279 }, { "epoch": 0.5132905591200734, "grad_norm": 0.08326773345470428, "learning_rate": 0.00014930875576036866, "loss": 0.0262, "step": 280 }, { "epoch": 0.5151237396883593, "grad_norm": 0.08208340406417847, "learning_rate": 0.00014912442396313363, "loss": 0.0264, "step": 281 }, { "epoch": 0.5169569202566453, "grad_norm": 0.0779559388756752, "learning_rate": 0.0001489400921658986, "loss": 0.0211, "step": 282 }, { "epoch": 0.5187901008249313, "grad_norm": 0.09281262755393982, "learning_rate": 0.0001487557603686636, "loss": 0.0346, "step": 283 }, { "epoch": 0.5206232813932172, "grad_norm": 0.09531724452972412, "learning_rate": 0.00014857142857142857, "loss": 0.0374, "step": 284 }, { "epoch": 0.5224564619615032, "grad_norm": 0.10953988879919052, "learning_rate": 0.00014838709677419355, "loss": 0.038, "step": 285 }, { "epoch": 0.5242896425297892, "grad_norm": 0.09352127462625504, "learning_rate": 0.00014820276497695854, "loss": 0.0378, "step": 286 }, { "epoch": 0.5261228230980751, "grad_norm": 0.05571591481566429, "learning_rate": 0.00014801843317972351, "loss": 0.02, "step": 287 }, { "epoch": 0.5279560036663611, "grad_norm": 0.09128769487142563, "learning_rate": 0.00014783410138248849, "loss": 0.0327, "step": 288 }, { "epoch": 0.5297891842346472, "grad_norm": 0.09302745014429092, "learning_rate": 0.00014764976958525348, "loss": 0.029, "step": 289 }, { "epoch": 0.5316223648029331, "grad_norm": 0.05895543098449707, "learning_rate": 0.00014746543778801845, "loss": 0.0203, "step": 290 }, { "epoch": 0.5334555453712191, "grad_norm": 0.08418423682451248, "learning_rate": 0.00014728110599078343, "loss": 0.0329, "step": 291 }, { "epoch": 0.535288725939505, "grad_norm": 0.09295564144849777, "learning_rate": 0.0001470967741935484, "loss": 0.0401, "step": 292 }, { "epoch": 0.537121906507791, "grad_norm": 0.08649936318397522, "learning_rate": 0.00014691244239631337, "loss": 0.0343, "step": 293 }, { "epoch": 0.538955087076077, "grad_norm": 0.08355950564146042, "learning_rate": 0.00014672811059907834, "loss": 0.0315, "step": 294 }, { "epoch": 0.5407882676443629, "grad_norm": 0.06531458348035812, "learning_rate": 0.00014654377880184334, "loss": 0.0217, "step": 295 }, { "epoch": 0.5426214482126489, "grad_norm": 0.09049852937459946, "learning_rate": 0.0001463594470046083, "loss": 0.0298, "step": 296 }, { "epoch": 0.5444546287809349, "grad_norm": 0.08500348031520844, "learning_rate": 0.00014617511520737328, "loss": 0.0253, "step": 297 }, { "epoch": 0.5462878093492209, "grad_norm": 0.09162382036447525, "learning_rate": 0.00014599078341013825, "loss": 0.0299, "step": 298 }, { "epoch": 0.5481209899175069, "grad_norm": 0.07611165195703506, "learning_rate": 0.00014580645161290322, "loss": 0.0256, "step": 299 }, { "epoch": 0.5499541704857929, "grad_norm": 0.09361441433429718, "learning_rate": 0.0001456221198156682, "loss": 0.029, "step": 300 }, { "epoch": 0.5517873510540788, "grad_norm": 0.09738872200250626, "learning_rate": 0.0001454377880184332, "loss": 0.0288, "step": 301 }, { "epoch": 0.5536205316223648, "grad_norm": 0.08112243562936783, "learning_rate": 0.00014525345622119816, "loss": 0.0311, "step": 302 }, { "epoch": 0.5554537121906508, "grad_norm": 0.08341687172651291, "learning_rate": 0.00014506912442396313, "loss": 0.0316, "step": 303 }, { "epoch": 0.5572868927589367, "grad_norm": 0.07399066537618637, "learning_rate": 0.0001448847926267281, "loss": 0.0257, "step": 304 }, { "epoch": 0.5591200733272227, "grad_norm": 0.07534675300121307, "learning_rate": 0.0001447004608294931, "loss": 0.0259, "step": 305 }, { "epoch": 0.5609532538955087, "grad_norm": 0.08384109288454056, "learning_rate": 0.00014451612903225807, "loss": 0.0277, "step": 306 }, { "epoch": 0.5627864344637947, "grad_norm": 0.08456786721944809, "learning_rate": 0.00014433179723502307, "loss": 0.0375, "step": 307 }, { "epoch": 0.5646196150320807, "grad_norm": 0.07096688449382782, "learning_rate": 0.00014414746543778804, "loss": 0.0241, "step": 308 }, { "epoch": 0.5664527956003667, "grad_norm": 0.09720040112733841, "learning_rate": 0.00014396313364055301, "loss": 0.0333, "step": 309 }, { "epoch": 0.5682859761686526, "grad_norm": 0.11616980284452438, "learning_rate": 0.00014377880184331799, "loss": 0.0431, "step": 310 }, { "epoch": 0.5701191567369386, "grad_norm": 0.08847475051879883, "learning_rate": 0.00014359447004608296, "loss": 0.0366, "step": 311 }, { "epoch": 0.5719523373052245, "grad_norm": 0.09937264025211334, "learning_rate": 0.00014341013824884793, "loss": 0.0371, "step": 312 }, { "epoch": 0.5737855178735105, "grad_norm": 0.059645283967256546, "learning_rate": 0.00014322580645161293, "loss": 0.0232, "step": 313 }, { "epoch": 0.5756186984417965, "grad_norm": 0.06614042073488235, "learning_rate": 0.0001430414746543779, "loss": 0.0234, "step": 314 }, { "epoch": 0.5774518790100825, "grad_norm": 0.10562101751565933, "learning_rate": 0.00014285714285714287, "loss": 0.0387, "step": 315 }, { "epoch": 0.5792850595783685, "grad_norm": 0.10281278938055038, "learning_rate": 0.00014267281105990784, "loss": 0.0339, "step": 316 }, { "epoch": 0.5811182401466545, "grad_norm": 0.10205813497304916, "learning_rate": 0.0001424884792626728, "loss": 0.0426, "step": 317 }, { "epoch": 0.5829514207149404, "grad_norm": 0.07571630924940109, "learning_rate": 0.00014230414746543778, "loss": 0.0311, "step": 318 }, { "epoch": 0.5847846012832264, "grad_norm": 0.08494407683610916, "learning_rate": 0.00014211981566820278, "loss": 0.0301, "step": 319 }, { "epoch": 0.5866177818515124, "grad_norm": 0.05080355703830719, "learning_rate": 0.00014193548387096775, "loss": 0.017, "step": 320 }, { "epoch": 0.5884509624197983, "grad_norm": 0.08271925151348114, "learning_rate": 0.00014175115207373272, "loss": 0.0304, "step": 321 }, { "epoch": 0.5902841429880843, "grad_norm": 0.0785074308514595, "learning_rate": 0.0001415668202764977, "loss": 0.0315, "step": 322 }, { "epoch": 0.5921173235563703, "grad_norm": 0.08633995056152344, "learning_rate": 0.00014138248847926266, "loss": 0.0366, "step": 323 }, { "epoch": 0.5939505041246563, "grad_norm": 0.07227237522602081, "learning_rate": 0.00014119815668202766, "loss": 0.0256, "step": 324 }, { "epoch": 0.5957836846929423, "grad_norm": 0.07901857793331146, "learning_rate": 0.00014101382488479263, "loss": 0.0224, "step": 325 }, { "epoch": 0.5976168652612283, "grad_norm": 0.0696311965584755, "learning_rate": 0.00014082949308755763, "loss": 0.0245, "step": 326 }, { "epoch": 0.5994500458295142, "grad_norm": 0.0882924422621727, "learning_rate": 0.0001406451612903226, "loss": 0.0349, "step": 327 }, { "epoch": 0.6012832263978002, "grad_norm": 0.07235971093177795, "learning_rate": 0.00014046082949308757, "loss": 0.0224, "step": 328 }, { "epoch": 0.6031164069660861, "grad_norm": 0.059238139539957047, "learning_rate": 0.00014027649769585254, "loss": 0.0204, "step": 329 }, { "epoch": 0.6049495875343721, "grad_norm": 0.07901135087013245, "learning_rate": 0.00014009216589861752, "loss": 0.0273, "step": 330 }, { "epoch": 0.6067827681026581, "grad_norm": 0.07229477912187576, "learning_rate": 0.0001399078341013825, "loss": 0.029, "step": 331 }, { "epoch": 0.608615948670944, "grad_norm": 0.07699091732501984, "learning_rate": 0.00013972350230414749, "loss": 0.026, "step": 332 }, { "epoch": 0.6104491292392301, "grad_norm": 0.08664306253194809, "learning_rate": 0.00013953917050691246, "loss": 0.032, "step": 333 }, { "epoch": 0.6122823098075161, "grad_norm": 0.09364963322877884, "learning_rate": 0.00013935483870967743, "loss": 0.0395, "step": 334 }, { "epoch": 0.614115490375802, "grad_norm": 0.0945357009768486, "learning_rate": 0.0001391705069124424, "loss": 0.0326, "step": 335 }, { "epoch": 0.615948670944088, "grad_norm": 0.08954328298568726, "learning_rate": 0.00013898617511520737, "loss": 0.0326, "step": 336 }, { "epoch": 0.617781851512374, "grad_norm": 0.09588358551263809, "learning_rate": 0.00013880184331797234, "loss": 0.0335, "step": 337 }, { "epoch": 0.6196150320806599, "grad_norm": 0.0669722706079483, "learning_rate": 0.00013861751152073734, "loss": 0.0217, "step": 338 }, { "epoch": 0.6214482126489459, "grad_norm": 0.06408808380365372, "learning_rate": 0.0001384331797235023, "loss": 0.0211, "step": 339 }, { "epoch": 0.6232813932172319, "grad_norm": 0.07464352995157242, "learning_rate": 0.00013824884792626728, "loss": 0.0314, "step": 340 }, { "epoch": 0.6251145737855178, "grad_norm": 0.08413645625114441, "learning_rate": 0.00013806451612903225, "loss": 0.024, "step": 341 }, { "epoch": 0.6269477543538039, "grad_norm": 0.08873338252305984, "learning_rate": 0.00013788018433179722, "loss": 0.0281, "step": 342 }, { "epoch": 0.6287809349220899, "grad_norm": 0.07131095975637436, "learning_rate": 0.00013769585253456222, "loss": 0.0288, "step": 343 }, { "epoch": 0.6306141154903758, "grad_norm": 0.05225027725100517, "learning_rate": 0.0001375115207373272, "loss": 0.0178, "step": 344 }, { "epoch": 0.6324472960586618, "grad_norm": 0.06744900345802307, "learning_rate": 0.0001373271889400922, "loss": 0.0217, "step": 345 }, { "epoch": 0.6342804766269478, "grad_norm": 0.07560716569423676, "learning_rate": 0.00013714285714285716, "loss": 0.0272, "step": 346 }, { "epoch": 0.6361136571952337, "grad_norm": 0.0856630727648735, "learning_rate": 0.00013695852534562213, "loss": 0.0296, "step": 347 }, { "epoch": 0.6379468377635197, "grad_norm": 0.07206695526838303, "learning_rate": 0.0001367741935483871, "loss": 0.0212, "step": 348 }, { "epoch": 0.6397800183318056, "grad_norm": 0.08413973450660706, "learning_rate": 0.00013658986175115208, "loss": 0.0309, "step": 349 }, { "epoch": 0.6416131989000916, "grad_norm": 0.08946281671524048, "learning_rate": 0.00013640552995391707, "loss": 0.0341, "step": 350 }, { "epoch": 0.6434463794683777, "grad_norm": 0.09534583985805511, "learning_rate": 0.00013622119815668204, "loss": 0.0301, "step": 351 }, { "epoch": 0.6452795600366636, "grad_norm": 0.11210379004478455, "learning_rate": 0.00013603686635944702, "loss": 0.0546, "step": 352 }, { "epoch": 0.6471127406049496, "grad_norm": 0.08674897998571396, "learning_rate": 0.000135852534562212, "loss": 0.034, "step": 353 }, { "epoch": 0.6489459211732356, "grad_norm": 0.0715477392077446, "learning_rate": 0.00013566820276497696, "loss": 0.0294, "step": 354 }, { "epoch": 0.6507791017415215, "grad_norm": 0.06651375442743301, "learning_rate": 0.00013548387096774193, "loss": 0.0246, "step": 355 }, { "epoch": 0.6526122823098075, "grad_norm": 0.103563591837883, "learning_rate": 0.00013529953917050693, "loss": 0.048, "step": 356 }, { "epoch": 0.6544454628780935, "grad_norm": 0.11457951366901398, "learning_rate": 0.0001351152073732719, "loss": 0.0287, "step": 357 }, { "epoch": 0.6562786434463794, "grad_norm": 0.060861390084028244, "learning_rate": 0.00013493087557603687, "loss": 0.025, "step": 358 }, { "epoch": 0.6581118240146654, "grad_norm": 0.058961618691682816, "learning_rate": 0.00013474654377880184, "loss": 0.0208, "step": 359 }, { "epoch": 0.6599450045829515, "grad_norm": 0.10199262201786041, "learning_rate": 0.0001345622119815668, "loss": 0.0319, "step": 360 }, { "epoch": 0.6617781851512374, "grad_norm": 0.0710933730006218, "learning_rate": 0.00013437788018433178, "loss": 0.0245, "step": 361 }, { "epoch": 0.6636113657195234, "grad_norm": 0.06901897490024567, "learning_rate": 0.00013419354838709678, "loss": 0.0189, "step": 362 }, { "epoch": 0.6654445462878094, "grad_norm": 0.08602811396121979, "learning_rate": 0.00013400921658986175, "loss": 0.0344, "step": 363 }, { "epoch": 0.6672777268560953, "grad_norm": 0.052524056285619736, "learning_rate": 0.00013382488479262675, "loss": 0.0211, "step": 364 }, { "epoch": 0.6691109074243813, "grad_norm": 0.07917725294828415, "learning_rate": 0.00013364055299539172, "loss": 0.0285, "step": 365 }, { "epoch": 0.6709440879926672, "grad_norm": 0.0799289420247078, "learning_rate": 0.0001334562211981567, "loss": 0.0296, "step": 366 }, { "epoch": 0.6727772685609532, "grad_norm": 0.08307263255119324, "learning_rate": 0.00013327188940092166, "loss": 0.0259, "step": 367 }, { "epoch": 0.6746104491292392, "grad_norm": 0.08490724861621857, "learning_rate": 0.00013308755760368666, "loss": 0.0317, "step": 368 }, { "epoch": 0.6764436296975253, "grad_norm": 0.07663150131702423, "learning_rate": 0.00013290322580645163, "loss": 0.0279, "step": 369 }, { "epoch": 0.6782768102658112, "grad_norm": 0.08253347873687744, "learning_rate": 0.0001327188940092166, "loss": 0.0323, "step": 370 }, { "epoch": 0.6801099908340972, "grad_norm": 0.06597882509231567, "learning_rate": 0.00013253456221198157, "loss": 0.0275, "step": 371 }, { "epoch": 0.6819431714023831, "grad_norm": 0.08167731761932373, "learning_rate": 0.00013235023041474655, "loss": 0.0256, "step": 372 }, { "epoch": 0.6837763519706691, "grad_norm": 0.1000673770904541, "learning_rate": 0.00013216589861751152, "loss": 0.0428, "step": 373 }, { "epoch": 0.6856095325389551, "grad_norm": 0.08605007827281952, "learning_rate": 0.00013198156682027652, "loss": 0.0365, "step": 374 }, { "epoch": 0.687442713107241, "grad_norm": 0.06620384752750397, "learning_rate": 0.0001317972350230415, "loss": 0.0212, "step": 375 }, { "epoch": 0.689275893675527, "grad_norm": 0.08665503561496735, "learning_rate": 0.00013161290322580646, "loss": 0.0311, "step": 376 }, { "epoch": 0.6911090742438131, "grad_norm": 0.06734751164913177, "learning_rate": 0.00013142857142857143, "loss": 0.0252, "step": 377 }, { "epoch": 0.692942254812099, "grad_norm": 0.06160259246826172, "learning_rate": 0.0001312442396313364, "loss": 0.0238, "step": 378 }, { "epoch": 0.694775435380385, "grad_norm": 0.0831260159611702, "learning_rate": 0.00013105990783410137, "loss": 0.0322, "step": 379 }, { "epoch": 0.696608615948671, "grad_norm": 0.07255002856254578, "learning_rate": 0.00013087557603686637, "loss": 0.0265, "step": 380 }, { "epoch": 0.6984417965169569, "grad_norm": 0.08250346034765244, "learning_rate": 0.00013069124423963134, "loss": 0.031, "step": 381 }, { "epoch": 0.7002749770852429, "grad_norm": 0.06267958134412766, "learning_rate": 0.0001305069124423963, "loss": 0.023, "step": 382 }, { "epoch": 0.7021081576535289, "grad_norm": 0.08702743053436279, "learning_rate": 0.0001303225806451613, "loss": 0.0294, "step": 383 }, { "epoch": 0.7039413382218148, "grad_norm": 0.08447282761335373, "learning_rate": 0.00013013824884792628, "loss": 0.0282, "step": 384 }, { "epoch": 0.7057745187901008, "grad_norm": 0.0854048877954483, "learning_rate": 0.00012995391705069125, "loss": 0.0301, "step": 385 }, { "epoch": 0.7076076993583869, "grad_norm": 0.08276678621768951, "learning_rate": 0.00012976958525345625, "loss": 0.0273, "step": 386 }, { "epoch": 0.7094408799266728, "grad_norm": 0.11089053750038147, "learning_rate": 0.00012958525345622122, "loss": 0.0385, "step": 387 }, { "epoch": 0.7112740604949588, "grad_norm": 0.08636972308158875, "learning_rate": 0.0001294009216589862, "loss": 0.0272, "step": 388 }, { "epoch": 0.7131072410632447, "grad_norm": 0.09062766283750534, "learning_rate": 0.00012921658986175116, "loss": 0.0327, "step": 389 }, { "epoch": 0.7149404216315307, "grad_norm": 0.0708693340420723, "learning_rate": 0.00012903225806451613, "loss": 0.0249, "step": 390 }, { "epoch": 0.7167736021998167, "grad_norm": 0.07402048259973526, "learning_rate": 0.0001288479262672811, "loss": 0.0184, "step": 391 }, { "epoch": 0.7186067827681026, "grad_norm": 0.13229697942733765, "learning_rate": 0.00012866359447004608, "loss": 0.0489, "step": 392 }, { "epoch": 0.7204399633363886, "grad_norm": 0.06817866861820221, "learning_rate": 0.00012847926267281107, "loss": 0.0233, "step": 393 }, { "epoch": 0.7222731439046746, "grad_norm": 0.09490000456571579, "learning_rate": 0.00012829493087557605, "loss": 0.0303, "step": 394 }, { "epoch": 0.7241063244729606, "grad_norm": 0.06846782565116882, "learning_rate": 0.00012811059907834102, "loss": 0.0238, "step": 395 }, { "epoch": 0.7259395050412466, "grad_norm": 0.09812495112419128, "learning_rate": 0.000127926267281106, "loss": 0.0324, "step": 396 }, { "epoch": 0.7277726856095326, "grad_norm": 0.08136089891195297, "learning_rate": 0.00012774193548387096, "loss": 0.0309, "step": 397 }, { "epoch": 0.7296058661778185, "grad_norm": 0.07089602202177048, "learning_rate": 0.00012755760368663593, "loss": 0.0251, "step": 398 }, { "epoch": 0.7314390467461045, "grad_norm": 0.10192608833312988, "learning_rate": 0.00012737327188940093, "loss": 0.0345, "step": 399 }, { "epoch": 0.7332722273143905, "grad_norm": 0.07810863107442856, "learning_rate": 0.0001271889400921659, "loss": 0.0264, "step": 400 }, { "epoch": 0.7351054078826764, "grad_norm": 0.0839025229215622, "learning_rate": 0.00012700460829493087, "loss": 0.0279, "step": 401 }, { "epoch": 0.7369385884509624, "grad_norm": 0.08430635184049606, "learning_rate": 0.00012682027649769587, "loss": 0.0345, "step": 402 }, { "epoch": 0.7387717690192483, "grad_norm": 0.05288069695234299, "learning_rate": 0.00012663594470046084, "loss": 0.0181, "step": 403 }, { "epoch": 0.7406049495875344, "grad_norm": 0.06337739527225494, "learning_rate": 0.0001264516129032258, "loss": 0.024, "step": 404 }, { "epoch": 0.7424381301558204, "grad_norm": 0.08434322476387024, "learning_rate": 0.0001262672811059908, "loss": 0.0277, "step": 405 }, { "epoch": 0.7442713107241063, "grad_norm": 0.07922643423080444, "learning_rate": 0.00012608294930875578, "loss": 0.0277, "step": 406 }, { "epoch": 0.7461044912923923, "grad_norm": 0.0888214185833931, "learning_rate": 0.00012589861751152075, "loss": 0.0405, "step": 407 }, { "epoch": 0.7479376718606783, "grad_norm": 0.08394885063171387, "learning_rate": 0.00012571428571428572, "loss": 0.0268, "step": 408 }, { "epoch": 0.7497708524289642, "grad_norm": 0.08741919696331024, "learning_rate": 0.0001255299539170507, "loss": 0.034, "step": 409 }, { "epoch": 0.7516040329972502, "grad_norm": 0.08985461294651031, "learning_rate": 0.00012534562211981566, "loss": 0.0349, "step": 410 }, { "epoch": 0.7534372135655362, "grad_norm": 0.055699512362480164, "learning_rate": 0.00012516129032258066, "loss": 0.0205, "step": 411 }, { "epoch": 0.7552703941338221, "grad_norm": 0.09050854295492172, "learning_rate": 0.00012497695852534563, "loss": 0.0283, "step": 412 }, { "epoch": 0.7571035747021082, "grad_norm": 0.11931566148996353, "learning_rate": 0.0001247926267281106, "loss": 0.0614, "step": 413 }, { "epoch": 0.7589367552703942, "grad_norm": 0.06575947254896164, "learning_rate": 0.00012460829493087558, "loss": 0.0214, "step": 414 }, { "epoch": 0.7607699358386801, "grad_norm": 0.0591006726026535, "learning_rate": 0.00012442396313364055, "loss": 0.0198, "step": 415 }, { "epoch": 0.7626031164069661, "grad_norm": 0.12222932279109955, "learning_rate": 0.00012423963133640552, "loss": 0.0453, "step": 416 }, { "epoch": 0.7644362969752521, "grad_norm": 0.07346849143505096, "learning_rate": 0.00012405529953917052, "loss": 0.027, "step": 417 }, { "epoch": 0.766269477543538, "grad_norm": 0.05644283443689346, "learning_rate": 0.0001238709677419355, "loss": 0.0225, "step": 418 }, { "epoch": 0.768102658111824, "grad_norm": 0.07721511274576187, "learning_rate": 0.00012368663594470046, "loss": 0.0279, "step": 419 }, { "epoch": 0.76993583868011, "grad_norm": 0.08552254736423492, "learning_rate": 0.00012350230414746543, "loss": 0.0294, "step": 420 }, { "epoch": 0.7717690192483959, "grad_norm": 0.06860620528459549, "learning_rate": 0.00012331797235023043, "loss": 0.0287, "step": 421 }, { "epoch": 0.773602199816682, "grad_norm": 0.09986478835344315, "learning_rate": 0.0001231336405529954, "loss": 0.0396, "step": 422 }, { "epoch": 0.775435380384968, "grad_norm": 0.08369968086481094, "learning_rate": 0.00012294930875576037, "loss": 0.0251, "step": 423 }, { "epoch": 0.7772685609532539, "grad_norm": 0.06935148686170578, "learning_rate": 0.00012276497695852537, "loss": 0.0261, "step": 424 }, { "epoch": 0.7791017415215399, "grad_norm": 0.08505766093730927, "learning_rate": 0.00012258064516129034, "loss": 0.0318, "step": 425 }, { "epoch": 0.7809349220898258, "grad_norm": 0.07806456089019775, "learning_rate": 0.0001223963133640553, "loss": 0.0264, "step": 426 }, { "epoch": 0.7827681026581118, "grad_norm": 0.10398893058300018, "learning_rate": 0.00012221198156682028, "loss": 0.0397, "step": 427 }, { "epoch": 0.7846012832263978, "grad_norm": 0.07210515439510345, "learning_rate": 0.00012202764976958525, "loss": 0.0292, "step": 428 }, { "epoch": 0.7864344637946837, "grad_norm": 0.06596438586711884, "learning_rate": 0.00012184331797235025, "loss": 0.0233, "step": 429 }, { "epoch": 0.7882676443629697, "grad_norm": 0.11197281628847122, "learning_rate": 0.00012165898617511522, "loss": 0.0362, "step": 430 }, { "epoch": 0.7901008249312558, "grad_norm": 0.10142118483781815, "learning_rate": 0.0001214746543778802, "loss": 0.0384, "step": 431 }, { "epoch": 0.7919340054995417, "grad_norm": 0.08676854521036148, "learning_rate": 0.00012129032258064516, "loss": 0.0287, "step": 432 }, { "epoch": 0.7937671860678277, "grad_norm": 0.08402638882398605, "learning_rate": 0.00012110599078341014, "loss": 0.03, "step": 433 }, { "epoch": 0.7956003666361137, "grad_norm": 0.07576748728752136, "learning_rate": 0.00012092165898617511, "loss": 0.027, "step": 434 }, { "epoch": 0.7974335472043996, "grad_norm": 0.08877792209386826, "learning_rate": 0.0001207373271889401, "loss": 0.0294, "step": 435 }, { "epoch": 0.7992667277726856, "grad_norm": 0.09211436659097672, "learning_rate": 0.00012055299539170508, "loss": 0.0306, "step": 436 }, { "epoch": 0.8010999083409716, "grad_norm": 0.0953056812286377, "learning_rate": 0.00012036866359447006, "loss": 0.0343, "step": 437 }, { "epoch": 0.8029330889092575, "grad_norm": 0.08642080426216125, "learning_rate": 0.00012018433179723503, "loss": 0.0343, "step": 438 }, { "epoch": 0.8047662694775435, "grad_norm": 0.09826899319887161, "learning_rate": 0.00012, "loss": 0.0301, "step": 439 }, { "epoch": 0.8065994500458296, "grad_norm": 0.09221632778644562, "learning_rate": 0.00011981566820276497, "loss": 0.0342, "step": 440 }, { "epoch": 0.8084326306141155, "grad_norm": 0.091212198138237, "learning_rate": 0.00011963133640552997, "loss": 0.0388, "step": 441 }, { "epoch": 0.8102658111824015, "grad_norm": 0.07262887060642242, "learning_rate": 0.00011944700460829494, "loss": 0.0269, "step": 442 }, { "epoch": 0.8120989917506874, "grad_norm": 0.09429402649402618, "learning_rate": 0.00011926267281105991, "loss": 0.0344, "step": 443 }, { "epoch": 0.8139321723189734, "grad_norm": 0.07602915167808533, "learning_rate": 0.00011907834101382489, "loss": 0.0229, "step": 444 }, { "epoch": 0.8157653528872594, "grad_norm": 0.08622591942548752, "learning_rate": 0.00011889400921658986, "loss": 0.0299, "step": 445 }, { "epoch": 0.8175985334555453, "grad_norm": 0.07046458125114441, "learning_rate": 0.00011870967741935484, "loss": 0.0283, "step": 446 }, { "epoch": 0.8194317140238313, "grad_norm": 0.06315992772579193, "learning_rate": 0.00011852534562211983, "loss": 0.0231, "step": 447 }, { "epoch": 0.8212648945921174, "grad_norm": 0.11382705718278885, "learning_rate": 0.00011834101382488481, "loss": 0.0292, "step": 448 }, { "epoch": 0.8230980751604033, "grad_norm": 0.0543043278157711, "learning_rate": 0.00011815668202764978, "loss": 0.021, "step": 449 }, { "epoch": 0.8249312557286893, "grad_norm": 0.08020134270191193, "learning_rate": 0.00011797235023041475, "loss": 0.024, "step": 450 }, { "epoch": 0.8267644362969753, "grad_norm": 0.08001308888196945, "learning_rate": 0.00011778801843317972, "loss": 0.026, "step": 451 }, { "epoch": 0.8285976168652612, "grad_norm": 0.07345568388700485, "learning_rate": 0.0001176036866359447, "loss": 0.0252, "step": 452 }, { "epoch": 0.8304307974335472, "grad_norm": 0.08047681301832199, "learning_rate": 0.00011741935483870967, "loss": 0.0382, "step": 453 }, { "epoch": 0.8322639780018332, "grad_norm": 0.07006672024726868, "learning_rate": 0.00011723502304147466, "loss": 0.0282, "step": 454 }, { "epoch": 0.8340971585701191, "grad_norm": 0.08989156782627106, "learning_rate": 0.00011705069124423964, "loss": 0.0358, "step": 455 }, { "epoch": 0.8359303391384051, "grad_norm": 0.07250788062810898, "learning_rate": 0.00011686635944700462, "loss": 0.0282, "step": 456 }, { "epoch": 0.8377635197066912, "grad_norm": 0.07556509971618652, "learning_rate": 0.00011668202764976959, "loss": 0.0285, "step": 457 }, { "epoch": 0.8395967002749771, "grad_norm": 0.07886163890361786, "learning_rate": 0.00011649769585253456, "loss": 0.0245, "step": 458 }, { "epoch": 0.8414298808432631, "grad_norm": 0.10710224509239197, "learning_rate": 0.00011631336405529953, "loss": 0.0448, "step": 459 }, { "epoch": 0.843263061411549, "grad_norm": 0.092331163585186, "learning_rate": 0.00011612903225806453, "loss": 0.0319, "step": 460 }, { "epoch": 0.845096241979835, "grad_norm": 0.07957134395837784, "learning_rate": 0.0001159447004608295, "loss": 0.0308, "step": 461 }, { "epoch": 0.846929422548121, "grad_norm": 0.06576257944107056, "learning_rate": 0.00011576036866359447, "loss": 0.0244, "step": 462 }, { "epoch": 0.8487626031164069, "grad_norm": 0.060584548860788345, "learning_rate": 0.00011557603686635945, "loss": 0.0211, "step": 463 }, { "epoch": 0.8505957836846929, "grad_norm": 0.0770929604768753, "learning_rate": 0.00011539170506912442, "loss": 0.0283, "step": 464 }, { "epoch": 0.8524289642529789, "grad_norm": 0.10374728590250015, "learning_rate": 0.0001152073732718894, "loss": 0.031, "step": 465 }, { "epoch": 0.854262144821265, "grad_norm": 0.08729080855846405, "learning_rate": 0.00011502304147465439, "loss": 0.0292, "step": 466 }, { "epoch": 0.8560953253895509, "grad_norm": 0.08986352384090424, "learning_rate": 0.00011483870967741937, "loss": 0.0416, "step": 467 }, { "epoch": 0.8579285059578369, "grad_norm": 0.09187289327383041, "learning_rate": 0.00011465437788018434, "loss": 0.0379, "step": 468 }, { "epoch": 0.8597616865261228, "grad_norm": 0.07304208725690842, "learning_rate": 0.00011447004608294931, "loss": 0.0233, "step": 469 }, { "epoch": 0.8615948670944088, "grad_norm": 0.08240114897489548, "learning_rate": 0.00011428571428571428, "loss": 0.0231, "step": 470 }, { "epoch": 0.8634280476626948, "grad_norm": 0.08332082629203796, "learning_rate": 0.00011410138248847925, "loss": 0.0256, "step": 471 }, { "epoch": 0.8652612282309807, "grad_norm": 0.09715988487005234, "learning_rate": 0.00011391705069124425, "loss": 0.0389, "step": 472 }, { "epoch": 0.8670944087992667, "grad_norm": 0.0702865719795227, "learning_rate": 0.00011373271889400922, "loss": 0.0241, "step": 473 }, { "epoch": 0.8689275893675527, "grad_norm": 0.07741749286651611, "learning_rate": 0.0001135483870967742, "loss": 0.0294, "step": 474 }, { "epoch": 0.8707607699358387, "grad_norm": 0.08457162976264954, "learning_rate": 0.00011336405529953918, "loss": 0.0273, "step": 475 }, { "epoch": 0.8725939505041247, "grad_norm": 0.08684583753347397, "learning_rate": 0.00011317972350230415, "loss": 0.0301, "step": 476 }, { "epoch": 0.8744271310724107, "grad_norm": 0.07238451391458511, "learning_rate": 0.00011299539170506912, "loss": 0.0299, "step": 477 }, { "epoch": 0.8762603116406966, "grad_norm": 0.06936534494161606, "learning_rate": 0.00011281105990783412, "loss": 0.0255, "step": 478 }, { "epoch": 0.8780934922089826, "grad_norm": 0.0781572014093399, "learning_rate": 0.00011262672811059909, "loss": 0.0272, "step": 479 }, { "epoch": 0.8799266727772685, "grad_norm": 0.05346061289310455, "learning_rate": 0.00011244239631336406, "loss": 0.0186, "step": 480 }, { "epoch": 0.8817598533455545, "grad_norm": 0.10047460347414017, "learning_rate": 0.00011225806451612903, "loss": 0.0261, "step": 481 }, { "epoch": 0.8835930339138405, "grad_norm": 0.09505181759595871, "learning_rate": 0.000112073732718894, "loss": 0.0293, "step": 482 }, { "epoch": 0.8854262144821264, "grad_norm": 0.08039534837007523, "learning_rate": 0.00011188940092165898, "loss": 0.0339, "step": 483 }, { "epoch": 0.8872593950504125, "grad_norm": 0.0761064887046814, "learning_rate": 0.00011170506912442397, "loss": 0.032, "step": 484 }, { "epoch": 0.8890925756186985, "grad_norm": 0.0830230712890625, "learning_rate": 0.00011152073732718894, "loss": 0.0282, "step": 485 }, { "epoch": 0.8909257561869844, "grad_norm": 0.0726420059800148, "learning_rate": 0.00011133640552995393, "loss": 0.0293, "step": 486 }, { "epoch": 0.8927589367552704, "grad_norm": 0.07475108653306961, "learning_rate": 0.0001111520737327189, "loss": 0.0311, "step": 487 }, { "epoch": 0.8945921173235564, "grad_norm": 0.07357434183359146, "learning_rate": 0.00011096774193548387, "loss": 0.0255, "step": 488 }, { "epoch": 0.8964252978918423, "grad_norm": 0.09986915439367294, "learning_rate": 0.00011078341013824884, "loss": 0.0312, "step": 489 }, { "epoch": 0.8982584784601283, "grad_norm": 0.10107365250587463, "learning_rate": 0.00011059907834101384, "loss": 0.037, "step": 490 }, { "epoch": 0.9000916590284143, "grad_norm": 0.08120604604482651, "learning_rate": 0.00011041474654377881, "loss": 0.035, "step": 491 }, { "epoch": 0.9019248395967002, "grad_norm": 0.05490780994296074, "learning_rate": 0.00011023041474654378, "loss": 0.0202, "step": 492 }, { "epoch": 0.9037580201649863, "grad_norm": 0.10805200785398483, "learning_rate": 0.00011004608294930875, "loss": 0.045, "step": 493 }, { "epoch": 0.9055912007332723, "grad_norm": 0.05866795405745506, "learning_rate": 0.00010986175115207374, "loss": 0.0203, "step": 494 }, { "epoch": 0.9074243813015582, "grad_norm": 0.09001267701387405, "learning_rate": 0.00010967741935483871, "loss": 0.0286, "step": 495 }, { "epoch": 0.9092575618698442, "grad_norm": 0.0694073960185051, "learning_rate": 0.00010949308755760371, "loss": 0.0318, "step": 496 }, { "epoch": 0.9110907424381302, "grad_norm": 0.06593529880046844, "learning_rate": 0.00010930875576036868, "loss": 0.0227, "step": 497 }, { "epoch": 0.9129239230064161, "grad_norm": 0.06799976527690887, "learning_rate": 0.00010912442396313365, "loss": 0.024, "step": 498 }, { "epoch": 0.9147571035747021, "grad_norm": 0.07797495275735855, "learning_rate": 0.00010894009216589862, "loss": 0.0268, "step": 499 }, { "epoch": 0.916590284142988, "grad_norm": 0.07280347496271133, "learning_rate": 0.00010875576036866359, "loss": 0.0252, "step": 500 } ], "logging_steps": 1, "max_steps": 1090, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5010830413889536e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }