diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2107882485150103, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00032107882485150104, + "grad_norm": 15.157858848571777, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.4811, + "step": 1 + }, + { + "epoch": 0.0006421576497030021, + "grad_norm": 15.802922248840332, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.3901, + "step": 2 + }, + { + "epoch": 0.0009632364745545031, + "grad_norm": 23.659759521484375, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3375, + "step": 3 + }, + { + "epoch": 0.0012843152994060042, + "grad_norm": 27.097450256347656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.3807, + "step": 4 + }, + { + "epoch": 0.0016053941242575051, + "grad_norm": 29.78221321105957, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4191, + "step": 5 + }, + { + "epoch": 0.0019264729491090063, + "grad_norm": 33.7828254699707, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.5859, + "step": 6 + }, + { + "epoch": 0.002247551773960507, + "grad_norm": 43.44171142578125, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.8257, + "step": 7 + }, + { + "epoch": 0.0025686305988120084, + "grad_norm": 51.39571762084961, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.0337, + "step": 8 + }, + { + "epoch": 0.0028897094236635095, + "grad_norm": 54.60009002685547, + "learning_rate": 1.8e-06, + "loss": 3.1402, + "step": 9 + }, + { + "epoch": 0.0032107882485150102, + "grad_norm": 57.4810791015625, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.4821, + "step": 10 + }, + { + "epoch": 0.0035318670733665114, + "grad_norm": 61.211952209472656, + "learning_rate": 2.2e-06, + "loss": 3.4304, + "step": 11 + }, + { + "epoch": 0.0038529458982180125, + "grad_norm": 61.09406280517578, + "learning_rate": 2.4000000000000003e-06, + "loss": 3.5862, + "step": 12 + }, + { + "epoch": 0.004174024723069513, + "grad_norm": 61.53642654418945, + "learning_rate": 2.6e-06, + "loss": 3.6255, + "step": 13 + }, + { + "epoch": 0.004495103547921014, + "grad_norm": 58.33311080932617, + "learning_rate": 2.8000000000000003e-06, + "loss": 3.7706, + "step": 14 + }, + { + "epoch": 0.0048161823727725156, + "grad_norm": 57.1992301940918, + "learning_rate": 3e-06, + "loss": 3.7195, + "step": 15 + }, + { + "epoch": 0.005137261197624017, + "grad_norm": 55.674320220947266, + "learning_rate": 3.2000000000000003e-06, + "loss": 3.6768, + "step": 16 + }, + { + "epoch": 0.005458340022475518, + "grad_norm": 54.121376037597656, + "learning_rate": 3.4000000000000005e-06, + "loss": 3.9861, + "step": 17 + }, + { + "epoch": 0.005779418847327019, + "grad_norm": 52.377349853515625, + "learning_rate": 3.6e-06, + "loss": 3.8352, + "step": 18 + }, + { + "epoch": 0.00610049767217852, + "grad_norm": 50.22018051147461, + "learning_rate": 3.8e-06, + "loss": 3.9499, + "step": 19 + }, + { + "epoch": 0.0064215764970300205, + "grad_norm": 47.37348556518555, + "learning_rate": 4.000000000000001e-06, + "loss": 3.8506, + "step": 20 + }, + { + "epoch": 0.006742655321881522, + "grad_norm": 44.72926712036133, + "learning_rate": 4.2000000000000004e-06, + "loss": 4.0215, + "step": 21 + }, + { + "epoch": 0.007063734146733023, + "grad_norm": 44.81752014160156, + "learning_rate": 4.4e-06, + "loss": 3.6998, + "step": 22 + }, + { + "epoch": 0.007384812971584524, + "grad_norm": 43.61850357055664, + "learning_rate": 4.6e-06, + "loss": 3.7522, + "step": 23 + }, + { + "epoch": 0.007705891796436025, + "grad_norm": 45.95828628540039, + "learning_rate": 4.800000000000001e-06, + "loss": 3.816, + "step": 24 + }, + { + "epoch": 0.008026970621287526, + "grad_norm": 49.80207443237305, + "learning_rate": 5e-06, + "loss": 4.0823, + "step": 25 + }, + { + "epoch": 0.008348049446139027, + "grad_norm": 51.24964141845703, + "learning_rate": 5.2e-06, + "loss": 3.9784, + "step": 26 + }, + { + "epoch": 0.008669128270990529, + "grad_norm": 54.43299102783203, + "learning_rate": 5.4e-06, + "loss": 3.9404, + "step": 27 + }, + { + "epoch": 0.008990207095842029, + "grad_norm": 55.02586364746094, + "learning_rate": 5.600000000000001e-06, + "loss": 3.917, + "step": 28 + }, + { + "epoch": 0.00931128592069353, + "grad_norm": 55.936119079589844, + "learning_rate": 5.8e-06, + "loss": 3.852, + "step": 29 + }, + { + "epoch": 0.009632364745545031, + "grad_norm": 53.62590789794922, + "learning_rate": 6e-06, + "loss": 3.8488, + "step": 30 + }, + { + "epoch": 0.009953443570396533, + "grad_norm": 50.38666534423828, + "learning_rate": 6.2e-06, + "loss": 3.3054, + "step": 31 + }, + { + "epoch": 0.010274522395248033, + "grad_norm": 49.8444709777832, + "learning_rate": 6.4000000000000006e-06, + "loss": 3.6827, + "step": 32 + }, + { + "epoch": 0.010595601220099534, + "grad_norm": 46.34492111206055, + "learning_rate": 6.6e-06, + "loss": 3.7251, + "step": 33 + }, + { + "epoch": 0.010916680044951036, + "grad_norm": 45.03758239746094, + "learning_rate": 6.800000000000001e-06, + "loss": 3.6144, + "step": 34 + }, + { + "epoch": 0.011237758869802536, + "grad_norm": 40.92512893676758, + "learning_rate": 7.000000000000001e-06, + "loss": 3.3677, + "step": 35 + }, + { + "epoch": 0.011558837694654038, + "grad_norm": 36.829925537109375, + "learning_rate": 7.2e-06, + "loss": 3.1936, + "step": 36 + }, + { + "epoch": 0.011879916519505538, + "grad_norm": 40.46910858154297, + "learning_rate": 7.4e-06, + "loss": 3.4061, + "step": 37 + }, + { + "epoch": 0.01220099534435704, + "grad_norm": 54.956382751464844, + "learning_rate": 7.6e-06, + "loss": 3.5662, + "step": 38 + }, + { + "epoch": 0.01252207416920854, + "grad_norm": 57.516902923583984, + "learning_rate": 7.8e-06, + "loss": 3.0622, + "step": 39 + }, + { + "epoch": 0.012843152994060041, + "grad_norm": 61.08616256713867, + "learning_rate": 8.000000000000001e-06, + "loss": 3.5104, + "step": 40 + }, + { + "epoch": 0.013164231818911543, + "grad_norm": 58.04061508178711, + "learning_rate": 8.200000000000001e-06, + "loss": 2.9037, + "step": 41 + }, + { + "epoch": 0.013485310643763043, + "grad_norm": 56.112709045410156, + "learning_rate": 8.400000000000001e-06, + "loss": 2.8564, + "step": 42 + }, + { + "epoch": 0.013806389468614545, + "grad_norm": 50.86652755737305, + "learning_rate": 8.599999999999999e-06, + "loss": 3.1863, + "step": 43 + }, + { + "epoch": 0.014127468293466046, + "grad_norm": 41.73533630371094, + "learning_rate": 8.8e-06, + "loss": 3.0879, + "step": 44 + }, + { + "epoch": 0.014448547118317548, + "grad_norm": 36.46621322631836, + "learning_rate": 9e-06, + "loss": 3.1359, + "step": 45 + }, + { + "epoch": 0.014769625943169048, + "grad_norm": 28.671815872192383, + "learning_rate": 9.2e-06, + "loss": 2.9814, + "step": 46 + }, + { + "epoch": 0.01509070476802055, + "grad_norm": 24.950931549072266, + "learning_rate": 9.4e-06, + "loss": 2.8951, + "step": 47 + }, + { + "epoch": 0.01541178359287205, + "grad_norm": 27.621028900146484, + "learning_rate": 9.600000000000001e-06, + "loss": 2.6941, + "step": 48 + }, + { + "epoch": 0.01573286241772355, + "grad_norm": 27.553085327148438, + "learning_rate": 9.800000000000001e-06, + "loss": 2.5166, + "step": 49 + }, + { + "epoch": 0.016053941242575052, + "grad_norm": 26.491987228393555, + "learning_rate": 1e-05, + "loss": 2.6277, + "step": 50 + }, + { + "epoch": 0.016375020067426554, + "grad_norm": 5.4101104736328125, + "learning_rate": 1.02e-05, + "loss": 1.925, + "step": 51 + }, + { + "epoch": 0.016696098892278053, + "grad_norm": 4.891691207885742, + "learning_rate": 1.04e-05, + "loss": 1.8656, + "step": 52 + }, + { + "epoch": 0.017017177717129555, + "grad_norm": 5.655025482177734, + "learning_rate": 1.06e-05, + "loss": 1.66, + "step": 53 + }, + { + "epoch": 0.017338256541981057, + "grad_norm": 6.683502674102783, + "learning_rate": 1.08e-05, + "loss": 1.6638, + "step": 54 + }, + { + "epoch": 0.01765933536683256, + "grad_norm": 6.518195152282715, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.6495, + "step": 55 + }, + { + "epoch": 0.017980414191684058, + "grad_norm": 6.019643783569336, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.7143, + "step": 56 + }, + { + "epoch": 0.01830149301653556, + "grad_norm": 5.8600053787231445, + "learning_rate": 1.1400000000000001e-05, + "loss": 1.3576, + "step": 57 + }, + { + "epoch": 0.01862257184138706, + "grad_norm": 6.502221584320068, + "learning_rate": 1.16e-05, + "loss": 1.4601, + "step": 58 + }, + { + "epoch": 0.01894365066623856, + "grad_norm": 5.659090995788574, + "learning_rate": 1.18e-05, + "loss": 1.4309, + "step": 59 + }, + { + "epoch": 0.019264729491090062, + "grad_norm": 5.396006107330322, + "learning_rate": 1.2e-05, + "loss": 1.6735, + "step": 60 + }, + { + "epoch": 0.019585808315941564, + "grad_norm": 5.009734630584717, + "learning_rate": 1.22e-05, + "loss": 1.5251, + "step": 61 + }, + { + "epoch": 0.019906887140793066, + "grad_norm": 5.51364278793335, + "learning_rate": 1.24e-05, + "loss": 1.8121, + "step": 62 + }, + { + "epoch": 0.020227965965644565, + "grad_norm": 4.808631420135498, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.5659, + "step": 63 + }, + { + "epoch": 0.020549044790496067, + "grad_norm": 3.892801523208618, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.6308, + "step": 64 + }, + { + "epoch": 0.02087012361534757, + "grad_norm": 3.904027223587036, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.5642, + "step": 65 + }, + { + "epoch": 0.021191202440199067, + "grad_norm": 4.057150840759277, + "learning_rate": 1.32e-05, + "loss": 1.4334, + "step": 66 + }, + { + "epoch": 0.02151228126505057, + "grad_norm": 4.412062168121338, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.5825, + "step": 67 + }, + { + "epoch": 0.02183336008990207, + "grad_norm": 3.494084358215332, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.5724, + "step": 68 + }, + { + "epoch": 0.022154438914753573, + "grad_norm": 4.483047008514404, + "learning_rate": 1.3800000000000002e-05, + "loss": 1.688, + "step": 69 + }, + { + "epoch": 0.022475517739605072, + "grad_norm": 3.417116165161133, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.5698, + "step": 70 + }, + { + "epoch": 0.022796596564456574, + "grad_norm": 6.7289838790893555, + "learning_rate": 1.42e-05, + "loss": 1.6379, + "step": 71 + }, + { + "epoch": 0.023117675389308076, + "grad_norm": 3.190251111984253, + "learning_rate": 1.44e-05, + "loss": 1.4405, + "step": 72 + }, + { + "epoch": 0.023438754214159575, + "grad_norm": 3.8525373935699463, + "learning_rate": 1.4599999999999999e-05, + "loss": 1.5232, + "step": 73 + }, + { + "epoch": 0.023759833039011077, + "grad_norm": 3.5374369621276855, + "learning_rate": 1.48e-05, + "loss": 1.7238, + "step": 74 + }, + { + "epoch": 0.02408091186386258, + "grad_norm": 5.770689964294434, + "learning_rate": 1.5e-05, + "loss": 1.5226, + "step": 75 + }, + { + "epoch": 0.02440199068871408, + "grad_norm": 4.280745506286621, + "learning_rate": 1.52e-05, + "loss": 1.6567, + "step": 76 + }, + { + "epoch": 0.02472306951356558, + "grad_norm": 3.3552117347717285, + "learning_rate": 1.54e-05, + "loss": 1.4622, + "step": 77 + }, + { + "epoch": 0.02504414833841708, + "grad_norm": 4.613933563232422, + "learning_rate": 1.56e-05, + "loss": 1.612, + "step": 78 + }, + { + "epoch": 0.025365227163268583, + "grad_norm": 4.299830913543701, + "learning_rate": 1.58e-05, + "loss": 1.7098, + "step": 79 + }, + { + "epoch": 0.025686305988120082, + "grad_norm": 3.547947406768799, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.4593, + "step": 80 + }, + { + "epoch": 0.026007384812971584, + "grad_norm": 3.5630974769592285, + "learning_rate": 1.62e-05, + "loss": 1.6422, + "step": 81 + }, + { + "epoch": 0.026328463637823086, + "grad_norm": 3.984318256378174, + "learning_rate": 1.6400000000000002e-05, + "loss": 1.5161, + "step": 82 + }, + { + "epoch": 0.026649542462674588, + "grad_norm": 4.502562046051025, + "learning_rate": 1.66e-05, + "loss": 1.7414, + "step": 83 + }, + { + "epoch": 0.026970621287526086, + "grad_norm": 3.2806875705718994, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.4668, + "step": 84 + }, + { + "epoch": 0.02729170011237759, + "grad_norm": 3.762796401977539, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.6335, + "step": 85 + }, + { + "epoch": 0.02761277893722909, + "grad_norm": 3.4556496143341064, + "learning_rate": 1.7199999999999998e-05, + "loss": 1.6127, + "step": 86 + }, + { + "epoch": 0.027933857762080593, + "grad_norm": 3.3840537071228027, + "learning_rate": 1.74e-05, + "loss": 1.4251, + "step": 87 + }, + { + "epoch": 0.02825493658693209, + "grad_norm": 3.363680601119995, + "learning_rate": 1.76e-05, + "loss": 1.6259, + "step": 88 + }, + { + "epoch": 0.028576015411783593, + "grad_norm": 3.2910609245300293, + "learning_rate": 1.78e-05, + "loss": 1.5546, + "step": 89 + }, + { + "epoch": 0.028897094236635095, + "grad_norm": 2.9161438941955566, + "learning_rate": 1.8e-05, + "loss": 1.2032, + "step": 90 + }, + { + "epoch": 0.029218173061486594, + "grad_norm": 3.4742963314056396, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.5664, + "step": 91 + }, + { + "epoch": 0.029539251886338096, + "grad_norm": 3.563717842102051, + "learning_rate": 1.84e-05, + "loss": 1.4364, + "step": 92 + }, + { + "epoch": 0.029860330711189598, + "grad_norm": 2.6174569129943848, + "learning_rate": 1.86e-05, + "loss": 1.4534, + "step": 93 + }, + { + "epoch": 0.0301814095360411, + "grad_norm": 5.107251167297363, + "learning_rate": 1.88e-05, + "loss": 1.4267, + "step": 94 + }, + { + "epoch": 0.0305024883608926, + "grad_norm": 3.532844066619873, + "learning_rate": 1.9e-05, + "loss": 1.5197, + "step": 95 + }, + { + "epoch": 0.0308235671857441, + "grad_norm": 3.5595579147338867, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.3465, + "step": 96 + }, + { + "epoch": 0.031144646010595602, + "grad_norm": 3.7405896186828613, + "learning_rate": 1.94e-05, + "loss": 1.4418, + "step": 97 + }, + { + "epoch": 0.0314657248354471, + "grad_norm": 3.423434257507324, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.3589, + "step": 98 + }, + { + "epoch": 0.0317868036602986, + "grad_norm": 7.2687602043151855, + "learning_rate": 1.9800000000000004e-05, + "loss": 1.5237, + "step": 99 + }, + { + "epoch": 0.032107882485150105, + "grad_norm": 9.787822723388672, + "learning_rate": 2e-05, + "loss": 1.3776, + "step": 100 + }, + { + "epoch": 0.03242896131000161, + "grad_norm": 3.462364435195923, + "learning_rate": 2.0200000000000003e-05, + "loss": 1.921, + "step": 101 + }, + { + "epoch": 0.03275004013485311, + "grad_norm": 3.6257283687591553, + "learning_rate": 2.04e-05, + "loss": 1.7884, + "step": 102 + }, + { + "epoch": 0.033071118959704604, + "grad_norm": 3.9720280170440674, + "learning_rate": 2.06e-05, + "loss": 1.5258, + "step": 103 + }, + { + "epoch": 0.033392197784556106, + "grad_norm": 3.6847167015075684, + "learning_rate": 2.08e-05, + "loss": 1.2969, + "step": 104 + }, + { + "epoch": 0.03371327660940761, + "grad_norm": 4.045772075653076, + "learning_rate": 2.1e-05, + "loss": 1.4464, + "step": 105 + }, + { + "epoch": 0.03403435543425911, + "grad_norm": 3.264374017715454, + "learning_rate": 2.12e-05, + "loss": 1.2651, + "step": 106 + }, + { + "epoch": 0.03435543425911061, + "grad_norm": 2.960892677307129, + "learning_rate": 2.1400000000000002e-05, + "loss": 1.1481, + "step": 107 + }, + { + "epoch": 0.034676513083962114, + "grad_norm": 2.9885120391845703, + "learning_rate": 2.16e-05, + "loss": 1.367, + "step": 108 + }, + { + "epoch": 0.034997591908813616, + "grad_norm": 3.1829378604888916, + "learning_rate": 2.18e-05, + "loss": 1.3112, + "step": 109 + }, + { + "epoch": 0.03531867073366512, + "grad_norm": 2.956650733947754, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.4058, + "step": 110 + }, + { + "epoch": 0.03563974955851661, + "grad_norm": 2.93878436088562, + "learning_rate": 2.22e-05, + "loss": 1.6511, + "step": 111 + }, + { + "epoch": 0.035960828383368115, + "grad_norm": 4.111947059631348, + "learning_rate": 2.2400000000000002e-05, + "loss": 1.2943, + "step": 112 + }, + { + "epoch": 0.03628190720821962, + "grad_norm": 3.833181142807007, + "learning_rate": 2.26e-05, + "loss": 1.3733, + "step": 113 + }, + { + "epoch": 0.03660298603307112, + "grad_norm": 2.5650758743286133, + "learning_rate": 2.2800000000000002e-05, + "loss": 1.4837, + "step": 114 + }, + { + "epoch": 0.03692406485792262, + "grad_norm": 2.7790398597717285, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.3758, + "step": 115 + }, + { + "epoch": 0.03724514368277412, + "grad_norm": 2.0799965858459473, + "learning_rate": 2.32e-05, + "loss": 1.2325, + "step": 116 + }, + { + "epoch": 0.037566222507625625, + "grad_norm": 2.8642420768737793, + "learning_rate": 2.3400000000000003e-05, + "loss": 1.422, + "step": 117 + }, + { + "epoch": 0.03788730133247712, + "grad_norm": 2.840057373046875, + "learning_rate": 2.36e-05, + "loss": 1.3231, + "step": 118 + }, + { + "epoch": 0.03820838015732862, + "grad_norm": 2.1735825538635254, + "learning_rate": 2.38e-05, + "loss": 1.441, + "step": 119 + }, + { + "epoch": 0.038529458982180124, + "grad_norm": 2.365070343017578, + "learning_rate": 2.4e-05, + "loss": 1.3248, + "step": 120 + }, + { + "epoch": 0.038850537807031627, + "grad_norm": 2.179810047149658, + "learning_rate": 2.4200000000000002e-05, + "loss": 1.4982, + "step": 121 + }, + { + "epoch": 0.03917161663188313, + "grad_norm": 2.4042208194732666, + "learning_rate": 2.44e-05, + "loss": 1.6785, + "step": 122 + }, + { + "epoch": 0.03949269545673463, + "grad_norm": 2.441134452819824, + "learning_rate": 2.46e-05, + "loss": 1.4426, + "step": 123 + }, + { + "epoch": 0.03981377428158613, + "grad_norm": 3.4666857719421387, + "learning_rate": 2.48e-05, + "loss": 1.4485, + "step": 124 + }, + { + "epoch": 0.04013485310643763, + "grad_norm": 2.5980589389801025, + "learning_rate": 2.5e-05, + "loss": 1.3558, + "step": 125 + }, + { + "epoch": 0.04045593193128913, + "grad_norm": 3.334627628326416, + "learning_rate": 2.5200000000000003e-05, + "loss": 1.5553, + "step": 126 + }, + { + "epoch": 0.04077701075614063, + "grad_norm": 2.676223039627075, + "learning_rate": 2.54e-05, + "loss": 1.4527, + "step": 127 + }, + { + "epoch": 0.041098089580992134, + "grad_norm": 2.7710494995117188, + "learning_rate": 2.5600000000000002e-05, + "loss": 1.5945, + "step": 128 + }, + { + "epoch": 0.041419168405843636, + "grad_norm": 2.559156894683838, + "learning_rate": 2.58e-05, + "loss": 1.3244, + "step": 129 + }, + { + "epoch": 0.04174024723069514, + "grad_norm": 2.5117619037628174, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.5478, + "step": 130 + }, + { + "epoch": 0.04206132605554664, + "grad_norm": 2.3930675983428955, + "learning_rate": 2.6200000000000003e-05, + "loss": 1.3329, + "step": 131 + }, + { + "epoch": 0.042382404880398135, + "grad_norm": 2.4628591537475586, + "learning_rate": 2.64e-05, + "loss": 1.4671, + "step": 132 + }, + { + "epoch": 0.04270348370524964, + "grad_norm": 1.9905048608779907, + "learning_rate": 2.6600000000000003e-05, + "loss": 1.482, + "step": 133 + }, + { + "epoch": 0.04302456253010114, + "grad_norm": 2.3793320655822754, + "learning_rate": 2.6800000000000004e-05, + "loss": 1.4634, + "step": 134 + }, + { + "epoch": 0.04334564135495264, + "grad_norm": 2.54632830619812, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.2852, + "step": 135 + }, + { + "epoch": 0.04366672017980414, + "grad_norm": 3.641115188598633, + "learning_rate": 2.7200000000000004e-05, + "loss": 1.369, + "step": 136 + }, + { + "epoch": 0.043987799004655645, + "grad_norm": 2.443338394165039, + "learning_rate": 2.7400000000000002e-05, + "loss": 1.3057, + "step": 137 + }, + { + "epoch": 0.04430887782950715, + "grad_norm": 3.577340602874756, + "learning_rate": 2.7600000000000003e-05, + "loss": 1.3757, + "step": 138 + }, + { + "epoch": 0.04462995665435864, + "grad_norm": 2.4298670291900635, + "learning_rate": 2.7800000000000005e-05, + "loss": 1.2701, + "step": 139 + }, + { + "epoch": 0.044951035479210144, + "grad_norm": 2.5725412368774414, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.3694, + "step": 140 + }, + { + "epoch": 0.045272114304061646, + "grad_norm": 2.3307549953460693, + "learning_rate": 2.8199999999999998e-05, + "loss": 1.3261, + "step": 141 + }, + { + "epoch": 0.04559319312891315, + "grad_norm": 3.738875389099121, + "learning_rate": 2.84e-05, + "loss": 1.3514, + "step": 142 + }, + { + "epoch": 0.04591427195376465, + "grad_norm": 2.905665397644043, + "learning_rate": 2.86e-05, + "loss": 1.0867, + "step": 143 + }, + { + "epoch": 0.04623535077861615, + "grad_norm": 3.744802713394165, + "learning_rate": 2.88e-05, + "loss": 1.1365, + "step": 144 + }, + { + "epoch": 0.046556429603467654, + "grad_norm": 3.559023141860962, + "learning_rate": 2.9e-05, + "loss": 1.2399, + "step": 145 + }, + { + "epoch": 0.04687750842831915, + "grad_norm": 3.565185070037842, + "learning_rate": 2.9199999999999998e-05, + "loss": 1.2912, + "step": 146 + }, + { + "epoch": 0.04719858725317065, + "grad_norm": 3.949876308441162, + "learning_rate": 2.94e-05, + "loss": 1.1523, + "step": 147 + }, + { + "epoch": 0.04751966607802215, + "grad_norm": 3.7057857513427734, + "learning_rate": 2.96e-05, + "loss": 1.1759, + "step": 148 + }, + { + "epoch": 0.047840744902873655, + "grad_norm": 2.8545663356781006, + "learning_rate": 2.98e-05, + "loss": 1.058, + "step": 149 + }, + { + "epoch": 0.04816182372772516, + "grad_norm": 5.648036003112793, + "learning_rate": 3e-05, + "loss": 1.1387, + "step": 150 + }, + { + "epoch": 0.04848290255257666, + "grad_norm": 2.1492836475372314, + "learning_rate": 3.02e-05, + "loss": 1.6236, + "step": 151 + }, + { + "epoch": 0.04880398137742816, + "grad_norm": 2.80293607711792, + "learning_rate": 3.04e-05, + "loss": 1.7931, + "step": 152 + }, + { + "epoch": 0.049125060202279656, + "grad_norm": 2.3928093910217285, + "learning_rate": 3.06e-05, + "loss": 1.2039, + "step": 153 + }, + { + "epoch": 0.04944613902713116, + "grad_norm": 2.5905890464782715, + "learning_rate": 3.08e-05, + "loss": 1.1153, + "step": 154 + }, + { + "epoch": 0.04976721785198266, + "grad_norm": 3.102632761001587, + "learning_rate": 3.1e-05, + "loss": 1.2945, + "step": 155 + }, + { + "epoch": 0.05008829667683416, + "grad_norm": 3.5045828819274902, + "learning_rate": 3.12e-05, + "loss": 1.15, + "step": 156 + }, + { + "epoch": 0.050409375501685665, + "grad_norm": 3.210549831390381, + "learning_rate": 3.1400000000000004e-05, + "loss": 1.2008, + "step": 157 + }, + { + "epoch": 0.05073045432653717, + "grad_norm": 2.5296566486358643, + "learning_rate": 3.16e-05, + "loss": 1.2581, + "step": 158 + }, + { + "epoch": 0.05105153315138867, + "grad_norm": 3.071568727493286, + "learning_rate": 3.18e-05, + "loss": 0.9904, + "step": 159 + }, + { + "epoch": 0.051372611976240164, + "grad_norm": 2.763911724090576, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.12, + "step": 160 + }, + { + "epoch": 0.051693690801091666, + "grad_norm": 2.2567617893218994, + "learning_rate": 3.2200000000000003e-05, + "loss": 1.2534, + "step": 161 + }, + { + "epoch": 0.05201476962594317, + "grad_norm": 2.49045991897583, + "learning_rate": 3.24e-05, + "loss": 1.2713, + "step": 162 + }, + { + "epoch": 0.05233584845079467, + "grad_norm": 3.3164381980895996, + "learning_rate": 3.26e-05, + "loss": 1.1137, + "step": 163 + }, + { + "epoch": 0.05265692727564617, + "grad_norm": 2.6513657569885254, + "learning_rate": 3.2800000000000004e-05, + "loss": 1.3505, + "step": 164 + }, + { + "epoch": 0.052978006100497674, + "grad_norm": 2.629395008087158, + "learning_rate": 3.3e-05, + "loss": 1.206, + "step": 165 + }, + { + "epoch": 0.053299084925349176, + "grad_norm": 3.333925485610962, + "learning_rate": 3.32e-05, + "loss": 1.1088, + "step": 166 + }, + { + "epoch": 0.05362016375020067, + "grad_norm": 2.59550142288208, + "learning_rate": 3.3400000000000005e-05, + "loss": 1.2533, + "step": 167 + }, + { + "epoch": 0.05394124257505217, + "grad_norm": 2.676664352416992, + "learning_rate": 3.3600000000000004e-05, + "loss": 1.2916, + "step": 168 + }, + { + "epoch": 0.054262321399903675, + "grad_norm": 2.5266637802124023, + "learning_rate": 3.38e-05, + "loss": 1.3359, + "step": 169 + }, + { + "epoch": 0.05458340022475518, + "grad_norm": 2.394808292388916, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.0293, + "step": 170 + }, + { + "epoch": 0.05490447904960668, + "grad_norm": 3.7004685401916504, + "learning_rate": 3.4200000000000005e-05, + "loss": 1.224, + "step": 171 + }, + { + "epoch": 0.05522555787445818, + "grad_norm": 2.8753304481506348, + "learning_rate": 3.4399999999999996e-05, + "loss": 1.2212, + "step": 172 + }, + { + "epoch": 0.05554663669930968, + "grad_norm": 2.939927101135254, + "learning_rate": 3.46e-05, + "loss": 1.4577, + "step": 173 + }, + { + "epoch": 0.055867715524161185, + "grad_norm": 2.922861099243164, + "learning_rate": 3.48e-05, + "loss": 1.2349, + "step": 174 + }, + { + "epoch": 0.05618879434901268, + "grad_norm": 2.545013427734375, + "learning_rate": 3.5e-05, + "loss": 1.0631, + "step": 175 + }, + { + "epoch": 0.05650987317386418, + "grad_norm": 3.1497952938079834, + "learning_rate": 3.52e-05, + "loss": 1.2337, + "step": 176 + }, + { + "epoch": 0.056830951998715684, + "grad_norm": 2.5342869758605957, + "learning_rate": 3.54e-05, + "loss": 1.2889, + "step": 177 + }, + { + "epoch": 0.057152030823567186, + "grad_norm": 2.435774087905884, + "learning_rate": 3.56e-05, + "loss": 1.4616, + "step": 178 + }, + { + "epoch": 0.05747310964841869, + "grad_norm": 1.7864798307418823, + "learning_rate": 3.58e-05, + "loss": 1.0572, + "step": 179 + }, + { + "epoch": 0.05779418847327019, + "grad_norm": 1.8739286661148071, + "learning_rate": 3.6e-05, + "loss": 1.227, + "step": 180 + }, + { + "epoch": 0.05811526729812169, + "grad_norm": 3.2398200035095215, + "learning_rate": 3.62e-05, + "loss": 1.2632, + "step": 181 + }, + { + "epoch": 0.05843634612297319, + "grad_norm": 2.340625047683716, + "learning_rate": 3.6400000000000004e-05, + "loss": 1.2501, + "step": 182 + }, + { + "epoch": 0.05875742494782469, + "grad_norm": 2.0247950553894043, + "learning_rate": 3.66e-05, + "loss": 1.3319, + "step": 183 + }, + { + "epoch": 0.05907850377267619, + "grad_norm": 2.264277935028076, + "learning_rate": 3.68e-05, + "loss": 1.2761, + "step": 184 + }, + { + "epoch": 0.05939958259752769, + "grad_norm": 1.971635341644287, + "learning_rate": 3.7e-05, + "loss": 0.9021, + "step": 185 + }, + { + "epoch": 0.059720661422379195, + "grad_norm": 1.713057518005371, + "learning_rate": 3.72e-05, + "loss": 1.2372, + "step": 186 + }, + { + "epoch": 0.0600417402472307, + "grad_norm": 3.2709195613861084, + "learning_rate": 3.74e-05, + "loss": 1.2062, + "step": 187 + }, + { + "epoch": 0.0603628190720822, + "grad_norm": 1.7991886138916016, + "learning_rate": 3.76e-05, + "loss": 1.0789, + "step": 188 + }, + { + "epoch": 0.060683897896933695, + "grad_norm": 3.217481851577759, + "learning_rate": 3.7800000000000004e-05, + "loss": 1.2653, + "step": 189 + }, + { + "epoch": 0.0610049767217852, + "grad_norm": 1.5862120389938354, + "learning_rate": 3.8e-05, + "loss": 1.2394, + "step": 190 + }, + { + "epoch": 0.0613260555466367, + "grad_norm": 1.8591636419296265, + "learning_rate": 3.82e-05, + "loss": 1.2675, + "step": 191 + }, + { + "epoch": 0.0616471343714882, + "grad_norm": 1.6677289009094238, + "learning_rate": 3.8400000000000005e-05, + "loss": 1.0502, + "step": 192 + }, + { + "epoch": 0.0619682131963397, + "grad_norm": 2.0882251262664795, + "learning_rate": 3.86e-05, + "loss": 0.9488, + "step": 193 + }, + { + "epoch": 0.062289292021191205, + "grad_norm": 1.579268217086792, + "learning_rate": 3.88e-05, + "loss": 1.1667, + "step": 194 + }, + { + "epoch": 0.0626103708460427, + "grad_norm": 1.7781903743743896, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.238, + "step": 195 + }, + { + "epoch": 0.0629314496708942, + "grad_norm": 2.556015729904175, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.0398, + "step": 196 + }, + { + "epoch": 0.06325252849574571, + "grad_norm": 1.9498616456985474, + "learning_rate": 3.94e-05, + "loss": 1.0116, + "step": 197 + }, + { + "epoch": 0.0635736073205972, + "grad_norm": 3.5160417556762695, + "learning_rate": 3.960000000000001e-05, + "loss": 0.8621, + "step": 198 + }, + { + "epoch": 0.0638946861454487, + "grad_norm": 3.2988150119781494, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.836, + "step": 199 + }, + { + "epoch": 0.06421576497030021, + "grad_norm": 3.2342076301574707, + "learning_rate": 4e-05, + "loss": 0.7853, + "step": 200 + }, + { + "epoch": 0.0645368437951517, + "grad_norm": 3.1116089820861816, + "learning_rate": 4.02e-05, + "loss": 1.5181, + "step": 201 + }, + { + "epoch": 0.06485792262000321, + "grad_norm": 4.22519063949585, + "learning_rate": 4.0400000000000006e-05, + "loss": 1.5198, + "step": 202 + }, + { + "epoch": 0.06517900144485471, + "grad_norm": 3.6451752185821533, + "learning_rate": 4.0600000000000004e-05, + "loss": 1.1846, + "step": 203 + }, + { + "epoch": 0.06550008026970622, + "grad_norm": 2.907416582107544, + "learning_rate": 4.08e-05, + "loss": 1.1008, + "step": 204 + }, + { + "epoch": 0.06582115909455771, + "grad_norm": 3.7612717151641846, + "learning_rate": 4.1e-05, + "loss": 1.0514, + "step": 205 + }, + { + "epoch": 0.06614223791940921, + "grad_norm": 2.8270351886749268, + "learning_rate": 4.12e-05, + "loss": 1.1469, + "step": 206 + }, + { + "epoch": 0.06646331674426072, + "grad_norm": 2.9322104454040527, + "learning_rate": 4.14e-05, + "loss": 1.1255, + "step": 207 + }, + { + "epoch": 0.06678439556911221, + "grad_norm": 2.4711625576019287, + "learning_rate": 4.16e-05, + "loss": 1.0622, + "step": 208 + }, + { + "epoch": 0.06710547439396372, + "grad_norm": 1.7461209297180176, + "learning_rate": 4.18e-05, + "loss": 1.1028, + "step": 209 + }, + { + "epoch": 0.06742655321881522, + "grad_norm": 2.2536096572875977, + "learning_rate": 4.2e-05, + "loss": 1.0743, + "step": 210 + }, + { + "epoch": 0.06774763204366673, + "grad_norm": 2.037104606628418, + "learning_rate": 4.22e-05, + "loss": 1.2257, + "step": 211 + }, + { + "epoch": 0.06806871086851822, + "grad_norm": 2.400221586227417, + "learning_rate": 4.24e-05, + "loss": 1.2784, + "step": 212 + }, + { + "epoch": 0.06838978969336973, + "grad_norm": 2.0338759422302246, + "learning_rate": 4.26e-05, + "loss": 1.1648, + "step": 213 + }, + { + "epoch": 0.06871086851822122, + "grad_norm": 3.6578612327575684, + "learning_rate": 4.2800000000000004e-05, + "loss": 1.1226, + "step": 214 + }, + { + "epoch": 0.06903194734307272, + "grad_norm": 2.1461496353149414, + "learning_rate": 4.3e-05, + "loss": 1.0823, + "step": 215 + }, + { + "epoch": 0.06935302616792423, + "grad_norm": 1.87822687625885, + "learning_rate": 4.32e-05, + "loss": 1.1413, + "step": 216 + }, + { + "epoch": 0.06967410499277572, + "grad_norm": 1.7954866886138916, + "learning_rate": 4.3400000000000005e-05, + "loss": 1.2208, + "step": 217 + }, + { + "epoch": 0.06999518381762723, + "grad_norm": 2.6523921489715576, + "learning_rate": 4.36e-05, + "loss": 1.0858, + "step": 218 + }, + { + "epoch": 0.07031626264247873, + "grad_norm": 2.4885449409484863, + "learning_rate": 4.38e-05, + "loss": 1.0251, + "step": 219 + }, + { + "epoch": 0.07063734146733024, + "grad_norm": 2.9225237369537354, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.2586, + "step": 220 + }, + { + "epoch": 0.07095842029218173, + "grad_norm": 2.579859495162964, + "learning_rate": 4.4200000000000004e-05, + "loss": 1.2634, + "step": 221 + }, + { + "epoch": 0.07127949911703323, + "grad_norm": 3.152012586593628, + "learning_rate": 4.44e-05, + "loss": 1.0579, + "step": 222 + }, + { + "epoch": 0.07160057794188474, + "grad_norm": 1.843085765838623, + "learning_rate": 4.46e-05, + "loss": 1.0507, + "step": 223 + }, + { + "epoch": 0.07192165676673623, + "grad_norm": 2.062282085418701, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.3025, + "step": 224 + }, + { + "epoch": 0.07224273559158774, + "grad_norm": 1.8540760278701782, + "learning_rate": 4.5e-05, + "loss": 1.0763, + "step": 225 + }, + { + "epoch": 0.07256381441643923, + "grad_norm": 2.1458258628845215, + "learning_rate": 4.52e-05, + "loss": 1.1163, + "step": 226 + }, + { + "epoch": 0.07288489324129074, + "grad_norm": 2.402344226837158, + "learning_rate": 4.5400000000000006e-05, + "loss": 1.2973, + "step": 227 + }, + { + "epoch": 0.07320597206614224, + "grad_norm": 1.6383142471313477, + "learning_rate": 4.5600000000000004e-05, + "loss": 1.1735, + "step": 228 + }, + { + "epoch": 0.07352705089099373, + "grad_norm": 1.9293252229690552, + "learning_rate": 4.58e-05, + "loss": 1.1113, + "step": 229 + }, + { + "epoch": 0.07384812971584524, + "grad_norm": 2.625548839569092, + "learning_rate": 4.600000000000001e-05, + "loss": 1.1503, + "step": 230 + }, + { + "epoch": 0.07416920854069674, + "grad_norm": 2.3177950382232666, + "learning_rate": 4.6200000000000005e-05, + "loss": 0.9863, + "step": 231 + }, + { + "epoch": 0.07449028736554825, + "grad_norm": 3.1583642959594727, + "learning_rate": 4.64e-05, + "loss": 1.3059, + "step": 232 + }, + { + "epoch": 0.07481136619039974, + "grad_norm": 1.7886139154434204, + "learning_rate": 4.660000000000001e-05, + "loss": 1.0193, + "step": 233 + }, + { + "epoch": 0.07513244501525125, + "grad_norm": 3.0615925788879395, + "learning_rate": 4.6800000000000006e-05, + "loss": 1.4904, + "step": 234 + }, + { + "epoch": 0.07545352384010275, + "grad_norm": 823.4281005859375, + "learning_rate": 4.7e-05, + "loss": 1.1034, + "step": 235 + }, + { + "epoch": 0.07577460266495424, + "grad_norm": 3.0355687141418457, + "learning_rate": 4.72e-05, + "loss": 0.9429, + "step": 236 + }, + { + "epoch": 0.07609568148980575, + "grad_norm": 556.7845458984375, + "learning_rate": 4.74e-05, + "loss": 1.1951, + "step": 237 + }, + { + "epoch": 0.07641676031465724, + "grad_norm": 3.066845417022705, + "learning_rate": 4.76e-05, + "loss": 1.1543, + "step": 238 + }, + { + "epoch": 0.07673783913950875, + "grad_norm": 2.495962142944336, + "learning_rate": 4.78e-05, + "loss": 1.1047, + "step": 239 + }, + { + "epoch": 0.07705891796436025, + "grad_norm": 12.180785179138184, + "learning_rate": 4.8e-05, + "loss": 0.9588, + "step": 240 + }, + { + "epoch": 0.07737999678921176, + "grad_norm": 3.484467029571533, + "learning_rate": 4.82e-05, + "loss": 1.1392, + "step": 241 + }, + { + "epoch": 0.07770107561406325, + "grad_norm": 2.198939561843872, + "learning_rate": 4.8400000000000004e-05, + "loss": 1.3359, + "step": 242 + }, + { + "epoch": 0.07802215443891475, + "grad_norm": 1.4292407035827637, + "learning_rate": 4.86e-05, + "loss": 0.9899, + "step": 243 + }, + { + "epoch": 0.07834323326376626, + "grad_norm": 1.910438895225525, + "learning_rate": 4.88e-05, + "loss": 0.928, + "step": 244 + }, + { + "epoch": 0.07866431208861775, + "grad_norm": 3.0100486278533936, + "learning_rate": 4.9e-05, + "loss": 1.4606, + "step": 245 + }, + { + "epoch": 0.07898539091346926, + "grad_norm": 2.0493595600128174, + "learning_rate": 4.92e-05, + "loss": 1.0296, + "step": 246 + }, + { + "epoch": 0.07930646973832076, + "grad_norm": 2.848602771759033, + "learning_rate": 4.94e-05, + "loss": 0.9189, + "step": 247 + }, + { + "epoch": 0.07962754856317227, + "grad_norm": 3.749636650085449, + "learning_rate": 4.96e-05, + "loss": 0.9614, + "step": 248 + }, + { + "epoch": 0.07994862738802376, + "grad_norm": 2.882443428039551, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.872, + "step": 249 + }, + { + "epoch": 0.08026970621287526, + "grad_norm": 1.6551190614700317, + "learning_rate": 5e-05, + "loss": 0.7895, + "step": 250 + }, + { + "epoch": 0.08059078503772676, + "grad_norm": 2.3475422859191895, + "learning_rate": 5.02e-05, + "loss": 1.4449, + "step": 251 + }, + { + "epoch": 0.08091186386257826, + "grad_norm": 2.298914670944214, + "learning_rate": 5.0400000000000005e-05, + "loss": 1.0738, + "step": 252 + }, + { + "epoch": 0.08123294268742977, + "grad_norm": 2.8183484077453613, + "learning_rate": 5.0600000000000003e-05, + "loss": 1.0035, + "step": 253 + }, + { + "epoch": 0.08155402151228126, + "grad_norm": 2.7477033138275146, + "learning_rate": 5.08e-05, + "loss": 1.0145, + "step": 254 + }, + { + "epoch": 0.08187510033713277, + "grad_norm": 2.9522218704223633, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.962, + "step": 255 + }, + { + "epoch": 0.08219617916198427, + "grad_norm": 2.496779203414917, + "learning_rate": 5.1200000000000004e-05, + "loss": 0.899, + "step": 256 + }, + { + "epoch": 0.08251725798683576, + "grad_norm": 3.338456392288208, + "learning_rate": 5.14e-05, + "loss": 1.0442, + "step": 257 + }, + { + "epoch": 0.08283833681168727, + "grad_norm": 2.447253465652466, + "learning_rate": 5.16e-05, + "loss": 1.0634, + "step": 258 + }, + { + "epoch": 0.08315941563653877, + "grad_norm": 2.5269699096679688, + "learning_rate": 5.1800000000000005e-05, + "loss": 1.216, + "step": 259 + }, + { + "epoch": 0.08348049446139028, + "grad_norm": 1.807884693145752, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.051, + "step": 260 + }, + { + "epoch": 0.08380157328624177, + "grad_norm": 3.292743444442749, + "learning_rate": 5.22e-05, + "loss": 1.0427, + "step": 261 + }, + { + "epoch": 0.08412265211109328, + "grad_norm": 1.926658034324646, + "learning_rate": 5.2400000000000007e-05, + "loss": 1.1237, + "step": 262 + }, + { + "epoch": 0.08444373093594477, + "grad_norm": 2.5301294326782227, + "learning_rate": 5.2600000000000005e-05, + "loss": 1.1137, + "step": 263 + }, + { + "epoch": 0.08476480976079627, + "grad_norm": 2.3400182723999023, + "learning_rate": 5.28e-05, + "loss": 0.9848, + "step": 264 + }, + { + "epoch": 0.08508588858564778, + "grad_norm": 1.7750979661941528, + "learning_rate": 5.300000000000001e-05, + "loss": 1.1295, + "step": 265 + }, + { + "epoch": 0.08540696741049927, + "grad_norm": 2.335904598236084, + "learning_rate": 5.3200000000000006e-05, + "loss": 1.2678, + "step": 266 + }, + { + "epoch": 0.08572804623535078, + "grad_norm": 1.9911553859710693, + "learning_rate": 5.3400000000000004e-05, + "loss": 1.1416, + "step": 267 + }, + { + "epoch": 0.08604912506020228, + "grad_norm": 1.7395027875900269, + "learning_rate": 5.360000000000001e-05, + "loss": 1.1398, + "step": 268 + }, + { + "epoch": 0.08637020388505379, + "grad_norm": 1.7891736030578613, + "learning_rate": 5.380000000000001e-05, + "loss": 1.1321, + "step": 269 + }, + { + "epoch": 0.08669128270990528, + "grad_norm": 1.8368785381317139, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.0112, + "step": 270 + }, + { + "epoch": 0.08701236153475678, + "grad_norm": 1.638079047203064, + "learning_rate": 5.420000000000001e-05, + "loss": 1.1376, + "step": 271 + }, + { + "epoch": 0.08733344035960829, + "grad_norm": 1.9354631900787354, + "learning_rate": 5.440000000000001e-05, + "loss": 1.1693, + "step": 272 + }, + { + "epoch": 0.08765451918445978, + "grad_norm": 2.017259359359741, + "learning_rate": 5.4600000000000006e-05, + "loss": 1.2163, + "step": 273 + }, + { + "epoch": 0.08797559800931129, + "grad_norm": 1.6436361074447632, + "learning_rate": 5.4800000000000004e-05, + "loss": 0.9737, + "step": 274 + }, + { + "epoch": 0.08829667683416279, + "grad_norm": 1.7359622716903687, + "learning_rate": 5.500000000000001e-05, + "loss": 1.2599, + "step": 275 + }, + { + "epoch": 0.0886177556590143, + "grad_norm": 1.698771595954895, + "learning_rate": 5.520000000000001e-05, + "loss": 1.0724, + "step": 276 + }, + { + "epoch": 0.08893883448386579, + "grad_norm": 3.8529412746429443, + "learning_rate": 5.5400000000000005e-05, + "loss": 1.4372, + "step": 277 + }, + { + "epoch": 0.08925991330871728, + "grad_norm": 1.826474905014038, + "learning_rate": 5.560000000000001e-05, + "loss": 1.4767, + "step": 278 + }, + { + "epoch": 0.0895809921335688, + "grad_norm": 1.7558326721191406, + "learning_rate": 5.580000000000001e-05, + "loss": 1.1549, + "step": 279 + }, + { + "epoch": 0.08990207095842029, + "grad_norm": 2.5463342666625977, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.9819, + "step": 280 + }, + { + "epoch": 0.0902231497832718, + "grad_norm": 1.6322652101516724, + "learning_rate": 5.620000000000001e-05, + "loss": 1.2071, + "step": 281 + }, + { + "epoch": 0.09054422860812329, + "grad_norm": 1.6438385248184204, + "learning_rate": 5.6399999999999995e-05, + "loss": 0.991, + "step": 282 + }, + { + "epoch": 0.0908653074329748, + "grad_norm": 1.8053836822509766, + "learning_rate": 5.66e-05, + "loss": 1.1238, + "step": 283 + }, + { + "epoch": 0.0911863862578263, + "grad_norm": 2.149841070175171, + "learning_rate": 5.68e-05, + "loss": 1.1431, + "step": 284 + }, + { + "epoch": 0.09150746508267779, + "grad_norm": 1.8680065870285034, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.0299, + "step": 285 + }, + { + "epoch": 0.0918285439075293, + "grad_norm": 1.402052879333496, + "learning_rate": 5.72e-05, + "loss": 1.0877, + "step": 286 + }, + { + "epoch": 0.0921496227323808, + "grad_norm": 1.4989826679229736, + "learning_rate": 5.74e-05, + "loss": 0.9015, + "step": 287 + }, + { + "epoch": 0.0924707015572323, + "grad_norm": 1.2958741188049316, + "learning_rate": 5.76e-05, + "loss": 1.0378, + "step": 288 + }, + { + "epoch": 0.0927917803820838, + "grad_norm": 2.0742673873901367, + "learning_rate": 5.7799999999999995e-05, + "loss": 1.0337, + "step": 289 + }, + { + "epoch": 0.09311285920693531, + "grad_norm": 1.4535075426101685, + "learning_rate": 5.8e-05, + "loss": 1.0506, + "step": 290 + }, + { + "epoch": 0.0934339380317868, + "grad_norm": 1.7830564975738525, + "learning_rate": 5.82e-05, + "loss": 1.0116, + "step": 291 + }, + { + "epoch": 0.0937550168566383, + "grad_norm": 1.6709142923355103, + "learning_rate": 5.8399999999999997e-05, + "loss": 0.7538, + "step": 292 + }, + { + "epoch": 0.09407609568148981, + "grad_norm": 2.7168803215026855, + "learning_rate": 5.86e-05, + "loss": 1.1456, + "step": 293 + }, + { + "epoch": 0.0943971745063413, + "grad_norm": 1.8454351425170898, + "learning_rate": 5.88e-05, + "loss": 0.7657, + "step": 294 + }, + { + "epoch": 0.09471825333119281, + "grad_norm": 2.2031004428863525, + "learning_rate": 5.9e-05, + "loss": 0.8091, + "step": 295 + }, + { + "epoch": 0.0950393321560443, + "grad_norm": 1.5919681787490845, + "learning_rate": 5.92e-05, + "loss": 0.9871, + "step": 296 + }, + { + "epoch": 0.09536041098089582, + "grad_norm": 2.7647721767425537, + "learning_rate": 5.94e-05, + "loss": 0.9902, + "step": 297 + }, + { + "epoch": 0.09568148980574731, + "grad_norm": 2.6411798000335693, + "learning_rate": 5.96e-05, + "loss": 0.9511, + "step": 298 + }, + { + "epoch": 0.0960025686305988, + "grad_norm": 2.426961898803711, + "learning_rate": 5.9800000000000003e-05, + "loss": 1.06, + "step": 299 + }, + { + "epoch": 0.09632364745545031, + "grad_norm": 2.5059926509857178, + "learning_rate": 6e-05, + "loss": 0.7106, + "step": 300 + }, + { + "epoch": 0.09664472628030181, + "grad_norm": 3.1037328243255615, + "learning_rate": 6.02e-05, + "loss": 1.5007, + "step": 301 + }, + { + "epoch": 0.09696580510515332, + "grad_norm": 2.7043604850769043, + "learning_rate": 6.04e-05, + "loss": 1.5066, + "step": 302 + }, + { + "epoch": 0.09728688393000481, + "grad_norm": 3.1140081882476807, + "learning_rate": 6.06e-05, + "loss": 1.052, + "step": 303 + }, + { + "epoch": 0.09760796275485632, + "grad_norm": 3.623964309692383, + "learning_rate": 6.08e-05, + "loss": 0.946, + "step": 304 + }, + { + "epoch": 0.09792904157970782, + "grad_norm": 3.414036750793457, + "learning_rate": 6.1e-05, + "loss": 0.9647, + "step": 305 + }, + { + "epoch": 0.09825012040455931, + "grad_norm": 3.4817802906036377, + "learning_rate": 6.12e-05, + "loss": 0.9753, + "step": 306 + }, + { + "epoch": 0.09857119922941082, + "grad_norm": 3.067047119140625, + "learning_rate": 6.14e-05, + "loss": 1.1435, + "step": 307 + }, + { + "epoch": 0.09889227805426232, + "grad_norm": 2.2207508087158203, + "learning_rate": 6.16e-05, + "loss": 0.998, + "step": 308 + }, + { + "epoch": 0.09921335687911383, + "grad_norm": 2.4805872440338135, + "learning_rate": 6.18e-05, + "loss": 0.9144, + "step": 309 + }, + { + "epoch": 0.09953443570396532, + "grad_norm": 1.8067208528518677, + "learning_rate": 6.2e-05, + "loss": 1.1963, + "step": 310 + }, + { + "epoch": 0.09985551452881683, + "grad_norm": 1.950782299041748, + "learning_rate": 6.220000000000001e-05, + "loss": 1.1947, + "step": 311 + }, + { + "epoch": 0.10017659335366833, + "grad_norm": 1.88652765750885, + "learning_rate": 6.24e-05, + "loss": 0.9983, + "step": 312 + }, + { + "epoch": 0.10049767217851982, + "grad_norm": 1.9043608903884888, + "learning_rate": 6.26e-05, + "loss": 0.8638, + "step": 313 + }, + { + "epoch": 0.10081875100337133, + "grad_norm": 1.84428071975708, + "learning_rate": 6.280000000000001e-05, + "loss": 1.0008, + "step": 314 + }, + { + "epoch": 0.10113982982822282, + "grad_norm": 1.763142466545105, + "learning_rate": 6.3e-05, + "loss": 1.3726, + "step": 315 + }, + { + "epoch": 0.10146090865307433, + "grad_norm": 1.6071946620941162, + "learning_rate": 6.32e-05, + "loss": 1.0704, + "step": 316 + }, + { + "epoch": 0.10178198747792583, + "grad_norm": 2.2262184619903564, + "learning_rate": 6.340000000000001e-05, + "loss": 1.0787, + "step": 317 + }, + { + "epoch": 0.10210306630277734, + "grad_norm": 2.342939615249634, + "learning_rate": 6.36e-05, + "loss": 1.096, + "step": 318 + }, + { + "epoch": 0.10242414512762883, + "grad_norm": 2.372135639190674, + "learning_rate": 6.38e-05, + "loss": 1.1364, + "step": 319 + }, + { + "epoch": 0.10274522395248033, + "grad_norm": 1.7545585632324219, + "learning_rate": 6.400000000000001e-05, + "loss": 0.9141, + "step": 320 + }, + { + "epoch": 0.10306630277733184, + "grad_norm": 2.9137213230133057, + "learning_rate": 6.42e-05, + "loss": 0.9489, + "step": 321 + }, + { + "epoch": 0.10338738160218333, + "grad_norm": 2.827456474304199, + "learning_rate": 6.440000000000001e-05, + "loss": 1.1006, + "step": 322 + }, + { + "epoch": 0.10370846042703484, + "grad_norm": 3.5646371841430664, + "learning_rate": 6.460000000000001e-05, + "loss": 1.0534, + "step": 323 + }, + { + "epoch": 0.10402953925188634, + "grad_norm": 1.827415943145752, + "learning_rate": 6.48e-05, + "loss": 1.0138, + "step": 324 + }, + { + "epoch": 0.10435061807673784, + "grad_norm": 2.491384267807007, + "learning_rate": 6.500000000000001e-05, + "loss": 1.4315, + "step": 325 + }, + { + "epoch": 0.10467169690158934, + "grad_norm": 1.7412528991699219, + "learning_rate": 6.52e-05, + "loss": 0.9961, + "step": 326 + }, + { + "epoch": 0.10499277572644083, + "grad_norm": 2.2214128971099854, + "learning_rate": 6.54e-05, + "loss": 1.2276, + "step": 327 + }, + { + "epoch": 0.10531385455129234, + "grad_norm": 1.9566681385040283, + "learning_rate": 6.560000000000001e-05, + "loss": 1.3574, + "step": 328 + }, + { + "epoch": 0.10563493337614384, + "grad_norm": 1.5677928924560547, + "learning_rate": 6.58e-05, + "loss": 1.1721, + "step": 329 + }, + { + "epoch": 0.10595601220099535, + "grad_norm": 2.4960641860961914, + "learning_rate": 6.6e-05, + "loss": 1.4447, + "step": 330 + }, + { + "epoch": 0.10627709102584684, + "grad_norm": 2.4556682109832764, + "learning_rate": 6.620000000000001e-05, + "loss": 1.0275, + "step": 331 + }, + { + "epoch": 0.10659816985069835, + "grad_norm": 2.3573825359344482, + "learning_rate": 6.64e-05, + "loss": 1.2238, + "step": 332 + }, + { + "epoch": 0.10691924867554985, + "grad_norm": 1.6296504735946655, + "learning_rate": 6.66e-05, + "loss": 1.0889, + "step": 333 + }, + { + "epoch": 0.10724032750040134, + "grad_norm": 1.5856961011886597, + "learning_rate": 6.680000000000001e-05, + "loss": 1.1944, + "step": 334 + }, + { + "epoch": 0.10756140632525285, + "grad_norm": 1.3460253477096558, + "learning_rate": 6.7e-05, + "loss": 1.2147, + "step": 335 + }, + { + "epoch": 0.10788248515010435, + "grad_norm": 2.425680637359619, + "learning_rate": 6.720000000000001e-05, + "loss": 1.0764, + "step": 336 + }, + { + "epoch": 0.10820356397495585, + "grad_norm": 1.4525935649871826, + "learning_rate": 6.740000000000001e-05, + "loss": 1.1013, + "step": 337 + }, + { + "epoch": 0.10852464279980735, + "grad_norm": 2.8759169578552246, + "learning_rate": 6.76e-05, + "loss": 1.1745, + "step": 338 + }, + { + "epoch": 0.10884572162465886, + "grad_norm": 2.58520770072937, + "learning_rate": 6.780000000000001e-05, + "loss": 0.8345, + "step": 339 + }, + { + "epoch": 0.10916680044951035, + "grad_norm": 1.522647738456726, + "learning_rate": 6.800000000000001e-05, + "loss": 0.9961, + "step": 340 + }, + { + "epoch": 0.10948787927436185, + "grad_norm": 2.1563315391540527, + "learning_rate": 6.82e-05, + "loss": 1.1935, + "step": 341 + }, + { + "epoch": 0.10980895809921336, + "grad_norm": 2.3725993633270264, + "learning_rate": 6.840000000000001e-05, + "loss": 1.193, + "step": 342 + }, + { + "epoch": 0.11013003692406485, + "grad_norm": 1.3734397888183594, + "learning_rate": 6.860000000000001e-05, + "loss": 1.0738, + "step": 343 + }, + { + "epoch": 0.11045111574891636, + "grad_norm": 2.5987794399261475, + "learning_rate": 6.879999999999999e-05, + "loss": 1.3577, + "step": 344 + }, + { + "epoch": 0.11077219457376786, + "grad_norm": 2.429164171218872, + "learning_rate": 6.9e-05, + "loss": 1.1118, + "step": 345 + }, + { + "epoch": 0.11109327339861937, + "grad_norm": 2.4097049236297607, + "learning_rate": 6.92e-05, + "loss": 1.0425, + "step": 346 + }, + { + "epoch": 0.11141435222347086, + "grad_norm": 2.2892768383026123, + "learning_rate": 6.939999999999999e-05, + "loss": 0.914, + "step": 347 + }, + { + "epoch": 0.11173543104832237, + "grad_norm": 2.1729300022125244, + "learning_rate": 6.96e-05, + "loss": 1.0326, + "step": 348 + }, + { + "epoch": 0.11205650987317387, + "grad_norm": 3.1933183670043945, + "learning_rate": 6.98e-05, + "loss": 0.7324, + "step": 349 + }, + { + "epoch": 0.11237758869802536, + "grad_norm": 2.5278120040893555, + "learning_rate": 7e-05, + "loss": 0.5691, + "step": 350 + }, + { + "epoch": 0.11269866752287687, + "grad_norm": 2.725175619125366, + "learning_rate": 7.02e-05, + "loss": 1.5229, + "step": 351 + }, + { + "epoch": 0.11301974634772836, + "grad_norm": 2.0373573303222656, + "learning_rate": 7.04e-05, + "loss": 1.4782, + "step": 352 + }, + { + "epoch": 0.11334082517257987, + "grad_norm": 2.7589669227600098, + "learning_rate": 7.06e-05, + "loss": 1.0219, + "step": 353 + }, + { + "epoch": 0.11366190399743137, + "grad_norm": 3.1588339805603027, + "learning_rate": 7.08e-05, + "loss": 1.0494, + "step": 354 + }, + { + "epoch": 0.11398298282228288, + "grad_norm": 2.983210563659668, + "learning_rate": 7.1e-05, + "loss": 0.9478, + "step": 355 + }, + { + "epoch": 0.11430406164713437, + "grad_norm": 2.4293222427368164, + "learning_rate": 7.12e-05, + "loss": 0.9519, + "step": 356 + }, + { + "epoch": 0.11462514047198587, + "grad_norm": 3.7218642234802246, + "learning_rate": 7.14e-05, + "loss": 0.8504, + "step": 357 + }, + { + "epoch": 0.11494621929683738, + "grad_norm": 2.7284634113311768, + "learning_rate": 7.16e-05, + "loss": 0.9837, + "step": 358 + }, + { + "epoch": 0.11526729812168887, + "grad_norm": 2.264646291732788, + "learning_rate": 7.18e-05, + "loss": 0.8728, + "step": 359 + }, + { + "epoch": 0.11558837694654038, + "grad_norm": 3.814307451248169, + "learning_rate": 7.2e-05, + "loss": 1.1014, + "step": 360 + }, + { + "epoch": 0.11590945577139188, + "grad_norm": 2.3103249073028564, + "learning_rate": 7.22e-05, + "loss": 1.0027, + "step": 361 + }, + { + "epoch": 0.11623053459624338, + "grad_norm": 2.059762716293335, + "learning_rate": 7.24e-05, + "loss": 1.0825, + "step": 362 + }, + { + "epoch": 0.11655161342109488, + "grad_norm": 1.9121147394180298, + "learning_rate": 7.26e-05, + "loss": 1.0882, + "step": 363 + }, + { + "epoch": 0.11687269224594637, + "grad_norm": 1.7068138122558594, + "learning_rate": 7.280000000000001e-05, + "loss": 1.0976, + "step": 364 + }, + { + "epoch": 0.11719377107079788, + "grad_norm": 4.309755802154541, + "learning_rate": 7.3e-05, + "loss": 1.1305, + "step": 365 + }, + { + "epoch": 0.11751484989564938, + "grad_norm": 1.5519227981567383, + "learning_rate": 7.32e-05, + "loss": 0.9828, + "step": 366 + }, + { + "epoch": 0.11783592872050089, + "grad_norm": 2.859467029571533, + "learning_rate": 7.340000000000001e-05, + "loss": 1.0766, + "step": 367 + }, + { + "epoch": 0.11815700754535238, + "grad_norm": 2.033249855041504, + "learning_rate": 7.36e-05, + "loss": 1.0528, + "step": 368 + }, + { + "epoch": 0.11847808637020389, + "grad_norm": 3.5415592193603516, + "learning_rate": 7.38e-05, + "loss": 1.1399, + "step": 369 + }, + { + "epoch": 0.11879916519505539, + "grad_norm": 2.895137071609497, + "learning_rate": 7.4e-05, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.11912024401990688, + "grad_norm": 1.6931350231170654, + "learning_rate": 7.42e-05, + "loss": 1.1186, + "step": 371 + }, + { + "epoch": 0.11944132284475839, + "grad_norm": 1.9246593713760376, + "learning_rate": 7.44e-05, + "loss": 1.0281, + "step": 372 + }, + { + "epoch": 0.11976240166960989, + "grad_norm": 1.789298176765442, + "learning_rate": 7.46e-05, + "loss": 1.0957, + "step": 373 + }, + { + "epoch": 0.1200834804944614, + "grad_norm": 2.8938393592834473, + "learning_rate": 7.48e-05, + "loss": 0.9665, + "step": 374 + }, + { + "epoch": 0.12040455931931289, + "grad_norm": 2.1179943084716797, + "learning_rate": 7.500000000000001e-05, + "loss": 1.1447, + "step": 375 + }, + { + "epoch": 0.1207256381441644, + "grad_norm": 1.7801289558410645, + "learning_rate": 7.52e-05, + "loss": 1.2483, + "step": 376 + }, + { + "epoch": 0.1210467169690159, + "grad_norm": 1.6875920295715332, + "learning_rate": 7.54e-05, + "loss": 1.009, + "step": 377 + }, + { + "epoch": 0.12136779579386739, + "grad_norm": 3.599494695663452, + "learning_rate": 7.560000000000001e-05, + "loss": 1.2534, + "step": 378 + }, + { + "epoch": 0.1216888746187189, + "grad_norm": 1.4294577836990356, + "learning_rate": 7.58e-05, + "loss": 1.1328, + "step": 379 + }, + { + "epoch": 0.1220099534435704, + "grad_norm": 1.809747576713562, + "learning_rate": 7.6e-05, + "loss": 1.0365, + "step": 380 + }, + { + "epoch": 0.1223310322684219, + "grad_norm": 1.7651673555374146, + "learning_rate": 7.620000000000001e-05, + "loss": 1.0842, + "step": 381 + }, + { + "epoch": 0.1226521110932734, + "grad_norm": 2.022040843963623, + "learning_rate": 7.64e-05, + "loss": 1.1518, + "step": 382 + }, + { + "epoch": 0.1229731899181249, + "grad_norm": 1.3994489908218384, + "learning_rate": 7.66e-05, + "loss": 0.9633, + "step": 383 + }, + { + "epoch": 0.1232942687429764, + "grad_norm": 1.4892637729644775, + "learning_rate": 7.680000000000001e-05, + "loss": 1.1681, + "step": 384 + }, + { + "epoch": 0.1236153475678279, + "grad_norm": 2.849607467651367, + "learning_rate": 7.7e-05, + "loss": 1.2844, + "step": 385 + }, + { + "epoch": 0.1239364263926794, + "grad_norm": 3.940389394760132, + "learning_rate": 7.72e-05, + "loss": 1.1795, + "step": 386 + }, + { + "epoch": 0.1242575052175309, + "grad_norm": 1.6070865392684937, + "learning_rate": 7.740000000000001e-05, + "loss": 1.1437, + "step": 387 + }, + { + "epoch": 0.12457858404238241, + "grad_norm": 1.3367242813110352, + "learning_rate": 7.76e-05, + "loss": 0.9778, + "step": 388 + }, + { + "epoch": 0.1248996628672339, + "grad_norm": 1.9852445125579834, + "learning_rate": 7.780000000000001e-05, + "loss": 1.0968, + "step": 389 + }, + { + "epoch": 0.1252207416920854, + "grad_norm": 2.6707146167755127, + "learning_rate": 7.800000000000001e-05, + "loss": 1.2846, + "step": 390 + }, + { + "epoch": 0.1255418205169369, + "grad_norm": 2.2092552185058594, + "learning_rate": 7.82e-05, + "loss": 1.0746, + "step": 391 + }, + { + "epoch": 0.1258628993417884, + "grad_norm": 2.5755155086517334, + "learning_rate": 7.840000000000001e-05, + "loss": 0.9431, + "step": 392 + }, + { + "epoch": 0.1261839781666399, + "grad_norm": 1.506671667098999, + "learning_rate": 7.860000000000001e-05, + "loss": 1.0914, + "step": 393 + }, + { + "epoch": 0.12650505699149142, + "grad_norm": 1.7302513122558594, + "learning_rate": 7.88e-05, + "loss": 0.9212, + "step": 394 + }, + { + "epoch": 0.12682613581634292, + "grad_norm": 3.166785478591919, + "learning_rate": 7.900000000000001e-05, + "loss": 1.1882, + "step": 395 + }, + { + "epoch": 0.1271472146411944, + "grad_norm": 2.0447499752044678, + "learning_rate": 7.920000000000001e-05, + "loss": 0.9331, + "step": 396 + }, + { + "epoch": 0.1274682934660459, + "grad_norm": 2.1262240409851074, + "learning_rate": 7.94e-05, + "loss": 0.9565, + "step": 397 + }, + { + "epoch": 0.1277893722908974, + "grad_norm": 3.0543501377105713, + "learning_rate": 7.960000000000001e-05, + "loss": 0.9219, + "step": 398 + }, + { + "epoch": 0.12811045111574892, + "grad_norm": 4.1444597244262695, + "learning_rate": 7.98e-05, + "loss": 0.8325, + "step": 399 + }, + { + "epoch": 0.12843152994060042, + "grad_norm": 1.8101000785827637, + "learning_rate": 8e-05, + "loss": 0.7651, + "step": 400 + }, + { + "epoch": 0.12875260876545191, + "grad_norm": 4.419740676879883, + "learning_rate": 8.020000000000001e-05, + "loss": 1.6615, + "step": 401 + }, + { + "epoch": 0.1290736875903034, + "grad_norm": 2.6006617546081543, + "learning_rate": 8.04e-05, + "loss": 1.6427, + "step": 402 + }, + { + "epoch": 0.12939476641515493, + "grad_norm": 3.2348382472991943, + "learning_rate": 8.060000000000001e-05, + "loss": 1.0184, + "step": 403 + }, + { + "epoch": 0.12971584524000643, + "grad_norm": 2.8556830883026123, + "learning_rate": 8.080000000000001e-05, + "loss": 1.0283, + "step": 404 + }, + { + "epoch": 0.13003692406485792, + "grad_norm": 3.4048428535461426, + "learning_rate": 8.1e-05, + "loss": 1.0695, + "step": 405 + }, + { + "epoch": 0.13035800288970942, + "grad_norm": 2.2543652057647705, + "learning_rate": 8.120000000000001e-05, + "loss": 0.9112, + "step": 406 + }, + { + "epoch": 0.1306790817145609, + "grad_norm": 4.042235851287842, + "learning_rate": 8.14e-05, + "loss": 0.9898, + "step": 407 + }, + { + "epoch": 0.13100016053941244, + "grad_norm": 2.8973443508148193, + "learning_rate": 8.16e-05, + "loss": 0.8473, + "step": 408 + }, + { + "epoch": 0.13132123936426393, + "grad_norm": 3.279799461364746, + "learning_rate": 8.18e-05, + "loss": 1.0893, + "step": 409 + }, + { + "epoch": 0.13164231818911543, + "grad_norm": 3.183879852294922, + "learning_rate": 8.2e-05, + "loss": 1.0263, + "step": 410 + }, + { + "epoch": 0.13196339701396692, + "grad_norm": 2.066544532775879, + "learning_rate": 8.22e-05, + "loss": 1.021, + "step": 411 + }, + { + "epoch": 0.13228447583881842, + "grad_norm": 1.7702231407165527, + "learning_rate": 8.24e-05, + "loss": 1.1344, + "step": 412 + }, + { + "epoch": 0.13260555466366994, + "grad_norm": 2.262660026550293, + "learning_rate": 8.26e-05, + "loss": 1.024, + "step": 413 + }, + { + "epoch": 0.13292663348852143, + "grad_norm": 1.833029866218567, + "learning_rate": 8.28e-05, + "loss": 1.2207, + "step": 414 + }, + { + "epoch": 0.13324771231337293, + "grad_norm": 2.144501209259033, + "learning_rate": 8.3e-05, + "loss": 1.1325, + "step": 415 + }, + { + "epoch": 0.13356879113822442, + "grad_norm": 3.092042922973633, + "learning_rate": 8.32e-05, + "loss": 1.3365, + "step": 416 + }, + { + "epoch": 0.13388986996307595, + "grad_norm": 2.136219024658203, + "learning_rate": 8.34e-05, + "loss": 1.0163, + "step": 417 + }, + { + "epoch": 0.13421094878792744, + "grad_norm": 1.7004696130752563, + "learning_rate": 8.36e-05, + "loss": 0.9517, + "step": 418 + }, + { + "epoch": 0.13453202761277894, + "grad_norm": 2.4267795085906982, + "learning_rate": 8.38e-05, + "loss": 1.0013, + "step": 419 + }, + { + "epoch": 0.13485310643763043, + "grad_norm": 2.0210540294647217, + "learning_rate": 8.4e-05, + "loss": 1.1871, + "step": 420 + }, + { + "epoch": 0.13517418526248193, + "grad_norm": 2.205508232116699, + "learning_rate": 8.42e-05, + "loss": 1.0592, + "step": 421 + }, + { + "epoch": 0.13549526408733345, + "grad_norm": 2.3667571544647217, + "learning_rate": 8.44e-05, + "loss": 0.9743, + "step": 422 + }, + { + "epoch": 0.13581634291218495, + "grad_norm": 2.3195512294769287, + "learning_rate": 8.46e-05, + "loss": 1.1209, + "step": 423 + }, + { + "epoch": 0.13613742173703644, + "grad_norm": 2.4299728870391846, + "learning_rate": 8.48e-05, + "loss": 1.1707, + "step": 424 + }, + { + "epoch": 0.13645850056188794, + "grad_norm": 2.341249465942383, + "learning_rate": 8.5e-05, + "loss": 1.0963, + "step": 425 + }, + { + "epoch": 0.13677957938673946, + "grad_norm": 2.3623969554901123, + "learning_rate": 8.52e-05, + "loss": 1.2913, + "step": 426 + }, + { + "epoch": 0.13710065821159095, + "grad_norm": 3.114179849624634, + "learning_rate": 8.54e-05, + "loss": 1.2193, + "step": 427 + }, + { + "epoch": 0.13742173703644245, + "grad_norm": 2.372199058532715, + "learning_rate": 8.560000000000001e-05, + "loss": 1.1863, + "step": 428 + }, + { + "epoch": 0.13774281586129394, + "grad_norm": 2.142880916595459, + "learning_rate": 8.58e-05, + "loss": 1.0837, + "step": 429 + }, + { + "epoch": 0.13806389468614544, + "grad_norm": 2.024472951889038, + "learning_rate": 8.6e-05, + "loss": 1.036, + "step": 430 + }, + { + "epoch": 0.13838497351099696, + "grad_norm": 2.7535400390625, + "learning_rate": 8.620000000000001e-05, + "loss": 0.9773, + "step": 431 + }, + { + "epoch": 0.13870605233584846, + "grad_norm": 2.199688196182251, + "learning_rate": 8.64e-05, + "loss": 1.0129, + "step": 432 + }, + { + "epoch": 0.13902713116069995, + "grad_norm": 2.8626081943511963, + "learning_rate": 8.66e-05, + "loss": 1.0404, + "step": 433 + }, + { + "epoch": 0.13934820998555145, + "grad_norm": 2.4395930767059326, + "learning_rate": 8.680000000000001e-05, + "loss": 0.9112, + "step": 434 + }, + { + "epoch": 0.13966928881040294, + "grad_norm": 2.2529709339141846, + "learning_rate": 8.7e-05, + "loss": 1.0701, + "step": 435 + }, + { + "epoch": 0.13999036763525446, + "grad_norm": 2.0451371669769287, + "learning_rate": 8.72e-05, + "loss": 1.2465, + "step": 436 + }, + { + "epoch": 0.14031144646010596, + "grad_norm": 1.9797896146774292, + "learning_rate": 8.740000000000001e-05, + "loss": 0.9705, + "step": 437 + }, + { + "epoch": 0.14063252528495745, + "grad_norm": 2.4405622482299805, + "learning_rate": 8.76e-05, + "loss": 1.197, + "step": 438 + }, + { + "epoch": 0.14095360410980895, + "grad_norm": 1.761278748512268, + "learning_rate": 8.78e-05, + "loss": 1.0646, + "step": 439 + }, + { + "epoch": 0.14127468293466047, + "grad_norm": 1.8971152305603027, + "learning_rate": 8.800000000000001e-05, + "loss": 0.9596, + "step": 440 + }, + { + "epoch": 0.14159576175951197, + "grad_norm": 2.072056531906128, + "learning_rate": 8.82e-05, + "loss": 0.9762, + "step": 441 + }, + { + "epoch": 0.14191684058436346, + "grad_norm": 2.229867935180664, + "learning_rate": 8.840000000000001e-05, + "loss": 1.1814, + "step": 442 + }, + { + "epoch": 0.14223791940921496, + "grad_norm": 1.3738412857055664, + "learning_rate": 8.86e-05, + "loss": 0.8249, + "step": 443 + }, + { + "epoch": 0.14255899823406645, + "grad_norm": 1.5406107902526855, + "learning_rate": 8.88e-05, + "loss": 0.894, + "step": 444 + }, + { + "epoch": 0.14288007705891798, + "grad_norm": 4.295513153076172, + "learning_rate": 8.900000000000001e-05, + "loss": 0.9362, + "step": 445 + }, + { + "epoch": 0.14320115588376947, + "grad_norm": 2.7712790966033936, + "learning_rate": 8.92e-05, + "loss": 0.8543, + "step": 446 + }, + { + "epoch": 0.14352223470862097, + "grad_norm": 2.536055088043213, + "learning_rate": 8.94e-05, + "loss": 0.9281, + "step": 447 + }, + { + "epoch": 0.14384331353347246, + "grad_norm": 3.111720323562622, + "learning_rate": 8.960000000000001e-05, + "loss": 0.7778, + "step": 448 + }, + { + "epoch": 0.14416439235832396, + "grad_norm": 2.919748306274414, + "learning_rate": 8.98e-05, + "loss": 0.8721, + "step": 449 + }, + { + "epoch": 0.14448547118317548, + "grad_norm": 2.425325870513916, + "learning_rate": 9e-05, + "loss": 0.7305, + "step": 450 + }, + { + "epoch": 0.14480655000802697, + "grad_norm": 2.294600009918213, + "learning_rate": 9.020000000000001e-05, + "loss": 1.6339, + "step": 451 + }, + { + "epoch": 0.14512762883287847, + "grad_norm": 2.2173311710357666, + "learning_rate": 9.04e-05, + "loss": 1.4077, + "step": 452 + }, + { + "epoch": 0.14544870765772996, + "grad_norm": 2.2760026454925537, + "learning_rate": 9.06e-05, + "loss": 1.0451, + "step": 453 + }, + { + "epoch": 0.1457697864825815, + "grad_norm": 2.850890636444092, + "learning_rate": 9.080000000000001e-05, + "loss": 0.9108, + "step": 454 + }, + { + "epoch": 0.14609086530743298, + "grad_norm": 2.1252541542053223, + "learning_rate": 9.1e-05, + "loss": 0.9681, + "step": 455 + }, + { + "epoch": 0.14641194413228448, + "grad_norm": 2.2665953636169434, + "learning_rate": 9.120000000000001e-05, + "loss": 0.9184, + "step": 456 + }, + { + "epoch": 0.14673302295713597, + "grad_norm": 3.0463030338287354, + "learning_rate": 9.140000000000001e-05, + "loss": 0.8551, + "step": 457 + }, + { + "epoch": 0.14705410178198747, + "grad_norm": 4.869279384613037, + "learning_rate": 9.16e-05, + "loss": 1.1517, + "step": 458 + }, + { + "epoch": 0.147375180606839, + "grad_norm": 2.495591878890991, + "learning_rate": 9.180000000000001e-05, + "loss": 1.264, + "step": 459 + }, + { + "epoch": 0.14769625943169049, + "grad_norm": 1.5504367351531982, + "learning_rate": 9.200000000000001e-05, + "loss": 1.1832, + "step": 460 + }, + { + "epoch": 0.14801733825654198, + "grad_norm": 1.7231016159057617, + "learning_rate": 9.22e-05, + "loss": 1.0143, + "step": 461 + }, + { + "epoch": 0.14833841708139348, + "grad_norm": 2.187086343765259, + "learning_rate": 9.240000000000001e-05, + "loss": 1.2475, + "step": 462 + }, + { + "epoch": 0.14865949590624497, + "grad_norm": 2.143453359603882, + "learning_rate": 9.260000000000001e-05, + "loss": 1.1353, + "step": 463 + }, + { + "epoch": 0.1489805747310965, + "grad_norm": 2.101834774017334, + "learning_rate": 9.28e-05, + "loss": 1.2927, + "step": 464 + }, + { + "epoch": 0.149301653555948, + "grad_norm": 2.4847493171691895, + "learning_rate": 9.300000000000001e-05, + "loss": 1.2384, + "step": 465 + }, + { + "epoch": 0.14962273238079948, + "grad_norm": 2.28727650642395, + "learning_rate": 9.320000000000002e-05, + "loss": 1.0456, + "step": 466 + }, + { + "epoch": 0.14994381120565098, + "grad_norm": 1.9797146320343018, + "learning_rate": 9.340000000000001e-05, + "loss": 1.1047, + "step": 467 + }, + { + "epoch": 0.1502648900305025, + "grad_norm": 2.9211812019348145, + "learning_rate": 9.360000000000001e-05, + "loss": 1.0695, + "step": 468 + }, + { + "epoch": 0.150585968855354, + "grad_norm": 1.7299671173095703, + "learning_rate": 9.38e-05, + "loss": 0.8681, + "step": 469 + }, + { + "epoch": 0.1509070476802055, + "grad_norm": 2.1782050132751465, + "learning_rate": 9.4e-05, + "loss": 1.1974, + "step": 470 + }, + { + "epoch": 0.151228126505057, + "grad_norm": 4.492698669433594, + "learning_rate": 9.42e-05, + "loss": 1.0819, + "step": 471 + }, + { + "epoch": 0.15154920532990848, + "grad_norm": 2.4074699878692627, + "learning_rate": 9.44e-05, + "loss": 1.1869, + "step": 472 + }, + { + "epoch": 0.15187028415476, + "grad_norm": 2.0078368186950684, + "learning_rate": 9.46e-05, + "loss": 1.0196, + "step": 473 + }, + { + "epoch": 0.1521913629796115, + "grad_norm": 1.4234404563903809, + "learning_rate": 9.48e-05, + "loss": 1.1275, + "step": 474 + }, + { + "epoch": 0.152512441804463, + "grad_norm": 1.73353111743927, + "learning_rate": 9.5e-05, + "loss": 1.103, + "step": 475 + }, + { + "epoch": 0.1528335206293145, + "grad_norm": 2.3276383876800537, + "learning_rate": 9.52e-05, + "loss": 1.0127, + "step": 476 + }, + { + "epoch": 0.15315459945416598, + "grad_norm": 2.4894511699676514, + "learning_rate": 9.54e-05, + "loss": 1.3882, + "step": 477 + }, + { + "epoch": 0.1534756782790175, + "grad_norm": 2.496798038482666, + "learning_rate": 9.56e-05, + "loss": 0.9479, + "step": 478 + }, + { + "epoch": 0.153796757103869, + "grad_norm": 1.701995611190796, + "learning_rate": 9.58e-05, + "loss": 1.0526, + "step": 479 + }, + { + "epoch": 0.1541178359287205, + "grad_norm": 1.432371973991394, + "learning_rate": 9.6e-05, + "loss": 1.1382, + "step": 480 + }, + { + "epoch": 0.154438914753572, + "grad_norm": 2.706610918045044, + "learning_rate": 9.620000000000001e-05, + "loss": 0.9633, + "step": 481 + }, + { + "epoch": 0.15475999357842352, + "grad_norm": 2.06569504737854, + "learning_rate": 9.64e-05, + "loss": 0.9708, + "step": 482 + }, + { + "epoch": 0.155081072403275, + "grad_norm": 2.1104109287261963, + "learning_rate": 9.66e-05, + "loss": 0.9835, + "step": 483 + }, + { + "epoch": 0.1554021512281265, + "grad_norm": 1.5272530317306519, + "learning_rate": 9.680000000000001e-05, + "loss": 1.0398, + "step": 484 + }, + { + "epoch": 0.155723230052978, + "grad_norm": 2.510329246520996, + "learning_rate": 9.7e-05, + "loss": 1.1728, + "step": 485 + }, + { + "epoch": 0.1560443088778295, + "grad_norm": 1.6082532405853271, + "learning_rate": 9.72e-05, + "loss": 1.1114, + "step": 486 + }, + { + "epoch": 0.15636538770268102, + "grad_norm": 1.8710626363754272, + "learning_rate": 9.74e-05, + "loss": 0.8267, + "step": 487 + }, + { + "epoch": 0.15668646652753251, + "grad_norm": 1.3604836463928223, + "learning_rate": 9.76e-05, + "loss": 0.9311, + "step": 488 + }, + { + "epoch": 0.157007545352384, + "grad_norm": 1.7233448028564453, + "learning_rate": 9.78e-05, + "loss": 1.0344, + "step": 489 + }, + { + "epoch": 0.1573286241772355, + "grad_norm": 1.827020287513733, + "learning_rate": 9.8e-05, + "loss": 1.0538, + "step": 490 + }, + { + "epoch": 0.157649703002087, + "grad_norm": 2.838731050491333, + "learning_rate": 9.82e-05, + "loss": 1.0937, + "step": 491 + }, + { + "epoch": 0.15797078182693852, + "grad_norm": 2.2758519649505615, + "learning_rate": 9.84e-05, + "loss": 1.0409, + "step": 492 + }, + { + "epoch": 0.15829186065179002, + "grad_norm": 3.4251341819763184, + "learning_rate": 9.86e-05, + "loss": 1.0816, + "step": 493 + }, + { + "epoch": 0.1586129394766415, + "grad_norm": 2.235792398452759, + "learning_rate": 9.88e-05, + "loss": 0.9386, + "step": 494 + }, + { + "epoch": 0.158934018301493, + "grad_norm": 1.5516377687454224, + "learning_rate": 9.900000000000001e-05, + "loss": 1.0178, + "step": 495 + }, + { + "epoch": 0.15925509712634453, + "grad_norm": 2.0881407260894775, + "learning_rate": 9.92e-05, + "loss": 0.893, + "step": 496 + }, + { + "epoch": 0.15957617595119603, + "grad_norm": 2.0493526458740234, + "learning_rate": 9.94e-05, + "loss": 0.9658, + "step": 497 + }, + { + "epoch": 0.15989725477604752, + "grad_norm": 1.7837196588516235, + "learning_rate": 9.960000000000001e-05, + "loss": 0.7789, + "step": 498 + }, + { + "epoch": 0.16021833360089902, + "grad_norm": 2.1364388465881348, + "learning_rate": 9.98e-05, + "loss": 0.586, + "step": 499 + }, + { + "epoch": 0.1605394124257505, + "grad_norm": 2.1484198570251465, + "learning_rate": 0.0001, + "loss": 0.6172, + "step": 500 + }, + { + "epoch": 0.16086049125060203, + "grad_norm": 6.418078899383545, + "learning_rate": 0.00010020000000000001, + "loss": 1.499, + "step": 501 + }, + { + "epoch": 0.16118157007545353, + "grad_norm": 3.4180362224578857, + "learning_rate": 0.0001004, + "loss": 1.1029, + "step": 502 + }, + { + "epoch": 0.16150264890030502, + "grad_norm": 3.427628517150879, + "learning_rate": 0.0001006, + "loss": 1.1054, + "step": 503 + }, + { + "epoch": 0.16182372772515652, + "grad_norm": 3.6065824031829834, + "learning_rate": 0.00010080000000000001, + "loss": 0.9196, + "step": 504 + }, + { + "epoch": 0.162144806550008, + "grad_norm": 2.572456121444702, + "learning_rate": 0.000101, + "loss": 0.9429, + "step": 505 + }, + { + "epoch": 0.16246588537485954, + "grad_norm": 2.5031585693359375, + "learning_rate": 0.00010120000000000001, + "loss": 0.8701, + "step": 506 + }, + { + "epoch": 0.16278696419971103, + "grad_norm": 2.1001455783843994, + "learning_rate": 0.00010140000000000001, + "loss": 0.992, + "step": 507 + }, + { + "epoch": 0.16310804302456253, + "grad_norm": 2.580418109893799, + "learning_rate": 0.0001016, + "loss": 1.0474, + "step": 508 + }, + { + "epoch": 0.16342912184941402, + "grad_norm": 2.187901496887207, + "learning_rate": 0.00010180000000000001, + "loss": 1.1151, + "step": 509 + }, + { + "epoch": 0.16375020067426554, + "grad_norm": 1.5737202167510986, + "learning_rate": 0.00010200000000000001, + "loss": 1.0749, + "step": 510 + }, + { + "epoch": 0.16407127949911704, + "grad_norm": 2.0835020542144775, + "learning_rate": 0.0001022, + "loss": 0.7957, + "step": 511 + }, + { + "epoch": 0.16439235832396853, + "grad_norm": 2.3619582653045654, + "learning_rate": 0.00010240000000000001, + "loss": 0.8308, + "step": 512 + }, + { + "epoch": 0.16471343714882003, + "grad_norm": 1.7399780750274658, + "learning_rate": 0.00010260000000000001, + "loss": 1.1296, + "step": 513 + }, + { + "epoch": 0.16503451597367153, + "grad_norm": 2.110400676727295, + "learning_rate": 0.0001028, + "loss": 0.8289, + "step": 514 + }, + { + "epoch": 0.16535559479852305, + "grad_norm": 1.960696816444397, + "learning_rate": 0.00010300000000000001, + "loss": 1.1581, + "step": 515 + }, + { + "epoch": 0.16567667362337454, + "grad_norm": 1.2459540367126465, + "learning_rate": 0.0001032, + "loss": 1.0465, + "step": 516 + }, + { + "epoch": 0.16599775244822604, + "grad_norm": 1.4018001556396484, + "learning_rate": 0.0001034, + "loss": 0.9106, + "step": 517 + }, + { + "epoch": 0.16631883127307753, + "grad_norm": 1.3802064657211304, + "learning_rate": 0.00010360000000000001, + "loss": 1.0696, + "step": 518 + }, + { + "epoch": 0.16663991009792903, + "grad_norm": 5.539737224578857, + "learning_rate": 0.0001038, + "loss": 1.0406, + "step": 519 + }, + { + "epoch": 0.16696098892278055, + "grad_norm": 2.785435676574707, + "learning_rate": 0.00010400000000000001, + "loss": 0.9252, + "step": 520 + }, + { + "epoch": 0.16728206774763205, + "grad_norm": 1.6449223756790161, + "learning_rate": 0.00010420000000000001, + "loss": 0.9388, + "step": 521 + }, + { + "epoch": 0.16760314657248354, + "grad_norm": 1.6841291189193726, + "learning_rate": 0.0001044, + "loss": 0.9676, + "step": 522 + }, + { + "epoch": 0.16792422539733504, + "grad_norm": 2.328427314758301, + "learning_rate": 0.00010460000000000001, + "loss": 0.9967, + "step": 523 + }, + { + "epoch": 0.16824530422218656, + "grad_norm": 1.5015443563461304, + "learning_rate": 0.00010480000000000001, + "loss": 0.9851, + "step": 524 + }, + { + "epoch": 0.16856638304703805, + "grad_norm": 1.587814450263977, + "learning_rate": 0.000105, + "loss": 0.9411, + "step": 525 + }, + { + "epoch": 0.16888746187188955, + "grad_norm": 2.9774792194366455, + "learning_rate": 0.00010520000000000001, + "loss": 0.9465, + "step": 526 + }, + { + "epoch": 0.16920854069674104, + "grad_norm": 2.033804416656494, + "learning_rate": 0.00010540000000000001, + "loss": 0.991, + "step": 527 + }, + { + "epoch": 0.16952961952159254, + "grad_norm": 2.3683605194091797, + "learning_rate": 0.0001056, + "loss": 1.0893, + "step": 528 + }, + { + "epoch": 0.16985069834644406, + "grad_norm": 2.22643780708313, + "learning_rate": 0.00010580000000000001, + "loss": 1.1218, + "step": 529 + }, + { + "epoch": 0.17017177717129556, + "grad_norm": 2.789557933807373, + "learning_rate": 0.00010600000000000002, + "loss": 1.0919, + "step": 530 + }, + { + "epoch": 0.17049285599614705, + "grad_norm": 1.379966378211975, + "learning_rate": 0.0001062, + "loss": 1.1469, + "step": 531 + }, + { + "epoch": 0.17081393482099855, + "grad_norm": 1.87637197971344, + "learning_rate": 0.00010640000000000001, + "loss": 1.0153, + "step": 532 + }, + { + "epoch": 0.17113501364585004, + "grad_norm": 1.728216528892517, + "learning_rate": 0.00010660000000000002, + "loss": 1.0824, + "step": 533 + }, + { + "epoch": 0.17145609247070157, + "grad_norm": 1.858970046043396, + "learning_rate": 0.00010680000000000001, + "loss": 1.0331, + "step": 534 + }, + { + "epoch": 0.17177717129555306, + "grad_norm": 2.390653133392334, + "learning_rate": 0.00010700000000000001, + "loss": 1.2246, + "step": 535 + }, + { + "epoch": 0.17209825012040456, + "grad_norm": 1.582828402519226, + "learning_rate": 0.00010720000000000002, + "loss": 1.0276, + "step": 536 + }, + { + "epoch": 0.17241932894525605, + "grad_norm": 1.6667520999908447, + "learning_rate": 0.00010740000000000001, + "loss": 0.899, + "step": 537 + }, + { + "epoch": 0.17274040777010757, + "grad_norm": 2.3515865802764893, + "learning_rate": 0.00010760000000000001, + "loss": 0.9804, + "step": 538 + }, + { + "epoch": 0.17306148659495907, + "grad_norm": 2.0802536010742188, + "learning_rate": 0.00010780000000000002, + "loss": 1.0002, + "step": 539 + }, + { + "epoch": 0.17338256541981056, + "grad_norm": 1.2941781282424927, + "learning_rate": 0.00010800000000000001, + "loss": 1.1026, + "step": 540 + }, + { + "epoch": 0.17370364424466206, + "grad_norm": 1.7126529216766357, + "learning_rate": 0.00010820000000000001, + "loss": 1.1084, + "step": 541 + }, + { + "epoch": 0.17402472306951355, + "grad_norm": 1.5774749517440796, + "learning_rate": 0.00010840000000000002, + "loss": 0.99, + "step": 542 + }, + { + "epoch": 0.17434580189436508, + "grad_norm": 1.576080322265625, + "learning_rate": 0.00010860000000000001, + "loss": 0.9603, + "step": 543 + }, + { + "epoch": 0.17466688071921657, + "grad_norm": 1.4444918632507324, + "learning_rate": 0.00010880000000000002, + "loss": 1.0226, + "step": 544 + }, + { + "epoch": 0.17498795954406807, + "grad_norm": 2.149320363998413, + "learning_rate": 0.000109, + "loss": 0.8809, + "step": 545 + }, + { + "epoch": 0.17530903836891956, + "grad_norm": 2.1339616775512695, + "learning_rate": 0.00010920000000000001, + "loss": 0.9411, + "step": 546 + }, + { + "epoch": 0.17563011719377106, + "grad_norm": 3.13826847076416, + "learning_rate": 0.00010940000000000002, + "loss": 0.9321, + "step": 547 + }, + { + "epoch": 0.17595119601862258, + "grad_norm": 2.3245761394500732, + "learning_rate": 0.00010960000000000001, + "loss": 0.8781, + "step": 548 + }, + { + "epoch": 0.17627227484347407, + "grad_norm": 1.5890634059906006, + "learning_rate": 0.00010980000000000001, + "loss": 0.8076, + "step": 549 + }, + { + "epoch": 0.17659335366832557, + "grad_norm": 1.824544906616211, + "learning_rate": 0.00011000000000000002, + "loss": 0.5724, + "step": 550 + }, + { + "epoch": 0.17691443249317707, + "grad_norm": 3.6526408195495605, + "learning_rate": 0.00011020000000000001, + "loss": 1.5191, + "step": 551 + }, + { + "epoch": 0.1772355113180286, + "grad_norm": 3.4367077350616455, + "learning_rate": 0.00011040000000000001, + "loss": 1.2869, + "step": 552 + }, + { + "epoch": 0.17755659014288008, + "grad_norm": 3.5046842098236084, + "learning_rate": 0.00011060000000000002, + "loss": 1.0015, + "step": 553 + }, + { + "epoch": 0.17787766896773158, + "grad_norm": 3.2727510929107666, + "learning_rate": 0.00011080000000000001, + "loss": 0.9038, + "step": 554 + }, + { + "epoch": 0.17819874779258307, + "grad_norm": 2.613293409347534, + "learning_rate": 0.00011100000000000001, + "loss": 0.9512, + "step": 555 + }, + { + "epoch": 0.17851982661743457, + "grad_norm": 4.011002540588379, + "learning_rate": 0.00011120000000000002, + "loss": 0.8484, + "step": 556 + }, + { + "epoch": 0.1788409054422861, + "grad_norm": 3.158597707748413, + "learning_rate": 0.00011140000000000001, + "loss": 1.0517, + "step": 557 + }, + { + "epoch": 0.1791619842671376, + "grad_norm": 2.143503427505493, + "learning_rate": 0.00011160000000000002, + "loss": 1.0966, + "step": 558 + }, + { + "epoch": 0.17948306309198908, + "grad_norm": 2.083723545074463, + "learning_rate": 0.00011180000000000002, + "loss": 1.0665, + "step": 559 + }, + { + "epoch": 0.17980414191684058, + "grad_norm": 1.7016173601150513, + "learning_rate": 0.00011200000000000001, + "loss": 0.9647, + "step": 560 + }, + { + "epoch": 0.1801252207416921, + "grad_norm": 3.817215919494629, + "learning_rate": 0.00011220000000000002, + "loss": 0.7164, + "step": 561 + }, + { + "epoch": 0.1804462995665436, + "grad_norm": 2.013831853866577, + "learning_rate": 0.00011240000000000002, + "loss": 1.135, + "step": 562 + }, + { + "epoch": 0.1807673783913951, + "grad_norm": 2.0431296825408936, + "learning_rate": 0.0001126, + "loss": 1.1298, + "step": 563 + }, + { + "epoch": 0.18108845721624658, + "grad_norm": 2.106182813644409, + "learning_rate": 0.00011279999999999999, + "loss": 1.1183, + "step": 564 + }, + { + "epoch": 0.18140953604109808, + "grad_norm": 2.747995615005493, + "learning_rate": 0.000113, + "loss": 0.9181, + "step": 565 + }, + { + "epoch": 0.1817306148659496, + "grad_norm": 2.186434268951416, + "learning_rate": 0.0001132, + "loss": 1.3199, + "step": 566 + }, + { + "epoch": 0.1820516936908011, + "grad_norm": 2.072404146194458, + "learning_rate": 0.00011339999999999999, + "loss": 0.9275, + "step": 567 + }, + { + "epoch": 0.1823727725156526, + "grad_norm": 2.051323890686035, + "learning_rate": 0.0001136, + "loss": 0.9817, + "step": 568 + }, + { + "epoch": 0.1826938513405041, + "grad_norm": 2.325993061065674, + "learning_rate": 0.0001138, + "loss": 1.2054, + "step": 569 + }, + { + "epoch": 0.18301493016535558, + "grad_norm": 2.8382861614227295, + "learning_rate": 0.00011399999999999999, + "loss": 1.0196, + "step": 570 + }, + { + "epoch": 0.1833360089902071, + "grad_norm": 1.9472187757492065, + "learning_rate": 0.0001142, + "loss": 1.028, + "step": 571 + }, + { + "epoch": 0.1836570878150586, + "grad_norm": 2.2961337566375732, + "learning_rate": 0.0001144, + "loss": 1.1601, + "step": 572 + }, + { + "epoch": 0.1839781666399101, + "grad_norm": 1.6054937839508057, + "learning_rate": 0.0001146, + "loss": 0.9415, + "step": 573 + }, + { + "epoch": 0.1842992454647616, + "grad_norm": 1.5658379793167114, + "learning_rate": 0.0001148, + "loss": 0.8921, + "step": 574 + }, + { + "epoch": 0.1846203242896131, + "grad_norm": 2.6736388206481934, + "learning_rate": 0.00011499999999999999, + "loss": 1.036, + "step": 575 + }, + { + "epoch": 0.1849414031144646, + "grad_norm": 2.179887056350708, + "learning_rate": 0.0001152, + "loss": 1.3059, + "step": 576 + }, + { + "epoch": 0.1852624819393161, + "grad_norm": 1.418059229850769, + "learning_rate": 0.0001154, + "loss": 1.0964, + "step": 577 + }, + { + "epoch": 0.1855835607641676, + "grad_norm": 2.5086400508880615, + "learning_rate": 0.00011559999999999999, + "loss": 1.1934, + "step": 578 + }, + { + "epoch": 0.1859046395890191, + "grad_norm": 2.1473209857940674, + "learning_rate": 0.0001158, + "loss": 1.0245, + "step": 579 + }, + { + "epoch": 0.18622571841387062, + "grad_norm": 4.459160804748535, + "learning_rate": 0.000116, + "loss": 1.3467, + "step": 580 + }, + { + "epoch": 0.1865467972387221, + "grad_norm": 2.073753833770752, + "learning_rate": 0.00011619999999999999, + "loss": 0.8753, + "step": 581 + }, + { + "epoch": 0.1868678760635736, + "grad_norm": 1.9933395385742188, + "learning_rate": 0.0001164, + "loss": 0.7768, + "step": 582 + }, + { + "epoch": 0.1871889548884251, + "grad_norm": 2.649278402328491, + "learning_rate": 0.0001166, + "loss": 1.2715, + "step": 583 + }, + { + "epoch": 0.1875100337132766, + "grad_norm": 3.018387794494629, + "learning_rate": 0.00011679999999999999, + "loss": 1.139, + "step": 584 + }, + { + "epoch": 0.18783111253812812, + "grad_norm": 2.405787467956543, + "learning_rate": 0.000117, + "loss": 1.1507, + "step": 585 + }, + { + "epoch": 0.18815219136297961, + "grad_norm": 2.1480977535247803, + "learning_rate": 0.0001172, + "loss": 1.1337, + "step": 586 + }, + { + "epoch": 0.1884732701878311, + "grad_norm": NaN, + "learning_rate": 0.0001172, + "loss": 1.233, + "step": 587 + }, + { + "epoch": 0.1887943490126826, + "grad_norm": 1.5942261219024658, + "learning_rate": 0.0001174, + "loss": 0.874, + "step": 588 + }, + { + "epoch": 0.18911542783753413, + "grad_norm": 3.029067039489746, + "learning_rate": 0.0001176, + "loss": 0.9061, + "step": 589 + }, + { + "epoch": 0.18943650666238562, + "grad_norm": 1.6258342266082764, + "learning_rate": 0.0001178, + "loss": 0.8887, + "step": 590 + }, + { + "epoch": 0.18975758548723712, + "grad_norm": 1.8686645030975342, + "learning_rate": 0.000118, + "loss": 0.887, + "step": 591 + }, + { + "epoch": 0.1900786643120886, + "grad_norm": 2.315640687942505, + "learning_rate": 0.0001182, + "loss": 1.2327, + "step": 592 + }, + { + "epoch": 0.1903997431369401, + "grad_norm": 2.111231565475464, + "learning_rate": 0.0001184, + "loss": 1.2629, + "step": 593 + }, + { + "epoch": 0.19072082196179163, + "grad_norm": 2.422922372817993, + "learning_rate": 0.0001186, + "loss": 0.9423, + "step": 594 + }, + { + "epoch": 0.19104190078664313, + "grad_norm": 2.0617730617523193, + "learning_rate": 0.0001188, + "loss": 1.0237, + "step": 595 + }, + { + "epoch": 0.19136297961149462, + "grad_norm": 2.295034408569336, + "learning_rate": 0.000119, + "loss": 0.993, + "step": 596 + }, + { + "epoch": 0.19168405843634612, + "grad_norm": 1.7605993747711182, + "learning_rate": 0.0001192, + "loss": 1.0313, + "step": 597 + }, + { + "epoch": 0.1920051372611976, + "grad_norm": 1.9907429218292236, + "learning_rate": 0.0001194, + "loss": 0.9979, + "step": 598 + }, + { + "epoch": 0.19232621608604913, + "grad_norm": 2.0522098541259766, + "learning_rate": 0.00011960000000000001, + "loss": 0.8497, + "step": 599 + }, + { + "epoch": 0.19264729491090063, + "grad_norm": 1.6448557376861572, + "learning_rate": 0.0001198, + "loss": 0.6443, + "step": 600 + }, + { + "epoch": 0.19296837373575212, + "grad_norm": 95.2552719116211, + "learning_rate": 0.00012, + "loss": 1.6446, + "step": 601 + }, + { + "epoch": 0.19328945256060362, + "grad_norm": 50.20346450805664, + "learning_rate": 0.00012020000000000001, + "loss": 1.4505, + "step": 602 + }, + { + "epoch": 0.19361053138545514, + "grad_norm": 3.067426919937134, + "learning_rate": 0.0001204, + "loss": 1.0486, + "step": 603 + }, + { + "epoch": 0.19393161021030664, + "grad_norm": 3.5429553985595703, + "learning_rate": 0.0001206, + "loss": 0.9873, + "step": 604 + }, + { + "epoch": 0.19425268903515813, + "grad_norm": 3.2231061458587646, + "learning_rate": 0.0001208, + "loss": 0.9301, + "step": 605 + }, + { + "epoch": 0.19457376786000963, + "grad_norm": 2.5521445274353027, + "learning_rate": 0.000121, + "loss": 0.8936, + "step": 606 + }, + { + "epoch": 0.19489484668486112, + "grad_norm": 2.4326229095458984, + "learning_rate": 0.0001212, + "loss": 0.9659, + "step": 607 + }, + { + "epoch": 0.19521592550971265, + "grad_norm": 4.246782302856445, + "learning_rate": 0.0001214, + "loss": 1.0794, + "step": 608 + }, + { + "epoch": 0.19553700433456414, + "grad_norm": 3.502556562423706, + "learning_rate": 0.0001216, + "loss": 1.0255, + "step": 609 + }, + { + "epoch": 0.19585808315941564, + "grad_norm": 2.036647319793701, + "learning_rate": 0.0001218, + "loss": 0.8848, + "step": 610 + }, + { + "epoch": 0.19617916198426713, + "grad_norm": 2.284869432449341, + "learning_rate": 0.000122, + "loss": 0.9834, + "step": 611 + }, + { + "epoch": 0.19650024080911863, + "grad_norm": 1.4191306829452515, + "learning_rate": 0.00012220000000000002, + "loss": 0.945, + "step": 612 + }, + { + "epoch": 0.19682131963397015, + "grad_norm": 2.529686450958252, + "learning_rate": 0.0001224, + "loss": 1.0871, + "step": 613 + }, + { + "epoch": 0.19714239845882164, + "grad_norm": 1.5338515043258667, + "learning_rate": 0.0001226, + "loss": 0.9792, + "step": 614 + }, + { + "epoch": 0.19746347728367314, + "grad_norm": 1.4318336248397827, + "learning_rate": 0.0001228, + "loss": 1.0498, + "step": 615 + }, + { + "epoch": 0.19778455610852463, + "grad_norm": 1.7187212705612183, + "learning_rate": 0.000123, + "loss": 1.0418, + "step": 616 + }, + { + "epoch": 0.19810563493337616, + "grad_norm": 1.5892479419708252, + "learning_rate": 0.0001232, + "loss": 1.1123, + "step": 617 + }, + { + "epoch": 0.19842671375822765, + "grad_norm": 2.3069276809692383, + "learning_rate": 0.00012340000000000002, + "loss": 0.8825, + "step": 618 + }, + { + "epoch": 0.19874779258307915, + "grad_norm": 2.5315756797790527, + "learning_rate": 0.0001236, + "loss": 0.9985, + "step": 619 + }, + { + "epoch": 0.19906887140793064, + "grad_norm": 3.1710851192474365, + "learning_rate": 0.0001238, + "loss": 1.0591, + "step": 620 + }, + { + "epoch": 0.19938995023278214, + "grad_norm": 3.0601823329925537, + "learning_rate": 0.000124, + "loss": 1.1456, + "step": 621 + }, + { + "epoch": 0.19971102905763366, + "grad_norm": 4.170618057250977, + "learning_rate": 0.0001242, + "loss": 1.0819, + "step": 622 + }, + { + "epoch": 0.20003210788248516, + "grad_norm": 6.973270416259766, + "learning_rate": 0.00012440000000000002, + "loss": 1.0412, + "step": 623 + }, + { + "epoch": 0.20035318670733665, + "grad_norm": 1.518310546875, + "learning_rate": 0.0001246, + "loss": 0.9291, + "step": 624 + }, + { + "epoch": 0.20067426553218815, + "grad_norm": 1.8494837284088135, + "learning_rate": 0.0001248, + "loss": 0.8893, + "step": 625 + }, + { + "epoch": 0.20099534435703964, + "grad_norm": 1.8757221698760986, + "learning_rate": 0.000125, + "loss": 0.9862, + "step": 626 + }, + { + "epoch": 0.20131642318189116, + "grad_norm": 2.3286845684051514, + "learning_rate": 0.0001252, + "loss": 1.3974, + "step": 627 + }, + { + "epoch": 0.20163750200674266, + "grad_norm": 2.6092865467071533, + "learning_rate": 0.0001254, + "loss": 1.3019, + "step": 628 + }, + { + "epoch": 0.20195858083159415, + "grad_norm": 2.212606906890869, + "learning_rate": 0.00012560000000000002, + "loss": 1.1727, + "step": 629 + }, + { + "epoch": 0.20227965965644565, + "grad_norm": 1.5337756872177124, + "learning_rate": 0.0001258, + "loss": 1.058, + "step": 630 + }, + { + "epoch": 0.20260073848129717, + "grad_norm": 2.063206672668457, + "learning_rate": 0.000126, + "loss": 1.1522, + "step": 631 + }, + { + "epoch": 0.20292181730614867, + "grad_norm": 2.371931552886963, + "learning_rate": 0.0001262, + "loss": 1.1871, + "step": 632 + }, + { + "epoch": 0.20324289613100016, + "grad_norm": 1.9318181276321411, + "learning_rate": 0.0001264, + "loss": 0.8263, + "step": 633 + }, + { + "epoch": 0.20356397495585166, + "grad_norm": 1.6861572265625, + "learning_rate": 0.00012660000000000001, + "loss": 0.9605, + "step": 634 + }, + { + "epoch": 0.20388505378070315, + "grad_norm": 1.6631700992584229, + "learning_rate": 0.00012680000000000002, + "loss": 1.085, + "step": 635 + }, + { + "epoch": 0.20420613260555467, + "grad_norm": 2.279987096786499, + "learning_rate": 0.000127, + "loss": 1.0187, + "step": 636 + }, + { + "epoch": 0.20452721143040617, + "grad_norm": 1.2203277349472046, + "learning_rate": 0.0001272, + "loss": 0.8815, + "step": 637 + }, + { + "epoch": 0.20484829025525766, + "grad_norm": 1.7548545598983765, + "learning_rate": 0.0001274, + "loss": 1.1048, + "step": 638 + }, + { + "epoch": 0.20516936908010916, + "grad_norm": 2.2074527740478516, + "learning_rate": 0.0001276, + "loss": 0.9279, + "step": 639 + }, + { + "epoch": 0.20549044790496065, + "grad_norm": 2.118504762649536, + "learning_rate": 0.00012780000000000002, + "loss": 1.3603, + "step": 640 + }, + { + "epoch": 0.20581152672981218, + "grad_norm": 1.5209214687347412, + "learning_rate": 0.00012800000000000002, + "loss": 0.9429, + "step": 641 + }, + { + "epoch": 0.20613260555466367, + "grad_norm": 1.7410812377929688, + "learning_rate": 0.0001282, + "loss": 0.7626, + "step": 642 + }, + { + "epoch": 0.20645368437951517, + "grad_norm": 2.151503562927246, + "learning_rate": 0.0001284, + "loss": 1.1012, + "step": 643 + }, + { + "epoch": 0.20677476320436666, + "grad_norm": 1.4617127180099487, + "learning_rate": 0.0001286, + "loss": 1.072, + "step": 644 + }, + { + "epoch": 0.20709584202921819, + "grad_norm": 1.6642550230026245, + "learning_rate": 0.00012880000000000001, + "loss": 1.08, + "step": 645 + }, + { + "epoch": 0.20741692085406968, + "grad_norm": 1.8791637420654297, + "learning_rate": 0.00012900000000000002, + "loss": 0.8933, + "step": 646 + }, + { + "epoch": 0.20773799967892118, + "grad_norm": 1.970080018043518, + "learning_rate": 0.00012920000000000002, + "loss": 0.8507, + "step": 647 + }, + { + "epoch": 0.20805907850377267, + "grad_norm": 1.6559851169586182, + "learning_rate": 0.0001294, + "loss": 0.8517, + "step": 648 + }, + { + "epoch": 0.20838015732862417, + "grad_norm": 3.3205556869506836, + "learning_rate": 0.0001296, + "loss": 0.8304, + "step": 649 + }, + { + "epoch": 0.2087012361534757, + "grad_norm": 2.0807406902313232, + "learning_rate": 0.0001298, + "loss": 0.6431, + "step": 650 + }, + { + "epoch": 0.20902231497832718, + "grad_norm": 2.3025104999542236, + "learning_rate": 0.00013000000000000002, + "loss": 1.3972, + "step": 651 + }, + { + "epoch": 0.20934339380317868, + "grad_norm": 2.2260947227478027, + "learning_rate": 0.00013020000000000002, + "loss": 1.3472, + "step": 652 + }, + { + "epoch": 0.20966447262803017, + "grad_norm": 2.1171836853027344, + "learning_rate": 0.0001304, + "loss": 0.9882, + "step": 653 + }, + { + "epoch": 0.20998555145288167, + "grad_norm": 2.8188836574554443, + "learning_rate": 0.0001306, + "loss": 0.9835, + "step": 654 + }, + { + "epoch": 0.2103066302777332, + "grad_norm": 2.3081471920013428, + "learning_rate": 0.0001308, + "loss": 0.8868, + "step": 655 + }, + { + "epoch": 0.2106277091025847, + "grad_norm": 2.6961159706115723, + "learning_rate": 0.000131, + "loss": 0.8788, + "step": 656 + }, + { + "epoch": 0.21094878792743618, + "grad_norm": 2.381976366043091, + "learning_rate": 0.00013120000000000002, + "loss": 0.8831, + "step": 657 + }, + { + "epoch": 0.21126986675228768, + "grad_norm": 1.8818697929382324, + "learning_rate": 0.00013140000000000002, + "loss": 0.9654, + "step": 658 + }, + { + "epoch": 0.2115909455771392, + "grad_norm": 1.7192848920822144, + "learning_rate": 0.0001316, + "loss": 0.9574, + "step": 659 + }, + { + "epoch": 0.2119120244019907, + "grad_norm": 1.9481501579284668, + "learning_rate": 0.0001318, + "loss": 1.15, + "step": 660 + }, + { + "epoch": 0.2122331032268422, + "grad_norm": 1.8775956630706787, + "learning_rate": 0.000132, + "loss": 0.9595, + "step": 661 + }, + { + "epoch": 0.21255418205169369, + "grad_norm": 1.4788551330566406, + "learning_rate": 0.00013220000000000001, + "loss": 1.0547, + "step": 662 + }, + { + "epoch": 0.21287526087654518, + "grad_norm": 1.339841604232788, + "learning_rate": 0.00013240000000000002, + "loss": 1.0089, + "step": 663 + }, + { + "epoch": 0.2131963397013967, + "grad_norm": 2.0733635425567627, + "learning_rate": 0.00013260000000000002, + "loss": 0.8325, + "step": 664 + }, + { + "epoch": 0.2135174185262482, + "grad_norm": 1.9117823839187622, + "learning_rate": 0.0001328, + "loss": 1.048, + "step": 665 + }, + { + "epoch": 0.2138384973510997, + "grad_norm": 1.8078653812408447, + "learning_rate": 0.000133, + "loss": 0.9458, + "step": 666 + }, + { + "epoch": 0.2141595761759512, + "grad_norm": 1.7276756763458252, + "learning_rate": 0.0001332, + "loss": 1.1302, + "step": 667 + }, + { + "epoch": 0.21448065500080268, + "grad_norm": 2.459287166595459, + "learning_rate": 0.00013340000000000002, + "loss": 1.041, + "step": 668 + }, + { + "epoch": 0.2148017338256542, + "grad_norm": 1.7712688446044922, + "learning_rate": 0.00013360000000000002, + "loss": 1.0259, + "step": 669 + }, + { + "epoch": 0.2151228126505057, + "grad_norm": 2.60807728767395, + "learning_rate": 0.00013380000000000003, + "loss": 1.0492, + "step": 670 + }, + { + "epoch": 0.2154438914753572, + "grad_norm": 1.5164037942886353, + "learning_rate": 0.000134, + "loss": 1.06, + "step": 671 + }, + { + "epoch": 0.2157649703002087, + "grad_norm": 1.355437994003296, + "learning_rate": 0.0001342, + "loss": 1.0805, + "step": 672 + }, + { + "epoch": 0.21608604912506021, + "grad_norm": 1.6522624492645264, + "learning_rate": 0.00013440000000000001, + "loss": 1.1322, + "step": 673 + }, + { + "epoch": 0.2164071279499117, + "grad_norm": 2.0744080543518066, + "learning_rate": 0.00013460000000000002, + "loss": 1.1511, + "step": 674 + }, + { + "epoch": 0.2167282067747632, + "grad_norm": 3.5550551414489746, + "learning_rate": 0.00013480000000000002, + "loss": 1.1194, + "step": 675 + }, + { + "epoch": 0.2170492855996147, + "grad_norm": 1.7500160932540894, + "learning_rate": 0.00013500000000000003, + "loss": 1.0948, + "step": 676 + }, + { + "epoch": 0.2173703644244662, + "grad_norm": 2.251464605331421, + "learning_rate": 0.0001352, + "loss": 1.0829, + "step": 677 + }, + { + "epoch": 0.21769144324931772, + "grad_norm": 1.911451816558838, + "learning_rate": 0.0001354, + "loss": 1.1959, + "step": 678 + }, + { + "epoch": 0.2180125220741692, + "grad_norm": 2.199814796447754, + "learning_rate": 0.00013560000000000002, + "loss": 1.0434, + "step": 679 + }, + { + "epoch": 0.2183336008990207, + "grad_norm": 2.2858433723449707, + "learning_rate": 0.00013580000000000002, + "loss": 0.9276, + "step": 680 + }, + { + "epoch": 0.2186546797238722, + "grad_norm": 1.9057203531265259, + "learning_rate": 0.00013600000000000003, + "loss": 1.1686, + "step": 681 + }, + { + "epoch": 0.2189757585487237, + "grad_norm": 1.6700472831726074, + "learning_rate": 0.0001362, + "loss": 1.0646, + "step": 682 + }, + { + "epoch": 0.21929683737357522, + "grad_norm": 1.5589715242385864, + "learning_rate": 0.0001364, + "loss": 1.0016, + "step": 683 + }, + { + "epoch": 0.21961791619842672, + "grad_norm": 1.5741453170776367, + "learning_rate": 0.0001366, + "loss": 1.0659, + "step": 684 + }, + { + "epoch": 0.2199389950232782, + "grad_norm": 2.477905511856079, + "learning_rate": 0.00013680000000000002, + "loss": 1.0588, + "step": 685 + }, + { + "epoch": 0.2202600738481297, + "grad_norm": 1.8854243755340576, + "learning_rate": 0.00013700000000000002, + "loss": 0.7684, + "step": 686 + }, + { + "epoch": 0.22058115267298123, + "grad_norm": 1.5644632577896118, + "learning_rate": 0.00013720000000000003, + "loss": 1.1827, + "step": 687 + }, + { + "epoch": 0.22090223149783272, + "grad_norm": 2.956141948699951, + "learning_rate": 0.0001374, + "loss": 1.0493, + "step": 688 + }, + { + "epoch": 0.22122331032268422, + "grad_norm": 1.7298507690429688, + "learning_rate": 0.00013759999999999998, + "loss": 1.1918, + "step": 689 + }, + { + "epoch": 0.22154438914753571, + "grad_norm": 1.3661199808120728, + "learning_rate": 0.0001378, + "loss": 0.9168, + "step": 690 + }, + { + "epoch": 0.2218654679723872, + "grad_norm": 3.3666417598724365, + "learning_rate": 0.000138, + "loss": 0.9071, + "step": 691 + }, + { + "epoch": 0.22218654679723873, + "grad_norm": 1.8022873401641846, + "learning_rate": 0.0001382, + "loss": 1.0451, + "step": 692 + }, + { + "epoch": 0.22250762562209023, + "grad_norm": 2.2194650173187256, + "learning_rate": 0.0001384, + "loss": 1.2171, + "step": 693 + }, + { + "epoch": 0.22282870444694172, + "grad_norm": 2.7648520469665527, + "learning_rate": 0.0001386, + "loss": 0.7702, + "step": 694 + }, + { + "epoch": 0.22314978327179322, + "grad_norm": 2.6405279636383057, + "learning_rate": 0.00013879999999999999, + "loss": 0.8529, + "step": 695 + }, + { + "epoch": 0.22347086209664474, + "grad_norm": 2.8034744262695312, + "learning_rate": 0.000139, + "loss": 1.0367, + "step": 696 + }, + { + "epoch": 0.22379194092149624, + "grad_norm": 2.957364320755005, + "learning_rate": 0.0001392, + "loss": 0.9444, + "step": 697 + }, + { + "epoch": 0.22411301974634773, + "grad_norm": 1.5870848894119263, + "learning_rate": 0.0001394, + "loss": 0.9007, + "step": 698 + }, + { + "epoch": 0.22443409857119923, + "grad_norm": 1.8650282621383667, + "learning_rate": 0.0001396, + "loss": 0.7505, + "step": 699 + }, + { + "epoch": 0.22475517739605072, + "grad_norm": 3.5914523601531982, + "learning_rate": 0.0001398, + "loss": 0.8165, + "step": 700 + }, + { + "epoch": 0.22507625622090224, + "grad_norm": 6.338365077972412, + "learning_rate": 0.00014, + "loss": 1.48, + "step": 701 + }, + { + "epoch": 0.22539733504575374, + "grad_norm": 2.300706148147583, + "learning_rate": 0.0001402, + "loss": 1.3409, + "step": 702 + }, + { + "epoch": 0.22571841387060523, + "grad_norm": 2.9373714923858643, + "learning_rate": 0.0001404, + "loss": 1.3802, + "step": 703 + }, + { + "epoch": 0.22603949269545673, + "grad_norm": 2.5629308223724365, + "learning_rate": 0.0001406, + "loss": 0.8298, + "step": 704 + }, + { + "epoch": 0.22636057152030822, + "grad_norm": 2.515796184539795, + "learning_rate": 0.0001408, + "loss": 0.7154, + "step": 705 + }, + { + "epoch": 0.22668165034515975, + "grad_norm": 2.616245985031128, + "learning_rate": 0.000141, + "loss": 0.8415, + "step": 706 + }, + { + "epoch": 0.22700272917001124, + "grad_norm": 2.6605446338653564, + "learning_rate": 0.0001412, + "loss": 0.829, + "step": 707 + }, + { + "epoch": 0.22732380799486274, + "grad_norm": 2.9635884761810303, + "learning_rate": 0.0001414, + "loss": 0.9828, + "step": 708 + }, + { + "epoch": 0.22764488681971423, + "grad_norm": 3.6770541667938232, + "learning_rate": 0.0001416, + "loss": 1.0846, + "step": 709 + }, + { + "epoch": 0.22796596564456575, + "grad_norm": 2.220874071121216, + "learning_rate": 0.0001418, + "loss": 0.9111, + "step": 710 + }, + { + "epoch": 0.22828704446941725, + "grad_norm": 2.443786859512329, + "learning_rate": 0.000142, + "loss": 0.9152, + "step": 711 + }, + { + "epoch": 0.22860812329426874, + "grad_norm": 2.4385950565338135, + "learning_rate": 0.0001422, + "loss": 0.9161, + "step": 712 + }, + { + "epoch": 0.22892920211912024, + "grad_norm": 1.6144802570343018, + "learning_rate": 0.0001424, + "loss": 0.9943, + "step": 713 + }, + { + "epoch": 0.22925028094397173, + "grad_norm": 2.6351311206817627, + "learning_rate": 0.0001426, + "loss": 0.9274, + "step": 714 + }, + { + "epoch": 0.22957135976882326, + "grad_norm": 1.7359614372253418, + "learning_rate": 0.0001428, + "loss": 1.099, + "step": 715 + }, + { + "epoch": 0.22989243859367475, + "grad_norm": 1.4439224004745483, + "learning_rate": 0.000143, + "loss": 1.0443, + "step": 716 + }, + { + "epoch": 0.23021351741852625, + "grad_norm": 2.533806085586548, + "learning_rate": 0.0001432, + "loss": 1.0614, + "step": 717 + }, + { + "epoch": 0.23053459624337774, + "grad_norm": 2.2040460109710693, + "learning_rate": 0.0001434, + "loss": 0.8822, + "step": 718 + }, + { + "epoch": 0.23085567506822924, + "grad_norm": 1.4895590543746948, + "learning_rate": 0.0001436, + "loss": 1.244, + "step": 719 + }, + { + "epoch": 0.23117675389308076, + "grad_norm": 2.5897271633148193, + "learning_rate": 0.0001438, + "loss": 1.2112, + "step": 720 + }, + { + "epoch": 0.23149783271793226, + "grad_norm": 1.7311580181121826, + "learning_rate": 0.000144, + "loss": 0.9964, + "step": 721 + }, + { + "epoch": 0.23181891154278375, + "grad_norm": 1.475502610206604, + "learning_rate": 0.0001442, + "loss": 1.0241, + "step": 722 + }, + { + "epoch": 0.23213999036763525, + "grad_norm": 2.242170810699463, + "learning_rate": 0.0001444, + "loss": 1.2009, + "step": 723 + }, + { + "epoch": 0.23246106919248677, + "grad_norm": 1.811926007270813, + "learning_rate": 0.0001446, + "loss": 0.7842, + "step": 724 + }, + { + "epoch": 0.23278214801733826, + "grad_norm": 1.8713488578796387, + "learning_rate": 0.0001448, + "loss": 1.0161, + "step": 725 + }, + { + "epoch": 0.23310322684218976, + "grad_norm": 1.8007389307022095, + "learning_rate": 0.000145, + "loss": 1.1193, + "step": 726 + }, + { + "epoch": 0.23342430566704125, + "grad_norm": 1.432399868965149, + "learning_rate": 0.0001452, + "loss": 0.9496, + "step": 727 + }, + { + "epoch": 0.23374538449189275, + "grad_norm": 1.667504072189331, + "learning_rate": 0.0001454, + "loss": 1.0116, + "step": 728 + }, + { + "epoch": 0.23406646331674427, + "grad_norm": 1.476586103439331, + "learning_rate": 0.00014560000000000002, + "loss": 1.186, + "step": 729 + }, + { + "epoch": 0.23438754214159577, + "grad_norm": 1.715766429901123, + "learning_rate": 0.0001458, + "loss": 0.9619, + "step": 730 + }, + { + "epoch": 0.23470862096644726, + "grad_norm": 1.6196917295455933, + "learning_rate": 0.000146, + "loss": 0.7596, + "step": 731 + }, + { + "epoch": 0.23502969979129876, + "grad_norm": 2.2061188220977783, + "learning_rate": 0.0001462, + "loss": 0.8178, + "step": 732 + }, + { + "epoch": 0.23535077861615025, + "grad_norm": 1.9532783031463623, + "learning_rate": 0.0001464, + "loss": 1.0697, + "step": 733 + }, + { + "epoch": 0.23567185744100178, + "grad_norm": 2.0365941524505615, + "learning_rate": 0.0001466, + "loss": 0.9422, + "step": 734 + }, + { + "epoch": 0.23599293626585327, + "grad_norm": 2.11383318901062, + "learning_rate": 0.00014680000000000002, + "loss": 1.1384, + "step": 735 + }, + { + "epoch": 0.23631401509070477, + "grad_norm": 1.7044955492019653, + "learning_rate": 0.000147, + "loss": 1.0329, + "step": 736 + }, + { + "epoch": 0.23663509391555626, + "grad_norm": 1.1772584915161133, + "learning_rate": 0.0001472, + "loss": 0.7503, + "step": 737 + }, + { + "epoch": 0.23695617274040778, + "grad_norm": 1.8224774599075317, + "learning_rate": 0.0001474, + "loss": 1.0196, + "step": 738 + }, + { + "epoch": 0.23727725156525928, + "grad_norm": 1.4998708963394165, + "learning_rate": 0.0001476, + "loss": 0.8942, + "step": 739 + }, + { + "epoch": 0.23759833039011077, + "grad_norm": 1.2025278806686401, + "learning_rate": 0.00014780000000000001, + "loss": 0.8372, + "step": 740 + }, + { + "epoch": 0.23791940921496227, + "grad_norm": 1.9754714965820312, + "learning_rate": 0.000148, + "loss": 1.1058, + "step": 741 + }, + { + "epoch": 0.23824048803981376, + "grad_norm": 1.4123296737670898, + "learning_rate": 0.0001482, + "loss": 0.897, + "step": 742 + }, + { + "epoch": 0.2385615668646653, + "grad_norm": 1.8686493635177612, + "learning_rate": 0.0001484, + "loss": 1.1749, + "step": 743 + }, + { + "epoch": 0.23888264568951678, + "grad_norm": 2.425736904144287, + "learning_rate": 0.0001486, + "loss": 0.9771, + "step": 744 + }, + { + "epoch": 0.23920372451436828, + "grad_norm": 2.431823492050171, + "learning_rate": 0.0001488, + "loss": 0.8766, + "step": 745 + }, + { + "epoch": 0.23952480333921977, + "grad_norm": 3.2059309482574463, + "learning_rate": 0.00014900000000000002, + "loss": 0.86, + "step": 746 + }, + { + "epoch": 0.23984588216407127, + "grad_norm": 1.9675642251968384, + "learning_rate": 0.0001492, + "loss": 0.7826, + "step": 747 + }, + { + "epoch": 0.2401669609889228, + "grad_norm": 2.1029534339904785, + "learning_rate": 0.0001494, + "loss": 0.9178, + "step": 748 + }, + { + "epoch": 0.24048803981377428, + "grad_norm": 1.725165605545044, + "learning_rate": 0.0001496, + "loss": 0.7452, + "step": 749 + }, + { + "epoch": 0.24080911863862578, + "grad_norm": 2.5603771209716797, + "learning_rate": 0.0001498, + "loss": 0.7243, + "step": 750 + }, + { + "epoch": 0.24113019746347727, + "grad_norm": 3.2743546962738037, + "learning_rate": 0.00015000000000000001, + "loss": 1.5645, + "step": 751 + }, + { + "epoch": 0.2414512762883288, + "grad_norm": 2.220749616622925, + "learning_rate": 0.00015020000000000002, + "loss": 1.2963, + "step": 752 + }, + { + "epoch": 0.2417723551131803, + "grad_norm": 2.2174606323242188, + "learning_rate": 0.0001504, + "loss": 0.9431, + "step": 753 + }, + { + "epoch": 0.2420934339380318, + "grad_norm": 2.5543525218963623, + "learning_rate": 0.0001506, + "loss": 0.7803, + "step": 754 + }, + { + "epoch": 0.24241451276288328, + "grad_norm": 2.6568639278411865, + "learning_rate": 0.0001508, + "loss": 0.7203, + "step": 755 + }, + { + "epoch": 0.24273559158773478, + "grad_norm": 2.6093509197235107, + "learning_rate": 0.000151, + "loss": 0.7474, + "step": 756 + }, + { + "epoch": 0.2430566704125863, + "grad_norm": 3.100468635559082, + "learning_rate": 0.00015120000000000002, + "loss": 1.1042, + "step": 757 + }, + { + "epoch": 0.2433777492374378, + "grad_norm": 2.6489973068237305, + "learning_rate": 0.00015140000000000002, + "loss": 1.1115, + "step": 758 + }, + { + "epoch": 0.2436988280622893, + "grad_norm": 1.9445077180862427, + "learning_rate": 0.0001516, + "loss": 1.2512, + "step": 759 + }, + { + "epoch": 0.2440199068871408, + "grad_norm": 2.7067511081695557, + "learning_rate": 0.0001518, + "loss": 1.0337, + "step": 760 + }, + { + "epoch": 0.24434098571199228, + "grad_norm": 1.793434977531433, + "learning_rate": 0.000152, + "loss": 1.2837, + "step": 761 + }, + { + "epoch": 0.2446620645368438, + "grad_norm": 2.2494122982025146, + "learning_rate": 0.0001522, + "loss": 0.9891, + "step": 762 + }, + { + "epoch": 0.2449831433616953, + "grad_norm": 2.235886812210083, + "learning_rate": 0.00015240000000000002, + "loss": 1.0547, + "step": 763 + }, + { + "epoch": 0.2453042221865468, + "grad_norm": 1.8249210119247437, + "learning_rate": 0.00015260000000000002, + "loss": 0.9715, + "step": 764 + }, + { + "epoch": 0.2456253010113983, + "grad_norm": 1.6558443307876587, + "learning_rate": 0.0001528, + "loss": 0.9652, + "step": 765 + }, + { + "epoch": 0.2459463798362498, + "grad_norm": 1.6599318981170654, + "learning_rate": 0.000153, + "loss": 1.2118, + "step": 766 + }, + { + "epoch": 0.2462674586611013, + "grad_norm": 2.2288310527801514, + "learning_rate": 0.0001532, + "loss": 1.0511, + "step": 767 + }, + { + "epoch": 0.2465885374859528, + "grad_norm": 2.151366949081421, + "learning_rate": 0.00015340000000000002, + "loss": 1.0277, + "step": 768 + }, + { + "epoch": 0.2469096163108043, + "grad_norm": 1.8321627378463745, + "learning_rate": 0.00015360000000000002, + "loss": 1.0368, + "step": 769 + }, + { + "epoch": 0.2472306951356558, + "grad_norm": 1.2890126705169678, + "learning_rate": 0.0001538, + "loss": 1.0081, + "step": 770 + }, + { + "epoch": 0.24755177396050732, + "grad_norm": 2.4398739337921143, + "learning_rate": 0.000154, + "loss": 0.8832, + "step": 771 + }, + { + "epoch": 0.2478728527853588, + "grad_norm": 2.993023157119751, + "learning_rate": 0.0001542, + "loss": 1.2266, + "step": 772 + }, + { + "epoch": 0.2481939316102103, + "grad_norm": 1.544333577156067, + "learning_rate": 0.0001544, + "loss": 1.0772, + "step": 773 + }, + { + "epoch": 0.2485150104350618, + "grad_norm": 1.9021294116973877, + "learning_rate": 0.00015460000000000002, + "loss": 1.1266, + "step": 774 + }, + { + "epoch": 0.2488360892599133, + "grad_norm": 2.129915952682495, + "learning_rate": 0.00015480000000000002, + "loss": 1.0554, + "step": 775 + }, + { + "epoch": 0.24915716808476482, + "grad_norm": 2.156653642654419, + "learning_rate": 0.000155, + "loss": 0.9094, + "step": 776 + }, + { + "epoch": 0.2494782469096163, + "grad_norm": 2.2110507488250732, + "learning_rate": 0.0001552, + "loss": 1.1234, + "step": 777 + }, + { + "epoch": 0.2497993257344678, + "grad_norm": 1.9623451232910156, + "learning_rate": 0.0001554, + "loss": 1.071, + "step": 778 + }, + { + "epoch": 0.25012040455931933, + "grad_norm": 2.1549320220947266, + "learning_rate": 0.00015560000000000001, + "loss": 1.1625, + "step": 779 + }, + { + "epoch": 0.2504414833841708, + "grad_norm": 1.2295719385147095, + "learning_rate": 0.00015580000000000002, + "loss": 0.8526, + "step": 780 + }, + { + "epoch": 0.2507625622090223, + "grad_norm": 1.3640968799591064, + "learning_rate": 0.00015600000000000002, + "loss": 0.9144, + "step": 781 + }, + { + "epoch": 0.2510836410338738, + "grad_norm": 2.0186564922332764, + "learning_rate": 0.0001562, + "loss": 1.0645, + "step": 782 + }, + { + "epoch": 0.2514047198587253, + "grad_norm": 2.253253698348999, + "learning_rate": 0.0001564, + "loss": 1.087, + "step": 783 + }, + { + "epoch": 0.2517257986835768, + "grad_norm": 2.00610089302063, + "learning_rate": 0.0001566, + "loss": 1.0834, + "step": 784 + }, + { + "epoch": 0.2520468775084283, + "grad_norm": 1.227858543395996, + "learning_rate": 0.00015680000000000002, + "loss": 0.8328, + "step": 785 + }, + { + "epoch": 0.2523679563332798, + "grad_norm": 1.6620421409606934, + "learning_rate": 0.00015700000000000002, + "loss": 0.9007, + "step": 786 + }, + { + "epoch": 0.25268903515813135, + "grad_norm": 1.969337821006775, + "learning_rate": 0.00015720000000000003, + "loss": 0.8842, + "step": 787 + }, + { + "epoch": 0.25301011398298284, + "grad_norm": 1.8422551155090332, + "learning_rate": 0.0001574, + "loss": 1.0329, + "step": 788 + }, + { + "epoch": 0.25333119280783434, + "grad_norm": 2.0123472213745117, + "learning_rate": 0.0001576, + "loss": 1.0015, + "step": 789 + }, + { + "epoch": 0.25365227163268583, + "grad_norm": 1.7080883979797363, + "learning_rate": 0.00015780000000000001, + "loss": 1.0998, + "step": 790 + }, + { + "epoch": 0.25397335045753733, + "grad_norm": 3.233981132507324, + "learning_rate": 0.00015800000000000002, + "loss": 1.132, + "step": 791 + }, + { + "epoch": 0.2542944292823888, + "grad_norm": 1.4392627477645874, + "learning_rate": 0.00015820000000000002, + "loss": 0.8428, + "step": 792 + }, + { + "epoch": 0.2546155081072403, + "grad_norm": 1.7838289737701416, + "learning_rate": 0.00015840000000000003, + "loss": 0.986, + "step": 793 + }, + { + "epoch": 0.2549365869320918, + "grad_norm": 2.152930498123169, + "learning_rate": 0.0001586, + "loss": 0.7264, + "step": 794 + }, + { + "epoch": 0.2552576657569433, + "grad_norm": 1.7008942365646362, + "learning_rate": 0.0001588, + "loss": 1.2359, + "step": 795 + }, + { + "epoch": 0.2555787445817948, + "grad_norm": 1.2774722576141357, + "learning_rate": 0.00015900000000000002, + "loss": 0.8856, + "step": 796 + }, + { + "epoch": 0.25589982340664635, + "grad_norm": 1.5711169242858887, + "learning_rate": 0.00015920000000000002, + "loss": 0.8394, + "step": 797 + }, + { + "epoch": 0.25622090223149785, + "grad_norm": 2.9855380058288574, + "learning_rate": 0.00015940000000000003, + "loss": 0.8879, + "step": 798 + }, + { + "epoch": 0.25654198105634934, + "grad_norm": 2.1591596603393555, + "learning_rate": 0.0001596, + "loss": 0.8859, + "step": 799 + }, + { + "epoch": 0.25686305988120084, + "grad_norm": 1.9048887491226196, + "learning_rate": 0.0001598, + "loss": 0.5577, + "step": 800 + }, + { + "epoch": 0.25718413870605233, + "grad_norm": 18.338882446289062, + "learning_rate": 0.00016, + "loss": 1.7698, + "step": 801 + }, + { + "epoch": 0.25750521753090383, + "grad_norm": 2.443437099456787, + "learning_rate": 0.00016020000000000002, + "loss": 1.5749, + "step": 802 + }, + { + "epoch": 0.2578262963557553, + "grad_norm": 2.9275338649749756, + "learning_rate": 0.00016040000000000002, + "loss": 1.1417, + "step": 803 + }, + { + "epoch": 0.2581473751806068, + "grad_norm": 2.1856069564819336, + "learning_rate": 0.00016060000000000003, + "loss": 0.919, + "step": 804 + }, + { + "epoch": 0.2584684540054583, + "grad_norm": 2.806535005569458, + "learning_rate": 0.0001608, + "loss": 0.8787, + "step": 805 + }, + { + "epoch": 0.25878953283030987, + "grad_norm": 2.342740774154663, + "learning_rate": 0.000161, + "loss": 0.8913, + "step": 806 + }, + { + "epoch": 0.25911061165516136, + "grad_norm": 2.3539390563964844, + "learning_rate": 0.00016120000000000002, + "loss": 0.9415, + "step": 807 + }, + { + "epoch": 0.25943169048001286, + "grad_norm": 2.2060728073120117, + "learning_rate": 0.00016140000000000002, + "loss": 1.1096, + "step": 808 + }, + { + "epoch": 0.25975276930486435, + "grad_norm": 1.5557055473327637, + "learning_rate": 0.00016160000000000002, + "loss": 0.9594, + "step": 809 + }, + { + "epoch": 0.26007384812971585, + "grad_norm": 1.8541979789733887, + "learning_rate": 0.00016180000000000003, + "loss": 0.9541, + "step": 810 + }, + { + "epoch": 0.26039492695456734, + "grad_norm": 2.1980228424072266, + "learning_rate": 0.000162, + "loss": 1.0466, + "step": 811 + }, + { + "epoch": 0.26071600577941884, + "grad_norm": 3.6660101413726807, + "learning_rate": 0.0001622, + "loss": 1.045, + "step": 812 + }, + { + "epoch": 0.26103708460427033, + "grad_norm": 1.4750818014144897, + "learning_rate": 0.00016240000000000002, + "loss": 0.9617, + "step": 813 + }, + { + "epoch": 0.2613581634291218, + "grad_norm": 1.7362655401229858, + "learning_rate": 0.0001626, + "loss": 1.0644, + "step": 814 + }, + { + "epoch": 0.2616792422539734, + "grad_norm": 3.413910150527954, + "learning_rate": 0.0001628, + "loss": 0.9958, + "step": 815 + }, + { + "epoch": 0.26200032107882487, + "grad_norm": 1.9933480024337769, + "learning_rate": 0.000163, + "loss": 1.0984, + "step": 816 + }, + { + "epoch": 0.26232139990367637, + "grad_norm": 2.60626220703125, + "learning_rate": 0.0001632, + "loss": 0.9565, + "step": 817 + }, + { + "epoch": 0.26264247872852786, + "grad_norm": 2.5874414443969727, + "learning_rate": 0.0001634, + "loss": 1.1555, + "step": 818 + }, + { + "epoch": 0.26296355755337936, + "grad_norm": 1.991832971572876, + "learning_rate": 0.0001636, + "loss": 0.879, + "step": 819 + }, + { + "epoch": 0.26328463637823085, + "grad_norm": 2.291994333267212, + "learning_rate": 0.0001638, + "loss": 1.1457, + "step": 820 + }, + { + "epoch": 0.26360571520308235, + "grad_norm": 2.4672718048095703, + "learning_rate": 0.000164, + "loss": 1.0751, + "step": 821 + }, + { + "epoch": 0.26392679402793384, + "grad_norm": 1.6756705045700073, + "learning_rate": 0.0001642, + "loss": 0.8653, + "step": 822 + }, + { + "epoch": 0.26424787285278534, + "grad_norm": 1.8445497751235962, + "learning_rate": 0.0001644, + "loss": 0.9074, + "step": 823 + }, + { + "epoch": 0.26456895167763683, + "grad_norm": 1.6544156074523926, + "learning_rate": 0.0001646, + "loss": 0.9691, + "step": 824 + }, + { + "epoch": 0.2648900305024884, + "grad_norm": 1.8549728393554688, + "learning_rate": 0.0001648, + "loss": 1.1391, + "step": 825 + }, + { + "epoch": 0.2652111093273399, + "grad_norm": 2.89859938621521, + "learning_rate": 0.000165, + "loss": 1.0288, + "step": 826 + }, + { + "epoch": 0.2655321881521914, + "grad_norm": 3.0639214515686035, + "learning_rate": 0.0001652, + "loss": 1.1499, + "step": 827 + }, + { + "epoch": 0.26585326697704287, + "grad_norm": 1.4466873407363892, + "learning_rate": 0.0001654, + "loss": 0.9982, + "step": 828 + }, + { + "epoch": 0.26617434580189436, + "grad_norm": 2.002682685852051, + "learning_rate": 0.0001656, + "loss": 0.9603, + "step": 829 + }, + { + "epoch": 0.26649542462674586, + "grad_norm": 2.583599805831909, + "learning_rate": 0.0001658, + "loss": 1.1498, + "step": 830 + }, + { + "epoch": 0.26681650345159735, + "grad_norm": 1.9900764226913452, + "learning_rate": 0.000166, + "loss": 1.1214, + "step": 831 + }, + { + "epoch": 0.26713758227644885, + "grad_norm": 1.384462594985962, + "learning_rate": 0.0001662, + "loss": 0.7923, + "step": 832 + }, + { + "epoch": 0.26745866110130034, + "grad_norm": 7.388343334197998, + "learning_rate": 0.0001664, + "loss": 1.2836, + "step": 833 + }, + { + "epoch": 0.2677797399261519, + "grad_norm": 1.9020540714263916, + "learning_rate": 0.0001666, + "loss": 1.0163, + "step": 834 + }, + { + "epoch": 0.2681008187510034, + "grad_norm": 1.6172999143600464, + "learning_rate": 0.0001668, + "loss": 1.1151, + "step": 835 + }, + { + "epoch": 0.2684218975758549, + "grad_norm": 1.3895915746688843, + "learning_rate": 0.000167, + "loss": 1.2039, + "step": 836 + }, + { + "epoch": 0.2687429764007064, + "grad_norm": 4.345794200897217, + "learning_rate": 0.0001672, + "loss": 1.4846, + "step": 837 + }, + { + "epoch": 0.2690640552255579, + "grad_norm": 1.9014428853988647, + "learning_rate": 0.0001674, + "loss": 1.164, + "step": 838 + }, + { + "epoch": 0.26938513405040937, + "grad_norm": 1.5048143863677979, + "learning_rate": 0.0001676, + "loss": 0.9134, + "step": 839 + }, + { + "epoch": 0.26970621287526086, + "grad_norm": 2.302903890609741, + "learning_rate": 0.0001678, + "loss": 0.9193, + "step": 840 + }, + { + "epoch": 0.27002729170011236, + "grad_norm": 2.238719940185547, + "learning_rate": 0.000168, + "loss": 0.9618, + "step": 841 + }, + { + "epoch": 0.27034837052496385, + "grad_norm": 1.821960210800171, + "learning_rate": 0.0001682, + "loss": 0.9859, + "step": 842 + }, + { + "epoch": 0.2706694493498154, + "grad_norm": 1.5913723707199097, + "learning_rate": 0.0001684, + "loss": 0.7387, + "step": 843 + }, + { + "epoch": 0.2709905281746669, + "grad_norm": 2.4586009979248047, + "learning_rate": 0.0001686, + "loss": 1.1114, + "step": 844 + }, + { + "epoch": 0.2713116069995184, + "grad_norm": 1.5896835327148438, + "learning_rate": 0.0001688, + "loss": 0.8628, + "step": 845 + }, + { + "epoch": 0.2716326858243699, + "grad_norm": 1.7305841445922852, + "learning_rate": 0.00016900000000000002, + "loss": 0.952, + "step": 846 + }, + { + "epoch": 0.2719537646492214, + "grad_norm": 2.2322258949279785, + "learning_rate": 0.0001692, + "loss": 0.8601, + "step": 847 + }, + { + "epoch": 0.2722748434740729, + "grad_norm": 3.5011134147644043, + "learning_rate": 0.0001694, + "loss": 0.8119, + "step": 848 + }, + { + "epoch": 0.2725959222989244, + "grad_norm": 1.6285743713378906, + "learning_rate": 0.0001696, + "loss": 0.8586, + "step": 849 + }, + { + "epoch": 0.27291700112377587, + "grad_norm": 2.2089757919311523, + "learning_rate": 0.0001698, + "loss": 0.7086, + "step": 850 + }, + { + "epoch": 0.27323807994862737, + "grad_norm": 3.487232208251953, + "learning_rate": 0.00017, + "loss": 1.52, + "step": 851 + }, + { + "epoch": 0.2735591587734789, + "grad_norm": 2.836216688156128, + "learning_rate": 0.00017020000000000002, + "loss": 1.4958, + "step": 852 + }, + { + "epoch": 0.2738802375983304, + "grad_norm": 2.933957099914551, + "learning_rate": 0.0001704, + "loss": 1.1824, + "step": 853 + }, + { + "epoch": 0.2742013164231819, + "grad_norm": 2.5374979972839355, + "learning_rate": 0.0001706, + "loss": 1.0371, + "step": 854 + }, + { + "epoch": 0.2745223952480334, + "grad_norm": 2.2211618423461914, + "learning_rate": 0.0001708, + "loss": 0.8578, + "step": 855 + }, + { + "epoch": 0.2748434740728849, + "grad_norm": 2.6845898628234863, + "learning_rate": 0.000171, + "loss": 0.8998, + "step": 856 + }, + { + "epoch": 0.2751645528977364, + "grad_norm": 2.2504897117614746, + "learning_rate": 0.00017120000000000001, + "loss": 0.8661, + "step": 857 + }, + { + "epoch": 0.2754856317225879, + "grad_norm": 2.5308399200439453, + "learning_rate": 0.0001714, + "loss": 0.8642, + "step": 858 + }, + { + "epoch": 0.2758067105474394, + "grad_norm": 2.6598222255706787, + "learning_rate": 0.0001716, + "loss": 0.8098, + "step": 859 + }, + { + "epoch": 0.2761277893722909, + "grad_norm": 1.8385624885559082, + "learning_rate": 0.0001718, + "loss": 1.0242, + "step": 860 + }, + { + "epoch": 0.2764488681971424, + "grad_norm": 2.0007526874542236, + "learning_rate": 0.000172, + "loss": 1.166, + "step": 861 + }, + { + "epoch": 0.2767699470219939, + "grad_norm": 3.164480209350586, + "learning_rate": 0.0001722, + "loss": 0.9461, + "step": 862 + }, + { + "epoch": 0.2770910258468454, + "grad_norm": 1.8709050416946411, + "learning_rate": 0.00017240000000000002, + "loss": 1.0128, + "step": 863 + }, + { + "epoch": 0.2774121046716969, + "grad_norm": 1.6714998483657837, + "learning_rate": 0.0001726, + "loss": 0.8408, + "step": 864 + }, + { + "epoch": 0.2777331834965484, + "grad_norm": 1.7060233354568481, + "learning_rate": 0.0001728, + "loss": 0.796, + "step": 865 + }, + { + "epoch": 0.2780542623213999, + "grad_norm": 1.3818310499191284, + "learning_rate": 0.000173, + "loss": 1.0166, + "step": 866 + }, + { + "epoch": 0.2783753411462514, + "grad_norm": 1.8317921161651611, + "learning_rate": 0.0001732, + "loss": 1.0847, + "step": 867 + }, + { + "epoch": 0.2786964199711029, + "grad_norm": 2.0381312370300293, + "learning_rate": 0.0001734, + "loss": 0.9889, + "step": 868 + }, + { + "epoch": 0.2790174987959544, + "grad_norm": 2.772416353225708, + "learning_rate": 0.00017360000000000002, + "loss": 1.1122, + "step": 869 + }, + { + "epoch": 0.2793385776208059, + "grad_norm": 4.282870292663574, + "learning_rate": 0.0001738, + "loss": 1.2099, + "step": 870 + }, + { + "epoch": 0.27965965644565743, + "grad_norm": 1.2967722415924072, + "learning_rate": 0.000174, + "loss": 0.9476, + "step": 871 + }, + { + "epoch": 0.27998073527050893, + "grad_norm": 1.8298275470733643, + "learning_rate": 0.0001742, + "loss": 1.0377, + "step": 872 + }, + { + "epoch": 0.2803018140953604, + "grad_norm": 2.2432875633239746, + "learning_rate": 0.0001744, + "loss": 1.0409, + "step": 873 + }, + { + "epoch": 0.2806228929202119, + "grad_norm": 1.3698315620422363, + "learning_rate": 0.00017460000000000002, + "loss": 1.0456, + "step": 874 + }, + { + "epoch": 0.2809439717450634, + "grad_norm": 1.9452381134033203, + "learning_rate": 0.00017480000000000002, + "loss": 0.8272, + "step": 875 + }, + { + "epoch": 0.2812650505699149, + "grad_norm": 1.601507306098938, + "learning_rate": 0.000175, + "loss": 1.0913, + "step": 876 + }, + { + "epoch": 0.2815861293947664, + "grad_norm": 3.9005463123321533, + "learning_rate": 0.0001752, + "loss": 1.4486, + "step": 877 + }, + { + "epoch": 0.2819072082196179, + "grad_norm": 2.154240608215332, + "learning_rate": 0.0001754, + "loss": 0.9732, + "step": 878 + }, + { + "epoch": 0.2822282870444694, + "grad_norm": 1.2495157718658447, + "learning_rate": 0.0001756, + "loss": 1.0217, + "step": 879 + }, + { + "epoch": 0.28254936586932095, + "grad_norm": 6.445877552032471, + "learning_rate": 0.00017580000000000002, + "loss": 1.1539, + "step": 880 + }, + { + "epoch": 0.28287044469417244, + "grad_norm": 2.6896142959594727, + "learning_rate": 0.00017600000000000002, + "loss": 1.0758, + "step": 881 + }, + { + "epoch": 0.28319152351902394, + "grad_norm": 1.3675345182418823, + "learning_rate": 0.0001762, + "loss": 1.0229, + "step": 882 + }, + { + "epoch": 0.28351260234387543, + "grad_norm": 9.003292083740234, + "learning_rate": 0.0001764, + "loss": 1.1875, + "step": 883 + }, + { + "epoch": 0.2838336811687269, + "grad_norm": 1.7090665102005005, + "learning_rate": 0.0001766, + "loss": 1.2299, + "step": 884 + }, + { + "epoch": 0.2841547599935784, + "grad_norm": 2.320629119873047, + "learning_rate": 0.00017680000000000001, + "loss": 1.3493, + "step": 885 + }, + { + "epoch": 0.2844758388184299, + "grad_norm": 3.2875213623046875, + "learning_rate": 0.00017700000000000002, + "loss": 1.0224, + "step": 886 + }, + { + "epoch": 0.2847969176432814, + "grad_norm": 3.1284916400909424, + "learning_rate": 0.0001772, + "loss": 1.0231, + "step": 887 + }, + { + "epoch": 0.2851179964681329, + "grad_norm": 1.9187723398208618, + "learning_rate": 0.0001774, + "loss": 1.0035, + "step": 888 + }, + { + "epoch": 0.2854390752929844, + "grad_norm": 1.3869761228561401, + "learning_rate": 0.0001776, + "loss": 0.8738, + "step": 889 + }, + { + "epoch": 0.28576015411783595, + "grad_norm": 1.281718134880066, + "learning_rate": 0.0001778, + "loss": 0.8464, + "step": 890 + }, + { + "epoch": 0.28608123294268745, + "grad_norm": 1.203602910041809, + "learning_rate": 0.00017800000000000002, + "loss": 0.9197, + "step": 891 + }, + { + "epoch": 0.28640231176753894, + "grad_norm": 1.9265743494033813, + "learning_rate": 0.00017820000000000002, + "loss": 1.0422, + "step": 892 + }, + { + "epoch": 0.28672339059239044, + "grad_norm": 1.4553351402282715, + "learning_rate": 0.0001784, + "loss": 1.1858, + "step": 893 + }, + { + "epoch": 0.28704446941724193, + "grad_norm": 1.6324553489685059, + "learning_rate": 0.0001786, + "loss": 0.9635, + "step": 894 + }, + { + "epoch": 0.2873655482420934, + "grad_norm": 4.445520401000977, + "learning_rate": 0.0001788, + "loss": 1.1842, + "step": 895 + }, + { + "epoch": 0.2876866270669449, + "grad_norm": 2.625737428665161, + "learning_rate": 0.00017900000000000001, + "loss": 1.0714, + "step": 896 + }, + { + "epoch": 0.2880077058917964, + "grad_norm": 1.3031972646713257, + "learning_rate": 0.00017920000000000002, + "loss": 0.9172, + "step": 897 + }, + { + "epoch": 0.2883287847166479, + "grad_norm": 1.1816191673278809, + "learning_rate": 0.00017940000000000002, + "loss": 0.8658, + "step": 898 + }, + { + "epoch": 0.28864986354149946, + "grad_norm": 2.7634835243225098, + "learning_rate": 0.0001796, + "loss": 0.8205, + "step": 899 + }, + { + "epoch": 0.28897094236635096, + "grad_norm": 1.715277910232544, + "learning_rate": 0.0001798, + "loss": 0.6063, + "step": 900 + }, + { + "epoch": 0.28929202119120245, + "grad_norm": 5.679165840148926, + "learning_rate": 0.00018, + "loss": 1.6148, + "step": 901 + }, + { + "epoch": 0.28961310001605395, + "grad_norm": 8.520236015319824, + "learning_rate": 0.00018020000000000002, + "loss": 1.669, + "step": 902 + }, + { + "epoch": 0.28993417884090544, + "grad_norm": 5.708471298217773, + "learning_rate": 0.00018040000000000002, + "loss": 1.0973, + "step": 903 + }, + { + "epoch": 0.29025525766575694, + "grad_norm": 4.294043064117432, + "learning_rate": 0.00018060000000000003, + "loss": 0.9894, + "step": 904 + }, + { + "epoch": 0.29057633649060843, + "grad_norm": 4.61492919921875, + "learning_rate": 0.0001808, + "loss": 0.9369, + "step": 905 + }, + { + "epoch": 0.29089741531545993, + "grad_norm": 2.621717691421509, + "learning_rate": 0.000181, + "loss": 1.0031, + "step": 906 + }, + { + "epoch": 0.2912184941403114, + "grad_norm": 2.668614149093628, + "learning_rate": 0.0001812, + "loss": 0.9567, + "step": 907 + }, + { + "epoch": 0.291539572965163, + "grad_norm": 2.6157805919647217, + "learning_rate": 0.00018140000000000002, + "loss": 1.0925, + "step": 908 + }, + { + "epoch": 0.29186065179001447, + "grad_norm": 2.2772955894470215, + "learning_rate": 0.00018160000000000002, + "loss": 1.0826, + "step": 909 + }, + { + "epoch": 0.29218173061486596, + "grad_norm": 2.532787561416626, + "learning_rate": 0.00018180000000000003, + "loss": 1.0492, + "step": 910 + }, + { + "epoch": 0.29250280943971746, + "grad_norm": 2.4008162021636963, + "learning_rate": 0.000182, + "loss": 0.9815, + "step": 911 + }, + { + "epoch": 0.29282388826456895, + "grad_norm": 1.3333711624145508, + "learning_rate": 0.0001822, + "loss": 1.071, + "step": 912 + }, + { + "epoch": 0.29314496708942045, + "grad_norm": 2.2058145999908447, + "learning_rate": 0.00018240000000000002, + "loss": 1.1503, + "step": 913 + }, + { + "epoch": 0.29346604591427194, + "grad_norm": 1.4182907342910767, + "learning_rate": 0.00018260000000000002, + "loss": 1.1016, + "step": 914 + }, + { + "epoch": 0.29378712473912344, + "grad_norm": 2.7944185733795166, + "learning_rate": 0.00018280000000000003, + "loss": 1.233, + "step": 915 + }, + { + "epoch": 0.29410820356397493, + "grad_norm": 1.5195378065109253, + "learning_rate": 0.000183, + "loss": 1.1977, + "step": 916 + }, + { + "epoch": 0.29442928238882643, + "grad_norm": 1.8296455144882202, + "learning_rate": 0.0001832, + "loss": 0.9808, + "step": 917 + }, + { + "epoch": 0.294750361213678, + "grad_norm": 1.8149925470352173, + "learning_rate": 0.0001834, + "loss": 1.0705, + "step": 918 + }, + { + "epoch": 0.2950714400385295, + "grad_norm": 1.741716980934143, + "learning_rate": 0.00018360000000000002, + "loss": 1.1261, + "step": 919 + }, + { + "epoch": 0.29539251886338097, + "grad_norm": 2.0786545276641846, + "learning_rate": 0.00018380000000000002, + "loss": 1.1481, + "step": 920 + }, + { + "epoch": 0.29571359768823247, + "grad_norm": 1.7828667163848877, + "learning_rate": 0.00018400000000000003, + "loss": 1.034, + "step": 921 + }, + { + "epoch": 0.29603467651308396, + "grad_norm": 1.6644797325134277, + "learning_rate": 0.0001842, + "loss": 1.1322, + "step": 922 + }, + { + "epoch": 0.29635575533793546, + "grad_norm": 1.3413432836532593, + "learning_rate": 0.0001844, + "loss": 1.1023, + "step": 923 + }, + { + "epoch": 0.29667683416278695, + "grad_norm": 1.985334873199463, + "learning_rate": 0.00018460000000000001, + "loss": 0.9872, + "step": 924 + }, + { + "epoch": 0.29699791298763845, + "grad_norm": 2.2249557971954346, + "learning_rate": 0.00018480000000000002, + "loss": 1.0467, + "step": 925 + }, + { + "epoch": 0.29731899181248994, + "grad_norm": 1.843141794204712, + "learning_rate": 0.00018500000000000002, + "loss": 1.0913, + "step": 926 + }, + { + "epoch": 0.2976400706373415, + "grad_norm": 1.8132327795028687, + "learning_rate": 0.00018520000000000003, + "loss": 0.9764, + "step": 927 + }, + { + "epoch": 0.297961149462193, + "grad_norm": 2.068922996520996, + "learning_rate": 0.0001854, + "loss": 0.9522, + "step": 928 + }, + { + "epoch": 0.2982822282870445, + "grad_norm": 1.975690245628357, + "learning_rate": 0.0001856, + "loss": 0.961, + "step": 929 + }, + { + "epoch": 0.298603307111896, + "grad_norm": 2.761979103088379, + "learning_rate": 0.00018580000000000002, + "loss": 1.1271, + "step": 930 + }, + { + "epoch": 0.29892438593674747, + "grad_norm": 2.052741527557373, + "learning_rate": 0.00018600000000000002, + "loss": 1.0824, + "step": 931 + }, + { + "epoch": 0.29924546476159897, + "grad_norm": 2.1390645503997803, + "learning_rate": 0.00018620000000000003, + "loss": 1.2875, + "step": 932 + }, + { + "epoch": 0.29956654358645046, + "grad_norm": 2.17155385017395, + "learning_rate": 0.00018640000000000003, + "loss": 1.0559, + "step": 933 + }, + { + "epoch": 0.29988762241130196, + "grad_norm": 1.4446396827697754, + "learning_rate": 0.0001866, + "loss": 1.052, + "step": 934 + }, + { + "epoch": 0.30020870123615345, + "grad_norm": 2.108684778213501, + "learning_rate": 0.00018680000000000001, + "loss": 0.9973, + "step": 935 + }, + { + "epoch": 0.300529780061005, + "grad_norm": 1.7543820142745972, + "learning_rate": 0.00018700000000000002, + "loss": 1.0001, + "step": 936 + }, + { + "epoch": 0.3008508588858565, + "grad_norm": 1.7651071548461914, + "learning_rate": 0.00018720000000000002, + "loss": 0.8726, + "step": 937 + }, + { + "epoch": 0.301171937710708, + "grad_norm": 1.3931697607040405, + "learning_rate": 0.00018740000000000003, + "loss": 0.9752, + "step": 938 + }, + { + "epoch": 0.3014930165355595, + "grad_norm": 1.4437761306762695, + "learning_rate": 0.0001876, + "loss": 0.8859, + "step": 939 + }, + { + "epoch": 0.301814095360411, + "grad_norm": 2.0850377082824707, + "learning_rate": 0.0001878, + "loss": 1.0133, + "step": 940 + }, + { + "epoch": 0.3021351741852625, + "grad_norm": 2.9318671226501465, + "learning_rate": 0.000188, + "loss": 1.0726, + "step": 941 + }, + { + "epoch": 0.302456253010114, + "grad_norm": 1.589202880859375, + "learning_rate": 0.0001882, + "loss": 0.8896, + "step": 942 + }, + { + "epoch": 0.30277733183496547, + "grad_norm": 1.5876160860061646, + "learning_rate": 0.0001884, + "loss": 0.8636, + "step": 943 + }, + { + "epoch": 0.30309841065981696, + "grad_norm": 1.6258230209350586, + "learning_rate": 0.0001886, + "loss": 1.0084, + "step": 944 + }, + { + "epoch": 0.30341948948466846, + "grad_norm": 2.1613311767578125, + "learning_rate": 0.0001888, + "loss": 0.8062, + "step": 945 + }, + { + "epoch": 0.30374056830952, + "grad_norm": 1.6936204433441162, + "learning_rate": 0.00018899999999999999, + "loss": 1.1147, + "step": 946 + }, + { + "epoch": 0.3040616471343715, + "grad_norm": 2.857898473739624, + "learning_rate": 0.0001892, + "loss": 0.8206, + "step": 947 + }, + { + "epoch": 0.304382725959223, + "grad_norm": 1.1513748168945312, + "learning_rate": 0.0001894, + "loss": 0.7425, + "step": 948 + }, + { + "epoch": 0.3047038047840745, + "grad_norm": 1.6653155088424683, + "learning_rate": 0.0001896, + "loss": 0.9059, + "step": 949 + }, + { + "epoch": 0.305024883608926, + "grad_norm": 1.6235933303833008, + "learning_rate": 0.0001898, + "loss": 0.7386, + "step": 950 + }, + { + "epoch": 0.3053459624337775, + "grad_norm": 3.6082000732421875, + "learning_rate": 0.00019, + "loss": 1.5358, + "step": 951 + }, + { + "epoch": 0.305667041258629, + "grad_norm": 2.132099151611328, + "learning_rate": 0.0001902, + "loss": 1.4807, + "step": 952 + }, + { + "epoch": 0.3059881200834805, + "grad_norm": 1.9843887090682983, + "learning_rate": 0.0001904, + "loss": 0.91, + "step": 953 + }, + { + "epoch": 0.30630919890833197, + "grad_norm": 1.8752169609069824, + "learning_rate": 0.0001906, + "loss": 0.686, + "step": 954 + }, + { + "epoch": 0.3066302777331835, + "grad_norm": 3.0422487258911133, + "learning_rate": 0.0001908, + "loss": 1.0079, + "step": 955 + }, + { + "epoch": 0.306951356558035, + "grad_norm": 2.0845787525177, + "learning_rate": 0.000191, + "loss": 0.8389, + "step": 956 + }, + { + "epoch": 0.3072724353828865, + "grad_norm": 2.3233184814453125, + "learning_rate": 0.0001912, + "loss": 0.9484, + "step": 957 + }, + { + "epoch": 0.307593514207738, + "grad_norm": 2.147631883621216, + "learning_rate": 0.0001914, + "loss": 1.0776, + "step": 958 + }, + { + "epoch": 0.3079145930325895, + "grad_norm": 1.7034640312194824, + "learning_rate": 0.0001916, + "loss": 0.9197, + "step": 959 + }, + { + "epoch": 0.308235671857441, + "grad_norm": 1.8278881311416626, + "learning_rate": 0.0001918, + "loss": 1.1262, + "step": 960 + }, + { + "epoch": 0.3085567506822925, + "grad_norm": 2.158137321472168, + "learning_rate": 0.000192, + "loss": 1.0464, + "step": 961 + }, + { + "epoch": 0.308877829507144, + "grad_norm": 1.8429055213928223, + "learning_rate": 0.0001922, + "loss": 1.1075, + "step": 962 + }, + { + "epoch": 0.3091989083319955, + "grad_norm": 1.6389920711517334, + "learning_rate": 0.00019240000000000001, + "loss": 1.1258, + "step": 963 + }, + { + "epoch": 0.30951998715684703, + "grad_norm": 1.337121605873108, + "learning_rate": 0.0001926, + "loss": 0.789, + "step": 964 + }, + { + "epoch": 0.3098410659816985, + "grad_norm": 1.542604684829712, + "learning_rate": 0.0001928, + "loss": 1.1683, + "step": 965 + }, + { + "epoch": 0.31016214480655, + "grad_norm": 1.8985679149627686, + "learning_rate": 0.000193, + "loss": 0.8494, + "step": 966 + }, + { + "epoch": 0.3104832236314015, + "grad_norm": 1.403977870941162, + "learning_rate": 0.0001932, + "loss": 1.2153, + "step": 967 + }, + { + "epoch": 0.310804302456253, + "grad_norm": 1.4651598930358887, + "learning_rate": 0.0001934, + "loss": 1.0217, + "step": 968 + }, + { + "epoch": 0.3111253812811045, + "grad_norm": 1.3526400327682495, + "learning_rate": 0.00019360000000000002, + "loss": 0.956, + "step": 969 + }, + { + "epoch": 0.311446460105956, + "grad_norm": 3.0191409587860107, + "learning_rate": 0.0001938, + "loss": 1.1278, + "step": 970 + }, + { + "epoch": 0.3117675389308075, + "grad_norm": 1.9719985723495483, + "learning_rate": 0.000194, + "loss": 1.1472, + "step": 971 + }, + { + "epoch": 0.312088617755659, + "grad_norm": 2.3695621490478516, + "learning_rate": 0.0001942, + "loss": 1.1406, + "step": 972 + }, + { + "epoch": 0.31240969658051054, + "grad_norm": 1.8187494277954102, + "learning_rate": 0.0001944, + "loss": 1.0193, + "step": 973 + }, + { + "epoch": 0.31273077540536204, + "grad_norm": 1.494602084159851, + "learning_rate": 0.00019460000000000001, + "loss": 1.0561, + "step": 974 + }, + { + "epoch": 0.31305185423021353, + "grad_norm": 2.378328323364258, + "learning_rate": 0.0001948, + "loss": 0.9976, + "step": 975 + }, + { + "epoch": 0.31337293305506503, + "grad_norm": 1.5211881399154663, + "learning_rate": 0.000195, + "loss": 1.1273, + "step": 976 + }, + { + "epoch": 0.3136940118799165, + "grad_norm": 1.2796664237976074, + "learning_rate": 0.0001952, + "loss": 0.8582, + "step": 977 + }, + { + "epoch": 0.314015090704768, + "grad_norm": 1.8555333614349365, + "learning_rate": 0.0001954, + "loss": 1.0359, + "step": 978 + }, + { + "epoch": 0.3143361695296195, + "grad_norm": 1.1893086433410645, + "learning_rate": 0.0001956, + "loss": 0.8822, + "step": 979 + }, + { + "epoch": 0.314657248354471, + "grad_norm": 2.2236552238464355, + "learning_rate": 0.00019580000000000002, + "loss": 1.0201, + "step": 980 + }, + { + "epoch": 0.3149783271793225, + "grad_norm": 2.314167022705078, + "learning_rate": 0.000196, + "loss": 0.9865, + "step": 981 + }, + { + "epoch": 0.315299406004174, + "grad_norm": 2.2062315940856934, + "learning_rate": 0.0001962, + "loss": 1.117, + "step": 982 + }, + { + "epoch": 0.31562048482902555, + "grad_norm": 2.284191846847534, + "learning_rate": 0.0001964, + "loss": 1.1285, + "step": 983 + }, + { + "epoch": 0.31594156365387704, + "grad_norm": 2.3977997303009033, + "learning_rate": 0.0001966, + "loss": 1.2463, + "step": 984 + }, + { + "epoch": 0.31626264247872854, + "grad_norm": 2.204314708709717, + "learning_rate": 0.0001968, + "loss": 0.8849, + "step": 985 + }, + { + "epoch": 0.31658372130358003, + "grad_norm": 3.1310677528381348, + "learning_rate": 0.00019700000000000002, + "loss": 1.2409, + "step": 986 + }, + { + "epoch": 0.31690480012843153, + "grad_norm": 1.4078696966171265, + "learning_rate": 0.0001972, + "loss": 0.8572, + "step": 987 + }, + { + "epoch": 0.317225878953283, + "grad_norm": 1.9044156074523926, + "learning_rate": 0.0001974, + "loss": 0.9414, + "step": 988 + }, + { + "epoch": 0.3175469577781345, + "grad_norm": 1.3988277912139893, + "learning_rate": 0.0001976, + "loss": 1.1101, + "step": 989 + }, + { + "epoch": 0.317868036602986, + "grad_norm": 1.4820644855499268, + "learning_rate": 0.0001978, + "loss": 1.0266, + "step": 990 + }, + { + "epoch": 0.3181891154278375, + "grad_norm": 1.5065770149230957, + "learning_rate": 0.00019800000000000002, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.31851019425268906, + "grad_norm": 1.5369571447372437, + "learning_rate": 0.00019820000000000002, + "loss": 0.9029, + "step": 992 + }, + { + "epoch": 0.31883127307754056, + "grad_norm": 4.79142951965332, + "learning_rate": 0.0001984, + "loss": 0.9041, + "step": 993 + }, + { + "epoch": 0.31915235190239205, + "grad_norm": 2.016233205795288, + "learning_rate": 0.0001986, + "loss": 1.0123, + "step": 994 + }, + { + "epoch": 0.31947343072724355, + "grad_norm": 1.5374374389648438, + "learning_rate": 0.0001988, + "loss": 0.8762, + "step": 995 + }, + { + "epoch": 0.31979450955209504, + "grad_norm": 1.3565267324447632, + "learning_rate": 0.000199, + "loss": 0.7286, + "step": 996 + }, + { + "epoch": 0.32011558837694654, + "grad_norm": 2.1161487102508545, + "learning_rate": 0.00019920000000000002, + "loss": 1.025, + "step": 997 + }, + { + "epoch": 0.32043666720179803, + "grad_norm": 1.741025686264038, + "learning_rate": 0.00019940000000000002, + "loss": 0.7437, + "step": 998 + }, + { + "epoch": 0.3207577460266495, + "grad_norm": 2.049365997314453, + "learning_rate": 0.0001996, + "loss": 0.7756, + "step": 999 + }, + { + "epoch": 0.321078824851501, + "grad_norm": 1.8877449035644531, + "learning_rate": 0.0001998, + "loss": 0.695, + "step": 1000 + }, + { + "epoch": 0.32139990367635257, + "grad_norm": 2.1886143684387207, + "learning_rate": 0.0002, + "loss": 1.4415, + "step": 1001 + }, + { + "epoch": 0.32172098250120407, + "grad_norm": 2.847805976867676, + "learning_rate": 0.00019999999390765168, + "loss": 1.2191, + "step": 1002 + }, + { + "epoch": 0.32204206132605556, + "grad_norm": 2.2231152057647705, + "learning_rate": 0.0001999999756306074, + "loss": 1.0706, + "step": 1003 + }, + { + "epoch": 0.32236314015090706, + "grad_norm": 2.219747304916382, + "learning_rate": 0.00019999994516886946, + "loss": 1.0274, + "step": 1004 + }, + { + "epoch": 0.32268421897575855, + "grad_norm": 2.3217408657073975, + "learning_rate": 0.00019999990252244151, + "loss": 0.9952, + "step": 1005 + }, + { + "epoch": 0.32300529780061005, + "grad_norm": 2.1023569107055664, + "learning_rate": 0.00019999984769132877, + "loss": 0.9075, + "step": 1006 + }, + { + "epoch": 0.32332637662546154, + "grad_norm": 2.414299249649048, + "learning_rate": 0.00019999978067553796, + "loss": 0.916, + "step": 1007 + }, + { + "epoch": 0.32364745545031304, + "grad_norm": 1.7825126647949219, + "learning_rate": 0.00019999970147507713, + "loss": 1.0684, + "step": 1008 + }, + { + "epoch": 0.32396853427516453, + "grad_norm": 1.9296294450759888, + "learning_rate": 0.00019999961008995606, + "loss": 1.1201, + "step": 1009 + }, + { + "epoch": 0.324289613100016, + "grad_norm": 1.7151492834091187, + "learning_rate": 0.00019999950652018584, + "loss": 0.947, + "step": 1010 + }, + { + "epoch": 0.3246106919248676, + "grad_norm": 1.7226728200912476, + "learning_rate": 0.00019999939076577905, + "loss": 1.0448, + "step": 1011 + }, + { + "epoch": 0.3249317707497191, + "grad_norm": 1.8700830936431885, + "learning_rate": 0.00019999926282674983, + "loss": 0.9668, + "step": 1012 + }, + { + "epoch": 0.32525284957457057, + "grad_norm": 1.468245267868042, + "learning_rate": 0.00019999912270311375, + "loss": 0.8688, + "step": 1013 + }, + { + "epoch": 0.32557392839942206, + "grad_norm": 2.5088555812835693, + "learning_rate": 0.00019999897039488796, + "loss": 1.0542, + "step": 1014 + }, + { + "epoch": 0.32589500722427356, + "grad_norm": 1.5317341089248657, + "learning_rate": 0.0001999988059020909, + "loss": 1.1052, + "step": 1015 + }, + { + "epoch": 0.32621608604912505, + "grad_norm": 1.1775538921356201, + "learning_rate": 0.00019999862922474268, + "loss": 1.0154, + "step": 1016 + }, + { + "epoch": 0.32653716487397655, + "grad_norm": 1.4249135255813599, + "learning_rate": 0.00019999844036286483, + "loss": 0.9882, + "step": 1017 + }, + { + "epoch": 0.32685824369882804, + "grad_norm": 1.9009687900543213, + "learning_rate": 0.00019999823931648035, + "loss": 1.173, + "step": 1018 + }, + { + "epoch": 0.32717932252367954, + "grad_norm": 2.4772071838378906, + "learning_rate": 0.0001999980260856137, + "loss": 1.2154, + "step": 1019 + }, + { + "epoch": 0.3275004013485311, + "grad_norm": 2.5375864505767822, + "learning_rate": 0.00019999780067029094, + "loss": 1.1588, + "step": 1020 + }, + { + "epoch": 0.3278214801733826, + "grad_norm": 2.0548741817474365, + "learning_rate": 0.00019999756307053948, + "loss": 1.17, + "step": 1021 + }, + { + "epoch": 0.3281425589982341, + "grad_norm": 2.063422441482544, + "learning_rate": 0.00019999731328638828, + "loss": 0.9767, + "step": 1022 + }, + { + "epoch": 0.3284636378230856, + "grad_norm": 1.4420465230941772, + "learning_rate": 0.0001999970513178678, + "loss": 0.9672, + "step": 1023 + }, + { + "epoch": 0.32878471664793707, + "grad_norm": 8.236002922058105, + "learning_rate": 0.00019999677716500992, + "loss": 1.4308, + "step": 1024 + }, + { + "epoch": 0.32910579547278856, + "grad_norm": 1.764096736907959, + "learning_rate": 0.0001999964908278481, + "loss": 0.889, + "step": 1025 + }, + { + "epoch": 0.32942687429764006, + "grad_norm": 1.7221022844314575, + "learning_rate": 0.00019999619230641713, + "loss": 1.0662, + "step": 1026 + }, + { + "epoch": 0.32974795312249155, + "grad_norm": 1.4062892198562622, + "learning_rate": 0.00019999588160075348, + "loss": 1.034, + "step": 1027 + }, + { + "epoch": 0.33006903194734305, + "grad_norm": 1.5284225940704346, + "learning_rate": 0.000199995558710895, + "loss": 1.0265, + "step": 1028 + }, + { + "epoch": 0.3303901107721946, + "grad_norm": 2.0174477100372314, + "learning_rate": 0.000199995223636881, + "loss": 1.0751, + "step": 1029 + }, + { + "epoch": 0.3307111895970461, + "grad_norm": 1.3949508666992188, + "learning_rate": 0.0001999948763787523, + "loss": 1.0296, + "step": 1030 + }, + { + "epoch": 0.3310322684218976, + "grad_norm": 1.5616528987884521, + "learning_rate": 0.00019999451693655123, + "loss": 1.0017, + "step": 1031 + }, + { + "epoch": 0.3313533472467491, + "grad_norm": 1.674304723739624, + "learning_rate": 0.00019999414531032158, + "loss": 1.1445, + "step": 1032 + }, + { + "epoch": 0.3316744260716006, + "grad_norm": 2.715172290802002, + "learning_rate": 0.00019999376150010866, + "loss": 1.1429, + "step": 1033 + }, + { + "epoch": 0.3319955048964521, + "grad_norm": 2.873642683029175, + "learning_rate": 0.0001999933655059592, + "loss": 0.6858, + "step": 1034 + }, + { + "epoch": 0.33231658372130357, + "grad_norm": 1.6501632928848267, + "learning_rate": 0.00019999295732792146, + "loss": 1.1858, + "step": 1035 + }, + { + "epoch": 0.33263766254615507, + "grad_norm": 1.3060718774795532, + "learning_rate": 0.00019999253696604522, + "loss": 1.0933, + "step": 1036 + }, + { + "epoch": 0.33295874137100656, + "grad_norm": 0.9465512633323669, + "learning_rate": 0.00019999210442038162, + "loss": 0.8341, + "step": 1037 + }, + { + "epoch": 0.33327982019585806, + "grad_norm": 1.2716128826141357, + "learning_rate": 0.00019999165969098343, + "loss": 0.8147, + "step": 1038 + }, + { + "epoch": 0.3336008990207096, + "grad_norm": 1.762526035308838, + "learning_rate": 0.00019999120277790477, + "loss": 1.0783, + "step": 1039 + }, + { + "epoch": 0.3339219778455611, + "grad_norm": 2.2095861434936523, + "learning_rate": 0.0001999907336812014, + "loss": 1.1156, + "step": 1040 + }, + { + "epoch": 0.3342430566704126, + "grad_norm": 1.668186068534851, + "learning_rate": 0.00019999025240093044, + "loss": 0.8137, + "step": 1041 + }, + { + "epoch": 0.3345641354952641, + "grad_norm": 1.4128243923187256, + "learning_rate": 0.0001999897589371505, + "loss": 1.0737, + "step": 1042 + }, + { + "epoch": 0.3348852143201156, + "grad_norm": 2.0269153118133545, + "learning_rate": 0.00019998925328992175, + "loss": 1.0204, + "step": 1043 + }, + { + "epoch": 0.3352062931449671, + "grad_norm": 1.2762573957443237, + "learning_rate": 0.0001999887354593058, + "loss": 1.1014, + "step": 1044 + }, + { + "epoch": 0.3355273719698186, + "grad_norm": 1.0873875617980957, + "learning_rate": 0.0001999882054453657, + "loss": 0.7875, + "step": 1045 + }, + { + "epoch": 0.3358484507946701, + "grad_norm": 1.1676608324050903, + "learning_rate": 0.00019998766324816607, + "loss": 0.9086, + "step": 1046 + }, + { + "epoch": 0.33616952961952157, + "grad_norm": 1.2658352851867676, + "learning_rate": 0.000199987108867773, + "loss": 0.7835, + "step": 1047 + }, + { + "epoch": 0.3364906084443731, + "grad_norm": 3.1894469261169434, + "learning_rate": 0.00019998654230425395, + "loss": 1.0001, + "step": 1048 + }, + { + "epoch": 0.3368116872692246, + "grad_norm": 2.7069225311279297, + "learning_rate": 0.00019998596355767805, + "loss": 0.6518, + "step": 1049 + }, + { + "epoch": 0.3371327660940761, + "grad_norm": 1.401463508605957, + "learning_rate": 0.00019998537262811577, + "loss": 0.6609, + "step": 1050 + }, + { + "epoch": 0.3374538449189276, + "grad_norm": 2.923952102661133, + "learning_rate": 0.00019998476951563915, + "loss": 1.4718, + "step": 1051 + }, + { + "epoch": 0.3377749237437791, + "grad_norm": 4.130305767059326, + "learning_rate": 0.00019998415422032163, + "loss": 1.109, + "step": 1052 + }, + { + "epoch": 0.3380960025686306, + "grad_norm": 2.697674036026001, + "learning_rate": 0.00019998352674223816, + "loss": 0.9685, + "step": 1053 + }, + { + "epoch": 0.3384170813934821, + "grad_norm": 13.59544849395752, + "learning_rate": 0.00019998288708146527, + "loss": 1.4453, + "step": 1054 + }, + { + "epoch": 0.3387381602183336, + "grad_norm": 2.960008144378662, + "learning_rate": 0.0001999822352380809, + "loss": 1.0084, + "step": 1055 + }, + { + "epoch": 0.3390592390431851, + "grad_norm": 1.8568581342697144, + "learning_rate": 0.00019998157121216444, + "loss": 0.8141, + "step": 1056 + }, + { + "epoch": 0.33938031786803663, + "grad_norm": 2.4931318759918213, + "learning_rate": 0.00019998089500379676, + "loss": 0.9077, + "step": 1057 + }, + { + "epoch": 0.3397013966928881, + "grad_norm": 2.592142105102539, + "learning_rate": 0.00019998020661306037, + "loss": 0.967, + "step": 1058 + }, + { + "epoch": 0.3400224755177396, + "grad_norm": 2.2816684246063232, + "learning_rate": 0.000199979506040039, + "loss": 1.0623, + "step": 1059 + }, + { + "epoch": 0.3403435543425911, + "grad_norm": 1.5964871644973755, + "learning_rate": 0.00019997879328481814, + "loss": 0.9404, + "step": 1060 + }, + { + "epoch": 0.3406646331674426, + "grad_norm": 1.8557053804397583, + "learning_rate": 0.00019997806834748456, + "loss": 0.9474, + "step": 1061 + }, + { + "epoch": 0.3409857119922941, + "grad_norm": 1.6931266784667969, + "learning_rate": 0.0001999773312281266, + "loss": 1.216, + "step": 1062 + }, + { + "epoch": 0.3413067908171456, + "grad_norm": 1.908123254776001, + "learning_rate": 0.00019997658192683412, + "loss": 1.0184, + "step": 1063 + }, + { + "epoch": 0.3416278696419971, + "grad_norm": 1.5717891454696655, + "learning_rate": 0.00019997582044369843, + "loss": 1.022, + "step": 1064 + }, + { + "epoch": 0.3419489484668486, + "grad_norm": 1.7411060333251953, + "learning_rate": 0.00019997504677881224, + "loss": 0.9467, + "step": 1065 + }, + { + "epoch": 0.3422700272917001, + "grad_norm": 3.3722310066223145, + "learning_rate": 0.00019997426093226986, + "loss": 0.9437, + "step": 1066 + }, + { + "epoch": 0.34259110611655164, + "grad_norm": 1.1751868724822998, + "learning_rate": 0.000199973462904167, + "loss": 0.9127, + "step": 1067 + }, + { + "epoch": 0.34291218494140313, + "grad_norm": 1.7528998851776123, + "learning_rate": 0.000199972652694601, + "loss": 0.9023, + "step": 1068 + }, + { + "epoch": 0.3432332637662546, + "grad_norm": 1.4955263137817383, + "learning_rate": 0.00019997183030367048, + "loss": 1.3134, + "step": 1069 + }, + { + "epoch": 0.3435543425911061, + "grad_norm": 2.127793550491333, + "learning_rate": 0.0001999709957314757, + "loss": 1.0524, + "step": 1070 + }, + { + "epoch": 0.3438754214159576, + "grad_norm": 1.1916999816894531, + "learning_rate": 0.00019997014897811833, + "loss": 0.8918, + "step": 1071 + }, + { + "epoch": 0.3441965002408091, + "grad_norm": 2.142920970916748, + "learning_rate": 0.00019996929004370152, + "loss": 0.9653, + "step": 1072 + }, + { + "epoch": 0.3445175790656606, + "grad_norm": 1.9465276002883911, + "learning_rate": 0.00019996841892833, + "loss": 1.1276, + "step": 1073 + }, + { + "epoch": 0.3448386578905121, + "grad_norm": 4.488142490386963, + "learning_rate": 0.00019996753563210985, + "loss": 1.2271, + "step": 1074 + }, + { + "epoch": 0.3451597367153636, + "grad_norm": 9.381378173828125, + "learning_rate": 0.00019996664015514871, + "loss": 0.894, + "step": 1075 + }, + { + "epoch": 0.34548081554021515, + "grad_norm": 1.1667776107788086, + "learning_rate": 0.00019996573249755572, + "loss": 0.8693, + "step": 1076 + }, + { + "epoch": 0.34580189436506664, + "grad_norm": 1.7447643280029297, + "learning_rate": 0.00019996481265944146, + "loss": 1.0464, + "step": 1077 + }, + { + "epoch": 0.34612297318991814, + "grad_norm": 1.9767264127731323, + "learning_rate": 0.000199963880640918, + "loss": 1.1109, + "step": 1078 + }, + { + "epoch": 0.34644405201476963, + "grad_norm": 1.110445499420166, + "learning_rate": 0.00019996293644209887, + "loss": 1.0815, + "step": 1079 + }, + { + "epoch": 0.3467651308396211, + "grad_norm": 2.0246994495391846, + "learning_rate": 0.0001999619800630992, + "loss": 1.1254, + "step": 1080 + }, + { + "epoch": 0.3470862096644726, + "grad_norm": 1.2259923219680786, + "learning_rate": 0.00019996101150403543, + "loss": 0.7593, + "step": 1081 + }, + { + "epoch": 0.3474072884893241, + "grad_norm": 1.174506425857544, + "learning_rate": 0.00019996003076502565, + "loss": 1.0263, + "step": 1082 + }, + { + "epoch": 0.3477283673141756, + "grad_norm": 1.8576600551605225, + "learning_rate": 0.00019995903784618936, + "loss": 1.0134, + "step": 1083 + }, + { + "epoch": 0.3480494461390271, + "grad_norm": 1.9805476665496826, + "learning_rate": 0.00019995803274764747, + "loss": 1.0945, + "step": 1084 + }, + { + "epoch": 0.34837052496387866, + "grad_norm": 1.4714640378952026, + "learning_rate": 0.0001999570154695225, + "loss": 1.0238, + "step": 1085 + }, + { + "epoch": 0.34869160378873015, + "grad_norm": 2.3097331523895264, + "learning_rate": 0.00019995598601193842, + "loss": 0.972, + "step": 1086 + }, + { + "epoch": 0.34901268261358165, + "grad_norm": 1.2940665483474731, + "learning_rate": 0.00019995494437502064, + "loss": 0.8954, + "step": 1087 + }, + { + "epoch": 0.34933376143843314, + "grad_norm": 1.3209530115127563, + "learning_rate": 0.00019995389055889605, + "loss": 0.9124, + "step": 1088 + }, + { + "epoch": 0.34965484026328464, + "grad_norm": 1.4666227102279663, + "learning_rate": 0.0001999528245636931, + "loss": 1.0218, + "step": 1089 + }, + { + "epoch": 0.34997591908813613, + "grad_norm": 2.331610918045044, + "learning_rate": 0.0001999517463895417, + "loss": 0.9959, + "step": 1090 + }, + { + "epoch": 0.35029699791298763, + "grad_norm": 1.3941978216171265, + "learning_rate": 0.00019995065603657316, + "loss": 0.9071, + "step": 1091 + }, + { + "epoch": 0.3506180767378391, + "grad_norm": 1.3337680101394653, + "learning_rate": 0.00019994955350492038, + "loss": 1.08, + "step": 1092 + }, + { + "epoch": 0.3509391555626906, + "grad_norm": 1.6207735538482666, + "learning_rate": 0.0001999484387947177, + "loss": 0.972, + "step": 1093 + }, + { + "epoch": 0.3512602343875421, + "grad_norm": 1.4743515253067017, + "learning_rate": 0.00019994731190610087, + "loss": 1.1447, + "step": 1094 + }, + { + "epoch": 0.35158131321239366, + "grad_norm": 1.3109478950500488, + "learning_rate": 0.0001999461728392073, + "loss": 0.9514, + "step": 1095 + }, + { + "epoch": 0.35190239203724516, + "grad_norm": 1.4319483041763306, + "learning_rate": 0.00019994502159417573, + "loss": 0.9688, + "step": 1096 + }, + { + "epoch": 0.35222347086209665, + "grad_norm": 1.4653232097625732, + "learning_rate": 0.00019994385817114646, + "loss": 0.9073, + "step": 1097 + }, + { + "epoch": 0.35254454968694815, + "grad_norm": 1.3482818603515625, + "learning_rate": 0.00019994268257026118, + "loss": 0.7425, + "step": 1098 + }, + { + "epoch": 0.35286562851179964, + "grad_norm": 2.7118563652038574, + "learning_rate": 0.00019994149479166324, + "loss": 0.8003, + "step": 1099 + }, + { + "epoch": 0.35318670733665114, + "grad_norm": 1.2395905256271362, + "learning_rate": 0.0001999402948354973, + "loss": 0.6656, + "step": 1100 + }, + { + "epoch": 0.35350778616150264, + "grad_norm": 2.8349075317382812, + "learning_rate": 0.0001999390827019096, + "loss": 1.369, + "step": 1101 + }, + { + "epoch": 0.35382886498635413, + "grad_norm": 1.6994882822036743, + "learning_rate": 0.0001999378583910478, + "loss": 1.4933, + "step": 1102 + }, + { + "epoch": 0.3541499438112056, + "grad_norm": 2.044842481613159, + "learning_rate": 0.0001999366219030611, + "loss": 1.0908, + "step": 1103 + }, + { + "epoch": 0.3544710226360572, + "grad_norm": 2.008758306503296, + "learning_rate": 0.00019993537323810014, + "loss": 0.963, + "step": 1104 + }, + { + "epoch": 0.35479210146090867, + "grad_norm": 2.8811473846435547, + "learning_rate": 0.0001999341123963171, + "loss": 0.8424, + "step": 1105 + }, + { + "epoch": 0.35511318028576017, + "grad_norm": 1.9336885213851929, + "learning_rate": 0.00019993283937786563, + "loss": 0.9013, + "step": 1106 + }, + { + "epoch": 0.35543425911061166, + "grad_norm": 1.6170105934143066, + "learning_rate": 0.0001999315541829008, + "loss": 0.7265, + "step": 1107 + }, + { + "epoch": 0.35575533793546316, + "grad_norm": 2.101649284362793, + "learning_rate": 0.0001999302568115792, + "loss": 1.1258, + "step": 1108 + }, + { + "epoch": 0.35607641676031465, + "grad_norm": 2.104947328567505, + "learning_rate": 0.00019992894726405893, + "loss": 0.9501, + "step": 1109 + }, + { + "epoch": 0.35639749558516615, + "grad_norm": 1.6322426795959473, + "learning_rate": 0.00019992762554049955, + "loss": 1.0367, + "step": 1110 + }, + { + "epoch": 0.35671857441001764, + "grad_norm": 1.6271668672561646, + "learning_rate": 0.0001999262916410621, + "loss": 0.893, + "step": 1111 + }, + { + "epoch": 0.35703965323486914, + "grad_norm": 1.3786333799362183, + "learning_rate": 0.00019992494556590916, + "loss": 0.9951, + "step": 1112 + }, + { + "epoch": 0.3573607320597207, + "grad_norm": 1.2673497200012207, + "learning_rate": 0.00019992358731520468, + "loss": 0.9283, + "step": 1113 + }, + { + "epoch": 0.3576818108845722, + "grad_norm": 1.338150143623352, + "learning_rate": 0.0001999222168891142, + "loss": 0.9409, + "step": 1114 + }, + { + "epoch": 0.3580028897094237, + "grad_norm": 1.344650149345398, + "learning_rate": 0.0001999208342878047, + "loss": 1.0276, + "step": 1115 + }, + { + "epoch": 0.3583239685342752, + "grad_norm": 1.805612325668335, + "learning_rate": 0.0001999194395114446, + "loss": 1.1324, + "step": 1116 + }, + { + "epoch": 0.35864504735912667, + "grad_norm": 21.904685974121094, + "learning_rate": 0.00019991803256020393, + "loss": 1.1808, + "step": 1117 + }, + { + "epoch": 0.35896612618397816, + "grad_norm": 1.6144940853118896, + "learning_rate": 0.000199916613434254, + "loss": 1.1416, + "step": 1118 + }, + { + "epoch": 0.35928720500882966, + "grad_norm": 2.7180378437042236, + "learning_rate": 0.00019991518213376787, + "loss": 1.0546, + "step": 1119 + }, + { + "epoch": 0.35960828383368115, + "grad_norm": 1.8319542407989502, + "learning_rate": 0.00019991373865891986, + "loss": 0.991, + "step": 1120 + }, + { + "epoch": 0.35992936265853265, + "grad_norm": 2.3465638160705566, + "learning_rate": 0.00019991228300988585, + "loss": 1.0318, + "step": 1121 + }, + { + "epoch": 0.3602504414833842, + "grad_norm": 2.1191277503967285, + "learning_rate": 0.00019991081518684321, + "loss": 1.1896, + "step": 1122 + }, + { + "epoch": 0.3605715203082357, + "grad_norm": 7.694565773010254, + "learning_rate": 0.00019990933518997084, + "loss": 1.3582, + "step": 1123 + }, + { + "epoch": 0.3608925991330872, + "grad_norm": 1.5988242626190186, + "learning_rate": 0.00019990784301944902, + "loss": 1.225, + "step": 1124 + }, + { + "epoch": 0.3612136779579387, + "grad_norm": 1.4618570804595947, + "learning_rate": 0.00019990633867545955, + "loss": 1.152, + "step": 1125 + }, + { + "epoch": 0.3615347567827902, + "grad_norm": 1.4518593549728394, + "learning_rate": 0.0001999048221581858, + "loss": 0.7903, + "step": 1126 + }, + { + "epoch": 0.3618558356076417, + "grad_norm": 2.108856439590454, + "learning_rate": 0.0001999032934678125, + "loss": 0.9205, + "step": 1127 + }, + { + "epoch": 0.36217691443249317, + "grad_norm": 1.7413921356201172, + "learning_rate": 0.0001999017526045259, + "loss": 0.9283, + "step": 1128 + }, + { + "epoch": 0.36249799325734466, + "grad_norm": 1.6088074445724487, + "learning_rate": 0.00019990019956851382, + "loss": 1.0622, + "step": 1129 + }, + { + "epoch": 0.36281907208219616, + "grad_norm": 2.035529375076294, + "learning_rate": 0.00019989863435996544, + "loss": 1.0937, + "step": 1130 + }, + { + "epoch": 0.36314015090704765, + "grad_norm": 1.6727184057235718, + "learning_rate": 0.00019989705697907149, + "loss": 1.133, + "step": 1131 + }, + { + "epoch": 0.3634612297318992, + "grad_norm": 4.085322856903076, + "learning_rate": 0.00019989546742602414, + "loss": 0.8277, + "step": 1132 + }, + { + "epoch": 0.3637823085567507, + "grad_norm": 1.4142351150512695, + "learning_rate": 0.00019989386570101714, + "loss": 1.0965, + "step": 1133 + }, + { + "epoch": 0.3641033873816022, + "grad_norm": 2.0783348083496094, + "learning_rate": 0.0001998922518042456, + "loss": 1.4158, + "step": 1134 + }, + { + "epoch": 0.3644244662064537, + "grad_norm": 1.3447226285934448, + "learning_rate": 0.00019989062573590616, + "loss": 0.8552, + "step": 1135 + }, + { + "epoch": 0.3647455450313052, + "grad_norm": 1.5264992713928223, + "learning_rate": 0.00019988898749619702, + "loss": 0.9846, + "step": 1136 + }, + { + "epoch": 0.3650666238561567, + "grad_norm": 1.8115450143814087, + "learning_rate": 0.0001998873370853177, + "loss": 1.0604, + "step": 1137 + }, + { + "epoch": 0.3653877026810082, + "grad_norm": 2.269794225692749, + "learning_rate": 0.00019988567450346937, + "loss": 0.8845, + "step": 1138 + }, + { + "epoch": 0.36570878150585967, + "grad_norm": 1.198961853981018, + "learning_rate": 0.0001998839997508546, + "loss": 0.9815, + "step": 1139 + }, + { + "epoch": 0.36602986033071117, + "grad_norm": 2.3479223251342773, + "learning_rate": 0.00019988231282767744, + "loss": 0.9144, + "step": 1140 + }, + { + "epoch": 0.3663509391555627, + "grad_norm": 1.9824615716934204, + "learning_rate": 0.0001998806137341434, + "loss": 1.0468, + "step": 1141 + }, + { + "epoch": 0.3666720179804142, + "grad_norm": 1.780985713005066, + "learning_rate": 0.00019987890247045957, + "loss": 1.0552, + "step": 1142 + }, + { + "epoch": 0.3669930968052657, + "grad_norm": 1.192861557006836, + "learning_rate": 0.00019987717903683448, + "loss": 0.8069, + "step": 1143 + }, + { + "epoch": 0.3673141756301172, + "grad_norm": 2.568082094192505, + "learning_rate": 0.00019987544343347803, + "loss": 0.9455, + "step": 1144 + }, + { + "epoch": 0.3676352544549687, + "grad_norm": 1.151548981666565, + "learning_rate": 0.00019987369566060176, + "loss": 0.6518, + "step": 1145 + }, + { + "epoch": 0.3679563332798202, + "grad_norm": 2.208691358566284, + "learning_rate": 0.0001998719357184186, + "loss": 0.7998, + "step": 1146 + }, + { + "epoch": 0.3682774121046717, + "grad_norm": 2.1074390411376953, + "learning_rate": 0.00019987016360714307, + "loss": 1.0609, + "step": 1147 + }, + { + "epoch": 0.3685984909295232, + "grad_norm": 1.2993351221084595, + "learning_rate": 0.00019986837932699103, + "loss": 0.7665, + "step": 1148 + }, + { + "epoch": 0.3689195697543747, + "grad_norm": 0.9701393246650696, + "learning_rate": 0.00019986658287817987, + "loss": 0.5932, + "step": 1149 + }, + { + "epoch": 0.3692406485792262, + "grad_norm": 6.661820411682129, + "learning_rate": 0.00019986477426092855, + "loss": 0.6992, + "step": 1150 + }, + { + "epoch": 0.3695617274040777, + "grad_norm": 3.1357054710388184, + "learning_rate": 0.0001998629534754574, + "loss": 1.5801, + "step": 1151 + }, + { + "epoch": 0.3698828062289292, + "grad_norm": 3.9719338417053223, + "learning_rate": 0.0001998611205219883, + "loss": 1.5882, + "step": 1152 + }, + { + "epoch": 0.3702038850537807, + "grad_norm": 3.1035382747650146, + "learning_rate": 0.00019985927540074454, + "loss": 1.0743, + "step": 1153 + }, + { + "epoch": 0.3705249638786322, + "grad_norm": 1.9447718858718872, + "learning_rate": 0.00019985741811195097, + "loss": 0.8629, + "step": 1154 + }, + { + "epoch": 0.3708460427034837, + "grad_norm": 1.993913173675537, + "learning_rate": 0.00019985554865583393, + "loss": 0.8233, + "step": 1155 + }, + { + "epoch": 0.3711671215283352, + "grad_norm": 1.8630907535552979, + "learning_rate": 0.0001998536670326212, + "loss": 0.8642, + "step": 1156 + }, + { + "epoch": 0.3714882003531867, + "grad_norm": 1.8467717170715332, + "learning_rate": 0.000199851773242542, + "loss": 0.8528, + "step": 1157 + }, + { + "epoch": 0.3718092791780382, + "grad_norm": 2.4014041423797607, + "learning_rate": 0.0001998498672858271, + "loss": 1.0696, + "step": 1158 + }, + { + "epoch": 0.3721303580028897, + "grad_norm": 1.915739893913269, + "learning_rate": 0.00019984794916270876, + "loss": 1.0595, + "step": 1159 + }, + { + "epoch": 0.37245143682774123, + "grad_norm": 1.4160325527191162, + "learning_rate": 0.00019984601887342073, + "loss": 1.1056, + "step": 1160 + }, + { + "epoch": 0.37277251565259273, + "grad_norm": 2.503619909286499, + "learning_rate": 0.00019984407641819812, + "loss": 0.6714, + "step": 1161 + }, + { + "epoch": 0.3730935944774442, + "grad_norm": 1.4751476049423218, + "learning_rate": 0.00019984212179727766, + "loss": 0.9467, + "step": 1162 + }, + { + "epoch": 0.3734146733022957, + "grad_norm": 1.4288772344589233, + "learning_rate": 0.00019984015501089752, + "loss": 1.0041, + "step": 1163 + }, + { + "epoch": 0.3737357521271472, + "grad_norm": 1.389568567276001, + "learning_rate": 0.00019983817605929733, + "loss": 0.9213, + "step": 1164 + }, + { + "epoch": 0.3740568309519987, + "grad_norm": 1.2409948110580444, + "learning_rate": 0.00019983618494271826, + "loss": 0.9687, + "step": 1165 + }, + { + "epoch": 0.3743779097768502, + "grad_norm": 1.3233766555786133, + "learning_rate": 0.00019983418166140285, + "loss": 0.9057, + "step": 1166 + }, + { + "epoch": 0.3746989886017017, + "grad_norm": 1.7424918413162231, + "learning_rate": 0.00019983216621559524, + "loss": 1.3071, + "step": 1167 + }, + { + "epoch": 0.3750200674265532, + "grad_norm": 2.5900015830993652, + "learning_rate": 0.00019983013860554101, + "loss": 0.9437, + "step": 1168 + }, + { + "epoch": 0.37534114625140474, + "grad_norm": 1.2561944723129272, + "learning_rate": 0.00019982809883148722, + "loss": 1.0481, + "step": 1169 + }, + { + "epoch": 0.37566222507625624, + "grad_norm": 2.1015701293945312, + "learning_rate": 0.00019982604689368239, + "loss": 1.1775, + "step": 1170 + }, + { + "epoch": 0.37598330390110773, + "grad_norm": 2.039940357208252, + "learning_rate": 0.00019982398279237655, + "loss": 1.2701, + "step": 1171 + }, + { + "epoch": 0.37630438272595923, + "grad_norm": 1.330772042274475, + "learning_rate": 0.0001998219065278212, + "loss": 0.9725, + "step": 1172 + }, + { + "epoch": 0.3766254615508107, + "grad_norm": 2.106564998626709, + "learning_rate": 0.00019981981810026934, + "loss": 1.0919, + "step": 1173 + }, + { + "epoch": 0.3769465403756622, + "grad_norm": 1.5898199081420898, + "learning_rate": 0.0001998177175099754, + "loss": 0.9706, + "step": 1174 + }, + { + "epoch": 0.3772676192005137, + "grad_norm": 1.8556996583938599, + "learning_rate": 0.00019981560475719538, + "loss": 1.025, + "step": 1175 + }, + { + "epoch": 0.3775886980253652, + "grad_norm": 1.008709192276001, + "learning_rate": 0.0001998134798421867, + "loss": 0.8611, + "step": 1176 + }, + { + "epoch": 0.3779097768502167, + "grad_norm": 2.131082057952881, + "learning_rate": 0.00019981134276520827, + "loss": 1.2977, + "step": 1177 + }, + { + "epoch": 0.37823085567506826, + "grad_norm": 1.7559832334518433, + "learning_rate": 0.00019980919352652048, + "loss": 0.9628, + "step": 1178 + }, + { + "epoch": 0.37855193449991975, + "grad_norm": 1.290353536605835, + "learning_rate": 0.00019980703212638522, + "loss": 1.0724, + "step": 1179 + }, + { + "epoch": 0.37887301332477125, + "grad_norm": 1.12660813331604, + "learning_rate": 0.00019980485856506582, + "loss": 0.9982, + "step": 1180 + }, + { + "epoch": 0.37919409214962274, + "grad_norm": 1.5011529922485352, + "learning_rate": 0.00019980267284282717, + "loss": 0.9109, + "step": 1181 + }, + { + "epoch": 0.37951517097447424, + "grad_norm": 1.762739896774292, + "learning_rate": 0.00019980047495993554, + "loss": 0.9277, + "step": 1182 + }, + { + "epoch": 0.37983624979932573, + "grad_norm": 1.3952839374542236, + "learning_rate": 0.00019979826491665881, + "loss": 0.7895, + "step": 1183 + }, + { + "epoch": 0.3801573286241772, + "grad_norm": 1.6314538717269897, + "learning_rate": 0.00019979604271326616, + "loss": 0.9361, + "step": 1184 + }, + { + "epoch": 0.3804784074490287, + "grad_norm": 2.3080692291259766, + "learning_rate": 0.00019979380835002846, + "loss": 1.1475, + "step": 1185 + }, + { + "epoch": 0.3807994862738802, + "grad_norm": 2.15315842628479, + "learning_rate": 0.0001997915618272179, + "loss": 1.1916, + "step": 1186 + }, + { + "epoch": 0.3811205650987317, + "grad_norm": 1.3747003078460693, + "learning_rate": 0.00019978930314510824, + "loss": 0.8076, + "step": 1187 + }, + { + "epoch": 0.38144164392358326, + "grad_norm": 1.4667130708694458, + "learning_rate": 0.0001997870323039747, + "loss": 0.95, + "step": 1188 + }, + { + "epoch": 0.38176272274843476, + "grad_norm": 1.726277470588684, + "learning_rate": 0.00019978474930409395, + "loss": 0.9735, + "step": 1189 + }, + { + "epoch": 0.38208380157328625, + "grad_norm": 2.355656385421753, + "learning_rate": 0.00019978245414574417, + "loss": 1.0268, + "step": 1190 + }, + { + "epoch": 0.38240488039813775, + "grad_norm": 1.840656042098999, + "learning_rate": 0.000199780146829205, + "loss": 0.9747, + "step": 1191 + }, + { + "epoch": 0.38272595922298924, + "grad_norm": 1.4547884464263916, + "learning_rate": 0.00019977782735475764, + "loss": 1.0729, + "step": 1192 + }, + { + "epoch": 0.38304703804784074, + "grad_norm": 1.2916489839553833, + "learning_rate": 0.00019977549572268468, + "loss": 0.875, + "step": 1193 + }, + { + "epoch": 0.38336811687269223, + "grad_norm": 1.5631312131881714, + "learning_rate": 0.00019977315193327018, + "loss": 0.8071, + "step": 1194 + }, + { + "epoch": 0.38368919569754373, + "grad_norm": 1.6755515336990356, + "learning_rate": 0.00019977079598679977, + "loss": 0.9683, + "step": 1195 + }, + { + "epoch": 0.3840102745223952, + "grad_norm": 1.1934847831726074, + "learning_rate": 0.00019976842788356055, + "loss": 0.666, + "step": 1196 + }, + { + "epoch": 0.3843313533472468, + "grad_norm": 1.6635997295379639, + "learning_rate": 0.000199766047623841, + "loss": 0.8521, + "step": 1197 + }, + { + "epoch": 0.38465243217209827, + "grad_norm": 1.6572028398513794, + "learning_rate": 0.00019976365520793114, + "loss": 0.7134, + "step": 1198 + }, + { + "epoch": 0.38497351099694976, + "grad_norm": 1.2580087184906006, + "learning_rate": 0.00019976125063612252, + "loss": 0.6517, + "step": 1199 + }, + { + "epoch": 0.38529458982180126, + "grad_norm": 1.3600796461105347, + "learning_rate": 0.00019975883390870817, + "loss": 0.5476, + "step": 1200 + }, + { + "epoch": 0.38561566864665275, + "grad_norm": 2.1080329418182373, + "learning_rate": 0.00019975640502598244, + "loss": 1.4574, + "step": 1201 + }, + { + "epoch": 0.38593674747150425, + "grad_norm": 1.5729361772537231, + "learning_rate": 0.0001997539639882414, + "loss": 1.3152, + "step": 1202 + }, + { + "epoch": 0.38625782629635574, + "grad_norm": 1.7094898223876953, + "learning_rate": 0.00019975151079578237, + "loss": 1.1812, + "step": 1203 + }, + { + "epoch": 0.38657890512120724, + "grad_norm": 1.5421949625015259, + "learning_rate": 0.0001997490454489044, + "loss": 1.023, + "step": 1204 + }, + { + "epoch": 0.38689998394605873, + "grad_norm": 2.4401865005493164, + "learning_rate": 0.00019974656794790775, + "loss": 0.866, + "step": 1205 + }, + { + "epoch": 0.3872210627709103, + "grad_norm": 1.9785239696502686, + "learning_rate": 0.0001997440782930944, + "loss": 0.8572, + "step": 1206 + }, + { + "epoch": 0.3875421415957618, + "grad_norm": 1.7715507745742798, + "learning_rate": 0.00019974157648476766, + "loss": 0.7572, + "step": 1207 + }, + { + "epoch": 0.3878632204206133, + "grad_norm": 1.8642398118972778, + "learning_rate": 0.00019973906252323238, + "loss": 0.8058, + "step": 1208 + }, + { + "epoch": 0.38818429924546477, + "grad_norm": 1.5486159324645996, + "learning_rate": 0.00019973653640879485, + "loss": 1.0713, + "step": 1209 + }, + { + "epoch": 0.38850537807031627, + "grad_norm": 2.690429449081421, + "learning_rate": 0.00019973399814176293, + "loss": 1.024, + "step": 1210 + }, + { + "epoch": 0.38882645689516776, + "grad_norm": 1.7301387786865234, + "learning_rate": 0.00019973144772244582, + "loss": 0.9245, + "step": 1211 + }, + { + "epoch": 0.38914753572001926, + "grad_norm": 1.1060234308242798, + "learning_rate": 0.00019972888515115434, + "loss": 0.9201, + "step": 1212 + }, + { + "epoch": 0.38946861454487075, + "grad_norm": 2.06071138381958, + "learning_rate": 0.0001997263104282007, + "loss": 0.8777, + "step": 1213 + }, + { + "epoch": 0.38978969336972225, + "grad_norm": 1.2055797576904297, + "learning_rate": 0.00019972372355389867, + "loss": 0.9946, + "step": 1214 + }, + { + "epoch": 0.39011077219457374, + "grad_norm": 1.2430628538131714, + "learning_rate": 0.00019972112452856339, + "loss": 0.6915, + "step": 1215 + }, + { + "epoch": 0.3904318510194253, + "grad_norm": 1.3444316387176514, + "learning_rate": 0.00019971851335251158, + "loss": 0.9794, + "step": 1216 + }, + { + "epoch": 0.3907529298442768, + "grad_norm": 1.4662611484527588, + "learning_rate": 0.0001997158900260614, + "loss": 1.0779, + "step": 1217 + }, + { + "epoch": 0.3910740086691283, + "grad_norm": 1.2781003713607788, + "learning_rate": 0.00019971325454953248, + "loss": 0.9051, + "step": 1218 + }, + { + "epoch": 0.3913950874939798, + "grad_norm": 1.5118160247802734, + "learning_rate": 0.00019971060692324598, + "loss": 1.3483, + "step": 1219 + }, + { + "epoch": 0.39171616631883127, + "grad_norm": 3.3005893230438232, + "learning_rate": 0.00019970794714752445, + "loss": 1.0498, + "step": 1220 + }, + { + "epoch": 0.39203724514368277, + "grad_norm": 2.5156784057617188, + "learning_rate": 0.00019970527522269205, + "loss": 1.0823, + "step": 1221 + }, + { + "epoch": 0.39235832396853426, + "grad_norm": 2.2197351455688477, + "learning_rate": 0.00019970259114907425, + "loss": 0.9576, + "step": 1222 + }, + { + "epoch": 0.39267940279338576, + "grad_norm": 1.4013001918792725, + "learning_rate": 0.0001996998949269982, + "loss": 1.0194, + "step": 1223 + }, + { + "epoch": 0.39300048161823725, + "grad_norm": 1.1013851165771484, + "learning_rate": 0.00019969718655679232, + "loss": 0.9049, + "step": 1224 + }, + { + "epoch": 0.3933215604430888, + "grad_norm": 1.4185377359390259, + "learning_rate": 0.00019969446603878673, + "loss": 0.8871, + "step": 1225 + }, + { + "epoch": 0.3936426392679403, + "grad_norm": 1.4201257228851318, + "learning_rate": 0.0001996917333733128, + "loss": 0.8107, + "step": 1226 + }, + { + "epoch": 0.3939637180927918, + "grad_norm": 1.788902997970581, + "learning_rate": 0.0001996889885607036, + "loss": 1.1504, + "step": 1227 + }, + { + "epoch": 0.3942847969176433, + "grad_norm": 2.3521134853363037, + "learning_rate": 0.0001996862316012935, + "loss": 1.2274, + "step": 1228 + }, + { + "epoch": 0.3946058757424948, + "grad_norm": 2.3193109035491943, + "learning_rate": 0.00019968346249541846, + "loss": 1.2382, + "step": 1229 + }, + { + "epoch": 0.3949269545673463, + "grad_norm": 1.1716614961624146, + "learning_rate": 0.0001996806812434159, + "loss": 0.8529, + "step": 1230 + }, + { + "epoch": 0.3952480333921978, + "grad_norm": 1.5683951377868652, + "learning_rate": 0.00019967788784562473, + "loss": 1.1172, + "step": 1231 + }, + { + "epoch": 0.39556911221704927, + "grad_norm": 1.3435286283493042, + "learning_rate": 0.00019967508230238522, + "loss": 1.1469, + "step": 1232 + }, + { + "epoch": 0.39589019104190076, + "grad_norm": 2.070988893508911, + "learning_rate": 0.00019967226461403933, + "loss": 1.1132, + "step": 1233 + }, + { + "epoch": 0.3962112698667523, + "grad_norm": 1.4258763790130615, + "learning_rate": 0.0001996694347809303, + "loss": 1.0245, + "step": 1234 + }, + { + "epoch": 0.3965323486916038, + "grad_norm": 1.191821813583374, + "learning_rate": 0.00019966659280340297, + "loss": 1.0535, + "step": 1235 + }, + { + "epoch": 0.3968534275164553, + "grad_norm": 1.3982123136520386, + "learning_rate": 0.00019966373868180365, + "loss": 0.8893, + "step": 1236 + }, + { + "epoch": 0.3971745063413068, + "grad_norm": 1.734355092048645, + "learning_rate": 0.0001996608724164801, + "loss": 1.1121, + "step": 1237 + }, + { + "epoch": 0.3974955851661583, + "grad_norm": 1.516452670097351, + "learning_rate": 0.00019965799400778152, + "loss": 1.0574, + "step": 1238 + }, + { + "epoch": 0.3978166639910098, + "grad_norm": 2.239095449447632, + "learning_rate": 0.00019965510345605866, + "loss": 1.0554, + "step": 1239 + }, + { + "epoch": 0.3981377428158613, + "grad_norm": 1.5699093341827393, + "learning_rate": 0.00019965220076166376, + "loss": 1.0202, + "step": 1240 + }, + { + "epoch": 0.3984588216407128, + "grad_norm": 1.7507424354553223, + "learning_rate": 0.00019964928592495045, + "loss": 0.9151, + "step": 1241 + }, + { + "epoch": 0.3987799004655643, + "grad_norm": 2.988863229751587, + "learning_rate": 0.0001996463589462739, + "loss": 0.8941, + "step": 1242 + }, + { + "epoch": 0.39910097929041577, + "grad_norm": 2.8474647998809814, + "learning_rate": 0.00019964341982599078, + "loss": 0.9185, + "step": 1243 + }, + { + "epoch": 0.3994220581152673, + "grad_norm": 1.7581831216812134, + "learning_rate": 0.00019964046856445924, + "loss": 0.9486, + "step": 1244 + }, + { + "epoch": 0.3997431369401188, + "grad_norm": 1.2665711641311646, + "learning_rate": 0.00019963750516203884, + "loss": 0.816, + "step": 1245 + }, + { + "epoch": 0.4000642157649703, + "grad_norm": 1.6899489164352417, + "learning_rate": 0.00019963452961909063, + "loss": 0.692, + "step": 1246 + }, + { + "epoch": 0.4003852945898218, + "grad_norm": 1.0340851545333862, + "learning_rate": 0.00019963154193597727, + "loss": 0.7222, + "step": 1247 + }, + { + "epoch": 0.4007063734146733, + "grad_norm": 2.3070790767669678, + "learning_rate": 0.00019962854211306267, + "loss": 0.9091, + "step": 1248 + }, + { + "epoch": 0.4010274522395248, + "grad_norm": 1.4314565658569336, + "learning_rate": 0.0001996255301507125, + "loss": 0.6755, + "step": 1249 + }, + { + "epoch": 0.4013485310643763, + "grad_norm": 0.9915478825569153, + "learning_rate": 0.0001996225060492936, + "loss": 0.6343, + "step": 1250 + }, + { + "epoch": 0.4016696098892278, + "grad_norm": 1.879836916923523, + "learning_rate": 0.00019961946980917456, + "loss": 1.3669, + "step": 1251 + }, + { + "epoch": 0.4019906887140793, + "grad_norm": 2.172273874282837, + "learning_rate": 0.00019961642143072529, + "loss": 1.2117, + "step": 1252 + }, + { + "epoch": 0.40231176753893083, + "grad_norm": 2.1239922046661377, + "learning_rate": 0.00019961336091431727, + "loss": 1.0067, + "step": 1253 + }, + { + "epoch": 0.4026328463637823, + "grad_norm": 1.5531266927719116, + "learning_rate": 0.00019961028826032332, + "loss": 0.7723, + "step": 1254 + }, + { + "epoch": 0.4029539251886338, + "grad_norm": 1.8615666627883911, + "learning_rate": 0.00019960720346911797, + "loss": 0.8307, + "step": 1255 + }, + { + "epoch": 0.4032750040134853, + "grad_norm": 1.9920127391815186, + "learning_rate": 0.00019960410654107697, + "loss": 0.7862, + "step": 1256 + }, + { + "epoch": 0.4035960828383368, + "grad_norm": 1.7379018068313599, + "learning_rate": 0.00019960099747657774, + "loss": 0.8312, + "step": 1257 + }, + { + "epoch": 0.4039171616631883, + "grad_norm": 2.0180602073669434, + "learning_rate": 0.00019959787627599906, + "loss": 0.8846, + "step": 1258 + }, + { + "epoch": 0.4042382404880398, + "grad_norm": 1.567842960357666, + "learning_rate": 0.00019959474293972129, + "loss": 1.0824, + "step": 1259 + }, + { + "epoch": 0.4045593193128913, + "grad_norm": 2.6913554668426514, + "learning_rate": 0.0001995915974681262, + "loss": 1.0713, + "step": 1260 + }, + { + "epoch": 0.4048803981377428, + "grad_norm": 2.3127224445343018, + "learning_rate": 0.00019958843986159704, + "loss": 0.8672, + "step": 1261 + }, + { + "epoch": 0.40520147696259434, + "grad_norm": 2.383711338043213, + "learning_rate": 0.00019958527012051857, + "loss": 0.9304, + "step": 1262 + }, + { + "epoch": 0.40552255578744584, + "grad_norm": 1.3288320302963257, + "learning_rate": 0.000199582088245277, + "loss": 1.0369, + "step": 1263 + }, + { + "epoch": 0.40584363461229733, + "grad_norm": 1.3985912799835205, + "learning_rate": 0.00019957889423626005, + "loss": 1.0793, + "step": 1264 + }, + { + "epoch": 0.4061647134371488, + "grad_norm": 1.3505330085754395, + "learning_rate": 0.00019957568809385694, + "loss": 0.8989, + "step": 1265 + }, + { + "epoch": 0.4064857922620003, + "grad_norm": 1.663252353668213, + "learning_rate": 0.00019957246981845822, + "loss": 1.0797, + "step": 1266 + }, + { + "epoch": 0.4068068710868518, + "grad_norm": 1.4397826194763184, + "learning_rate": 0.0001995692394104561, + "loss": 0.8712, + "step": 1267 + }, + { + "epoch": 0.4071279499117033, + "grad_norm": 1.1795347929000854, + "learning_rate": 0.0001995659968702442, + "loss": 1.1528, + "step": 1268 + }, + { + "epoch": 0.4074490287365548, + "grad_norm": 2.6851541996002197, + "learning_rate": 0.00019956274219821757, + "loss": 0.9325, + "step": 1269 + }, + { + "epoch": 0.4077701075614063, + "grad_norm": 1.8018854856491089, + "learning_rate": 0.00019955947539477284, + "loss": 1.1969, + "step": 1270 + }, + { + "epoch": 0.40809118638625785, + "grad_norm": 1.442039966583252, + "learning_rate": 0.00019955619646030802, + "loss": 0.9039, + "step": 1271 + }, + { + "epoch": 0.40841226521110935, + "grad_norm": 1.8041740655899048, + "learning_rate": 0.0001995529053952226, + "loss": 1.0132, + "step": 1272 + }, + { + "epoch": 0.40873334403596084, + "grad_norm": 1.6970267295837402, + "learning_rate": 0.0001995496021999177, + "loss": 1.2557, + "step": 1273 + }, + { + "epoch": 0.40905442286081234, + "grad_norm": 2.2318642139434814, + "learning_rate": 0.00019954628687479572, + "loss": 0.9386, + "step": 1274 + }, + { + "epoch": 0.40937550168566383, + "grad_norm": 1.3459020853042603, + "learning_rate": 0.00019954295942026064, + "loss": 0.979, + "step": 1275 + }, + { + "epoch": 0.40969658051051533, + "grad_norm": 1.2683597803115845, + "learning_rate": 0.00019953961983671788, + "loss": 0.8684, + "step": 1276 + }, + { + "epoch": 0.4100176593353668, + "grad_norm": 2.0490283966064453, + "learning_rate": 0.0001995362681245744, + "loss": 1.1168, + "step": 1277 + }, + { + "epoch": 0.4103387381602183, + "grad_norm": 1.9572994709014893, + "learning_rate": 0.00019953290428423857, + "loss": 0.8459, + "step": 1278 + }, + { + "epoch": 0.4106598169850698, + "grad_norm": 1.6033639907836914, + "learning_rate": 0.00019952952831612026, + "loss": 1.1142, + "step": 1279 + }, + { + "epoch": 0.4109808958099213, + "grad_norm": 1.5231640338897705, + "learning_rate": 0.00019952614022063084, + "loss": 1.1958, + "step": 1280 + }, + { + "epoch": 0.41130197463477286, + "grad_norm": 1.5248790979385376, + "learning_rate": 0.0001995227399981831, + "loss": 1.2117, + "step": 1281 + }, + { + "epoch": 0.41162305345962436, + "grad_norm": 1.1884125471115112, + "learning_rate": 0.00019951932764919144, + "loss": 0.8868, + "step": 1282 + }, + { + "epoch": 0.41194413228447585, + "grad_norm": 1.509526014328003, + "learning_rate": 0.00019951590317407152, + "loss": 1.0724, + "step": 1283 + }, + { + "epoch": 0.41226521110932735, + "grad_norm": 1.9606530666351318, + "learning_rate": 0.0001995124665732407, + "loss": 1.1933, + "step": 1284 + }, + { + "epoch": 0.41258628993417884, + "grad_norm": 1.8705965280532837, + "learning_rate": 0.00019950901784711764, + "loss": 1.1115, + "step": 1285 + }, + { + "epoch": 0.41290736875903034, + "grad_norm": 2.0867602825164795, + "learning_rate": 0.00019950555699612267, + "loss": 1.2534, + "step": 1286 + }, + { + "epoch": 0.41322844758388183, + "grad_norm": 1.6749221086502075, + "learning_rate": 0.00019950208402067733, + "loss": 1.1313, + "step": 1287 + }, + { + "epoch": 0.4135495264087333, + "grad_norm": 1.0996752977371216, + "learning_rate": 0.00019949859892120491, + "loss": 0.9663, + "step": 1288 + }, + { + "epoch": 0.4138706052335848, + "grad_norm": 1.0107436180114746, + "learning_rate": 0.00019949510169813003, + "loss": 0.8468, + "step": 1289 + }, + { + "epoch": 0.41419168405843637, + "grad_norm": 1.7090445756912231, + "learning_rate": 0.0001994915923518788, + "loss": 1.1206, + "step": 1290 + }, + { + "epoch": 0.41451276288328787, + "grad_norm": 1.2257174253463745, + "learning_rate": 0.00019948807088287883, + "loss": 0.9886, + "step": 1291 + }, + { + "epoch": 0.41483384170813936, + "grad_norm": 1.3304343223571777, + "learning_rate": 0.00019948453729155922, + "loss": 0.9679, + "step": 1292 + }, + { + "epoch": 0.41515492053299086, + "grad_norm": 3.28393816947937, + "learning_rate": 0.00019948099157835047, + "loss": 0.9752, + "step": 1293 + }, + { + "epoch": 0.41547599935784235, + "grad_norm": 1.4946495294570923, + "learning_rate": 0.00019947743374368467, + "loss": 0.9856, + "step": 1294 + }, + { + "epoch": 0.41579707818269385, + "grad_norm": 1.3553723096847534, + "learning_rate": 0.00019947386378799532, + "loss": 0.8419, + "step": 1295 + }, + { + "epoch": 0.41611815700754534, + "grad_norm": 2.194031238555908, + "learning_rate": 0.00019947028171171742, + "loss": 0.8151, + "step": 1296 + }, + { + "epoch": 0.41643923583239684, + "grad_norm": 1.5983561277389526, + "learning_rate": 0.00019946668751528744, + "loss": 0.8907, + "step": 1297 + }, + { + "epoch": 0.41676031465724833, + "grad_norm": 1.9155287742614746, + "learning_rate": 0.00019946308119914323, + "loss": 0.9095, + "step": 1298 + }, + { + "epoch": 0.4170813934820999, + "grad_norm": 1.078909993171692, + "learning_rate": 0.00019945946276372434, + "loss": 0.6413, + "step": 1299 + }, + { + "epoch": 0.4174024723069514, + "grad_norm": 1.317728042602539, + "learning_rate": 0.00019945583220947158, + "loss": 0.7406, + "step": 1300 + }, + { + "epoch": 0.4177235511318029, + "grad_norm": 1.7083852291107178, + "learning_rate": 0.00019945218953682734, + "loss": 1.3869, + "step": 1301 + }, + { + "epoch": 0.41804462995665437, + "grad_norm": 2.715884208679199, + "learning_rate": 0.00019944853474623548, + "loss": 1.4385, + "step": 1302 + }, + { + "epoch": 0.41836570878150586, + "grad_norm": 1.6352131366729736, + "learning_rate": 0.00019944486783814134, + "loss": 1.0713, + "step": 1303 + }, + { + "epoch": 0.41868678760635736, + "grad_norm": 1.9549245834350586, + "learning_rate": 0.00019944118881299168, + "loss": 0.876, + "step": 1304 + }, + { + "epoch": 0.41900786643120885, + "grad_norm": 3.7763001918792725, + "learning_rate": 0.0001994374976712348, + "loss": 0.7917, + "step": 1305 + }, + { + "epoch": 0.41932894525606035, + "grad_norm": 1.8948286771774292, + "learning_rate": 0.00019943379441332047, + "loss": 0.8379, + "step": 1306 + }, + { + "epoch": 0.41965002408091184, + "grad_norm": 5.399472713470459, + "learning_rate": 0.0001994300790396999, + "loss": 0.8858, + "step": 1307 + }, + { + "epoch": 0.41997110290576334, + "grad_norm": 2.114410400390625, + "learning_rate": 0.0001994263515508258, + "loss": 0.8726, + "step": 1308 + }, + { + "epoch": 0.4202921817306149, + "grad_norm": 2.5239293575286865, + "learning_rate": 0.00019942261194715236, + "loss": 1.225, + "step": 1309 + }, + { + "epoch": 0.4206132605554664, + "grad_norm": 1.2384289503097534, + "learning_rate": 0.00019941886022913522, + "loss": 1.1042, + "step": 1310 + }, + { + "epoch": 0.4209343393803179, + "grad_norm": 1.1283193826675415, + "learning_rate": 0.00019941509639723155, + "loss": 1.073, + "step": 1311 + }, + { + "epoch": 0.4212554182051694, + "grad_norm": 1.4066673517227173, + "learning_rate": 0.0001994113204518999, + "loss": 1.0025, + "step": 1312 + }, + { + "epoch": 0.42157649703002087, + "grad_norm": 1.927209734916687, + "learning_rate": 0.00019940753239360047, + "loss": 0.8864, + "step": 1313 + }, + { + "epoch": 0.42189757585487236, + "grad_norm": 1.1925103664398193, + "learning_rate": 0.00019940373222279473, + "loss": 0.7443, + "step": 1314 + }, + { + "epoch": 0.42221865467972386, + "grad_norm": 0.9499497413635254, + "learning_rate": 0.0001993999199399457, + "loss": 0.7643, + "step": 1315 + }, + { + "epoch": 0.42253973350457535, + "grad_norm": 1.9734052419662476, + "learning_rate": 0.000199396095545518, + "loss": 0.9895, + "step": 1316 + }, + { + "epoch": 0.42286081232942685, + "grad_norm": 1.0893967151641846, + "learning_rate": 0.0001993922590399775, + "loss": 0.9146, + "step": 1317 + }, + { + "epoch": 0.4231818911542784, + "grad_norm": 2.039592742919922, + "learning_rate": 0.00019938841042379174, + "loss": 1.0294, + "step": 1318 + }, + { + "epoch": 0.4235029699791299, + "grad_norm": 1.233694314956665, + "learning_rate": 0.00019938454969742968, + "loss": 0.8611, + "step": 1319 + }, + { + "epoch": 0.4238240488039814, + "grad_norm": 2.1485254764556885, + "learning_rate": 0.00019938067686136167, + "loss": 1.0674, + "step": 1320 + }, + { + "epoch": 0.4241451276288329, + "grad_norm": 1.7402235269546509, + "learning_rate": 0.00019937679191605963, + "loss": 1.1541, + "step": 1321 + }, + { + "epoch": 0.4244662064536844, + "grad_norm": 1.3716280460357666, + "learning_rate": 0.00019937289486199696, + "loss": 1.1266, + "step": 1322 + }, + { + "epoch": 0.4247872852785359, + "grad_norm": 1.6395798921585083, + "learning_rate": 0.00019936898569964848, + "loss": 0.9904, + "step": 1323 + }, + { + "epoch": 0.42510836410338737, + "grad_norm": 1.3475819826126099, + "learning_rate": 0.0001993650644294905, + "loss": 1.0421, + "step": 1324 + }, + { + "epoch": 0.42542944292823887, + "grad_norm": 1.9199656248092651, + "learning_rate": 0.00019936113105200085, + "loss": 1.1184, + "step": 1325 + }, + { + "epoch": 0.42575052175309036, + "grad_norm": 1.6179684400558472, + "learning_rate": 0.00019935718556765876, + "loss": 1.2786, + "step": 1326 + }, + { + "epoch": 0.4260716005779419, + "grad_norm": 1.5954012870788574, + "learning_rate": 0.000199353227976945, + "loss": 0.9325, + "step": 1327 + }, + { + "epoch": 0.4263926794027934, + "grad_norm": 1.832868218421936, + "learning_rate": 0.00019934925828034175, + "loss": 1.114, + "step": 1328 + }, + { + "epoch": 0.4267137582276449, + "grad_norm": 1.5652598142623901, + "learning_rate": 0.00019934527647833276, + "loss": 1.2399, + "step": 1329 + }, + { + "epoch": 0.4270348370524964, + "grad_norm": 2.4210355281829834, + "learning_rate": 0.0001993412825714032, + "loss": 0.9815, + "step": 1330 + }, + { + "epoch": 0.4273559158773479, + "grad_norm": 1.2339181900024414, + "learning_rate": 0.00019933727656003963, + "loss": 0.9054, + "step": 1331 + }, + { + "epoch": 0.4276769947021994, + "grad_norm": 2.5136241912841797, + "learning_rate": 0.0001993332584447303, + "loss": 1.1086, + "step": 1332 + }, + { + "epoch": 0.4279980735270509, + "grad_norm": 1.339241623878479, + "learning_rate": 0.00019932922822596473, + "loss": 1.2639, + "step": 1333 + }, + { + "epoch": 0.4283191523519024, + "grad_norm": 1.0698463916778564, + "learning_rate": 0.00019932518590423394, + "loss": 0.8401, + "step": 1334 + }, + { + "epoch": 0.42864023117675387, + "grad_norm": 2.2309930324554443, + "learning_rate": 0.00019932113148003058, + "loss": 0.8794, + "step": 1335 + }, + { + "epoch": 0.42896131000160537, + "grad_norm": 2.3192408084869385, + "learning_rate": 0.00019931706495384863, + "loss": 0.9037, + "step": 1336 + }, + { + "epoch": 0.4292823888264569, + "grad_norm": 1.2625378370285034, + "learning_rate": 0.00019931298632618356, + "loss": 0.9458, + "step": 1337 + }, + { + "epoch": 0.4296034676513084, + "grad_norm": 2.8485538959503174, + "learning_rate": 0.00019930889559753234, + "loss": 0.9646, + "step": 1338 + }, + { + "epoch": 0.4299245464761599, + "grad_norm": 1.6909540891647339, + "learning_rate": 0.00019930479276839344, + "loss": 1.0197, + "step": 1339 + }, + { + "epoch": 0.4302456253010114, + "grad_norm": 1.6980613470077515, + "learning_rate": 0.00019930067783926675, + "loss": 0.9211, + "step": 1340 + }, + { + "epoch": 0.4305667041258629, + "grad_norm": 0.8947736024856567, + "learning_rate": 0.0001992965508106537, + "loss": 0.7911, + "step": 1341 + }, + { + "epoch": 0.4308877829507144, + "grad_norm": 1.4755980968475342, + "learning_rate": 0.00019929241168305714, + "loss": 1.0548, + "step": 1342 + }, + { + "epoch": 0.4312088617755659, + "grad_norm": 2.1290855407714844, + "learning_rate": 0.00019928826045698136, + "loss": 0.9762, + "step": 1343 + }, + { + "epoch": 0.4315299406004174, + "grad_norm": 1.3842682838439941, + "learning_rate": 0.00019928409713293227, + "loss": 1.0557, + "step": 1344 + }, + { + "epoch": 0.4318510194252689, + "grad_norm": 2.651461601257324, + "learning_rate": 0.00019927992171141708, + "loss": 0.8827, + "step": 1345 + }, + { + "epoch": 0.43217209825012043, + "grad_norm": 3.1933929920196533, + "learning_rate": 0.00019927573419294456, + "loss": 0.9752, + "step": 1346 + }, + { + "epoch": 0.4324931770749719, + "grad_norm": 2.6903903484344482, + "learning_rate": 0.000199271534578025, + "loss": 1.0742, + "step": 1347 + }, + { + "epoch": 0.4328142558998234, + "grad_norm": 0.9604675769805908, + "learning_rate": 0.00019926732286717003, + "loss": 0.7134, + "step": 1348 + }, + { + "epoch": 0.4331353347246749, + "grad_norm": 1.6569525003433228, + "learning_rate": 0.0001992630990608929, + "loss": 0.6801, + "step": 1349 + }, + { + "epoch": 0.4334564135495264, + "grad_norm": 1.8039077520370483, + "learning_rate": 0.00019925886315970824, + "loss": 0.6657, + "step": 1350 + }, + { + "epoch": 0.4337774923743779, + "grad_norm": 2.7572011947631836, + "learning_rate": 0.00019925461516413223, + "loss": 1.5341, + "step": 1351 + }, + { + "epoch": 0.4340985711992294, + "grad_norm": 1.5353821516036987, + "learning_rate": 0.0001992503550746824, + "loss": 1.2077, + "step": 1352 + }, + { + "epoch": 0.4344196500240809, + "grad_norm": 1.8560153245925903, + "learning_rate": 0.00019924608289187786, + "loss": 1.043, + "step": 1353 + }, + { + "epoch": 0.4347407288489324, + "grad_norm": 1.4403096437454224, + "learning_rate": 0.00019924179861623915, + "loss": 0.7871, + "step": 1354 + }, + { + "epoch": 0.43506180767378394, + "grad_norm": 1.403806447982788, + "learning_rate": 0.00019923750224828832, + "loss": 0.7443, + "step": 1355 + }, + { + "epoch": 0.43538288649863544, + "grad_norm": 1.7703312635421753, + "learning_rate": 0.00019923319378854887, + "loss": 0.9553, + "step": 1356 + }, + { + "epoch": 0.43570396532348693, + "grad_norm": 1.7927194833755493, + "learning_rate": 0.00019922887323754577, + "loss": 0.6829, + "step": 1357 + }, + { + "epoch": 0.4360250441483384, + "grad_norm": 3.560272693634033, + "learning_rate": 0.00019922454059580544, + "loss": 1.0068, + "step": 1358 + }, + { + "epoch": 0.4363461229731899, + "grad_norm": 1.5825556516647339, + "learning_rate": 0.00019922019586385585, + "loss": 1.0626, + "step": 1359 + }, + { + "epoch": 0.4366672017980414, + "grad_norm": 1.7079756259918213, + "learning_rate": 0.00019921583904222633, + "loss": 0.9531, + "step": 1360 + }, + { + "epoch": 0.4369882806228929, + "grad_norm": 1.5126279592514038, + "learning_rate": 0.0001992114701314478, + "loss": 0.9648, + "step": 1361 + }, + { + "epoch": 0.4373093594477444, + "grad_norm": 1.6111955642700195, + "learning_rate": 0.00019920708913205256, + "loss": 1.111, + "step": 1362 + }, + { + "epoch": 0.4376304382725959, + "grad_norm": 1.1664777994155884, + "learning_rate": 0.00019920269604457446, + "loss": 1.0386, + "step": 1363 + }, + { + "epoch": 0.4379515170974474, + "grad_norm": 1.0824946165084839, + "learning_rate": 0.0001991982908695487, + "loss": 0.9912, + "step": 1364 + }, + { + "epoch": 0.43827259592229895, + "grad_norm": 1.131016492843628, + "learning_rate": 0.00019919387360751217, + "loss": 0.8238, + "step": 1365 + }, + { + "epoch": 0.43859367474715044, + "grad_norm": 1.307161808013916, + "learning_rate": 0.000199189444259003, + "loss": 0.8337, + "step": 1366 + }, + { + "epoch": 0.43891475357200194, + "grad_norm": 1.598332166671753, + "learning_rate": 0.0001991850028245609, + "loss": 1.1024, + "step": 1367 + }, + { + "epoch": 0.43923583239685343, + "grad_norm": 1.131783366203308, + "learning_rate": 0.00019918054930472706, + "loss": 0.9586, + "step": 1368 + }, + { + "epoch": 0.4395569112217049, + "grad_norm": 1.2683618068695068, + "learning_rate": 0.00019917608370004417, + "loss": 1.02, + "step": 1369 + }, + { + "epoch": 0.4398779900465564, + "grad_norm": 2.1559362411499023, + "learning_rate": 0.0001991716060110563, + "loss": 0.9757, + "step": 1370 + }, + { + "epoch": 0.4401990688714079, + "grad_norm": 1.6462043523788452, + "learning_rate": 0.00019916711623830903, + "loss": 1.0838, + "step": 1371 + }, + { + "epoch": 0.4405201476962594, + "grad_norm": 1.468853235244751, + "learning_rate": 0.0001991626143823495, + "loss": 1.1971, + "step": 1372 + }, + { + "epoch": 0.4408412265211109, + "grad_norm": 1.2045490741729736, + "learning_rate": 0.00019915810044372618, + "loss": 1.2012, + "step": 1373 + }, + { + "epoch": 0.44116230534596246, + "grad_norm": 1.699227213859558, + "learning_rate": 0.0001991535744229891, + "loss": 0.9933, + "step": 1374 + }, + { + "epoch": 0.44148338417081395, + "grad_norm": 1.5638002157211304, + "learning_rate": 0.00019914903632068973, + "loss": 1.2959, + "step": 1375 + }, + { + "epoch": 0.44180446299566545, + "grad_norm": 2.418407440185547, + "learning_rate": 0.00019914448613738106, + "loss": 1.1246, + "step": 1376 + }, + { + "epoch": 0.44212554182051694, + "grad_norm": 2.6279420852661133, + "learning_rate": 0.00019913992387361745, + "loss": 1.069, + "step": 1377 + }, + { + "epoch": 0.44244662064536844, + "grad_norm": 1.4538555145263672, + "learning_rate": 0.00019913534952995486, + "loss": 0.9135, + "step": 1378 + }, + { + "epoch": 0.44276769947021993, + "grad_norm": 1.7238460779190063, + "learning_rate": 0.00019913076310695068, + "loss": 1.3682, + "step": 1379 + }, + { + "epoch": 0.44308877829507143, + "grad_norm": 2.1236801147460938, + "learning_rate": 0.00019912616460516364, + "loss": 1.0323, + "step": 1380 + }, + { + "epoch": 0.4434098571199229, + "grad_norm": 1.527573585510254, + "learning_rate": 0.00019912155402515417, + "loss": 1.2224, + "step": 1381 + }, + { + "epoch": 0.4437309359447744, + "grad_norm": 1.1867547035217285, + "learning_rate": 0.00019911693136748403, + "loss": 0.9621, + "step": 1382 + }, + { + "epoch": 0.44405201476962597, + "grad_norm": 1.631277322769165, + "learning_rate": 0.0001991122966327164, + "loss": 1.1633, + "step": 1383 + }, + { + "epoch": 0.44437309359447746, + "grad_norm": 2.110377550125122, + "learning_rate": 0.0001991076498214161, + "loss": 1.081, + "step": 1384 + }, + { + "epoch": 0.44469417241932896, + "grad_norm": 1.2821978330612183, + "learning_rate": 0.0001991029909341493, + "loss": 0.8347, + "step": 1385 + }, + { + "epoch": 0.44501525124418045, + "grad_norm": 1.3833321332931519, + "learning_rate": 0.00019909831997148362, + "loss": 1.0652, + "step": 1386 + }, + { + "epoch": 0.44533633006903195, + "grad_norm": 1.2607406377792358, + "learning_rate": 0.00019909363693398828, + "loss": 0.9888, + "step": 1387 + }, + { + "epoch": 0.44565740889388344, + "grad_norm": 1.2479708194732666, + "learning_rate": 0.00019908894182223388, + "loss": 0.9652, + "step": 1388 + }, + { + "epoch": 0.44597848771873494, + "grad_norm": 1.9284359216690063, + "learning_rate": 0.00019908423463679248, + "loss": 1.0546, + "step": 1389 + }, + { + "epoch": 0.44629956654358643, + "grad_norm": 1.1474945545196533, + "learning_rate": 0.0001990795153782376, + "loss": 0.8867, + "step": 1390 + }, + { + "epoch": 0.44662064536843793, + "grad_norm": 1.8049343824386597, + "learning_rate": 0.00019907478404714436, + "loss": 0.921, + "step": 1391 + }, + { + "epoch": 0.4469417241932895, + "grad_norm": 2.072420835494995, + "learning_rate": 0.0001990700406440892, + "loss": 1.0613, + "step": 1392 + }, + { + "epoch": 0.447262803018141, + "grad_norm": 1.3974262475967407, + "learning_rate": 0.00019906528516965008, + "loss": 0.9483, + "step": 1393 + }, + { + "epoch": 0.44758388184299247, + "grad_norm": 1.0429906845092773, + "learning_rate": 0.0001990605176244065, + "loss": 0.8286, + "step": 1394 + }, + { + "epoch": 0.44790496066784397, + "grad_norm": 1.2362806797027588, + "learning_rate": 0.0001990557380089393, + "loss": 1.0296, + "step": 1395 + }, + { + "epoch": 0.44822603949269546, + "grad_norm": 1.1999589204788208, + "learning_rate": 0.0001990509463238309, + "loss": 0.7282, + "step": 1396 + }, + { + "epoch": 0.44854711831754696, + "grad_norm": 1.3461198806762695, + "learning_rate": 0.00019904614256966512, + "loss": 0.9301, + "step": 1397 + }, + { + "epoch": 0.44886819714239845, + "grad_norm": 2.5032989978790283, + "learning_rate": 0.00019904132674702734, + "loss": 0.5983, + "step": 1398 + }, + { + "epoch": 0.44918927596724995, + "grad_norm": 1.029445767402649, + "learning_rate": 0.0001990364988565043, + "loss": 0.7837, + "step": 1399 + }, + { + "epoch": 0.44951035479210144, + "grad_norm": 0.8912380337715149, + "learning_rate": 0.0001990316588986843, + "loss": 0.6475, + "step": 1400 + }, + { + "epoch": 0.44983143361695294, + "grad_norm": 4.18993616104126, + "learning_rate": 0.00019902680687415705, + "loss": 1.7317, + "step": 1401 + }, + { + "epoch": 0.4501525124418045, + "grad_norm": 6.351428031921387, + "learning_rate": 0.00019902194278351374, + "loss": 1.6514, + "step": 1402 + }, + { + "epoch": 0.450473591266656, + "grad_norm": 2.5410821437835693, + "learning_rate": 0.00019901706662734712, + "loss": 1.1215, + "step": 1403 + }, + { + "epoch": 0.4507946700915075, + "grad_norm": 1.9113538265228271, + "learning_rate": 0.0001990121784062512, + "loss": 0.9147, + "step": 1404 + }, + { + "epoch": 0.45111574891635897, + "grad_norm": 3.196178913116455, + "learning_rate": 0.00019900727812082177, + "loss": 0.9308, + "step": 1405 + }, + { + "epoch": 0.45143682774121047, + "grad_norm": 1.5448819398880005, + "learning_rate": 0.00019900236577165576, + "loss": 0.8038, + "step": 1406 + }, + { + "epoch": 0.45175790656606196, + "grad_norm": 1.4571887254714966, + "learning_rate": 0.0001989974413593518, + "loss": 0.7484, + "step": 1407 + }, + { + "epoch": 0.45207898539091346, + "grad_norm": 1.5297858715057373, + "learning_rate": 0.0001989925048845099, + "loss": 0.854, + "step": 1408 + }, + { + "epoch": 0.45240006421576495, + "grad_norm": 1.8126001358032227, + "learning_rate": 0.00019898755634773158, + "loss": 0.9785, + "step": 1409 + }, + { + "epoch": 0.45272114304061645, + "grad_norm": 1.3122637271881104, + "learning_rate": 0.00019898259574961978, + "loss": 1.1906, + "step": 1410 + }, + { + "epoch": 0.453042221865468, + "grad_norm": 1.2841436862945557, + "learning_rate": 0.0001989776230907789, + "loss": 0.7769, + "step": 1411 + }, + { + "epoch": 0.4533633006903195, + "grad_norm": 1.6594353914260864, + "learning_rate": 0.00019897263837181491, + "loss": 0.8995, + "step": 1412 + }, + { + "epoch": 0.453684379515171, + "grad_norm": 1.2056258916854858, + "learning_rate": 0.0001989676415933351, + "loss": 0.9341, + "step": 1413 + }, + { + "epoch": 0.4540054583400225, + "grad_norm": 1.2220386266708374, + "learning_rate": 0.00019896263275594842, + "loss": 0.9202, + "step": 1414 + }, + { + "epoch": 0.454326537164874, + "grad_norm": 1.083976149559021, + "learning_rate": 0.0001989576118602651, + "loss": 0.8357, + "step": 1415 + }, + { + "epoch": 0.4546476159897255, + "grad_norm": 1.134037733078003, + "learning_rate": 0.00019895257890689696, + "loss": 1.0837, + "step": 1416 + }, + { + "epoch": 0.45496869481457697, + "grad_norm": 1.6373850107192993, + "learning_rate": 0.00019894753389645723, + "loss": 1.0875, + "step": 1417 + }, + { + "epoch": 0.45528977363942846, + "grad_norm": 1.8361527919769287, + "learning_rate": 0.0001989424768295606, + "loss": 0.9749, + "step": 1418 + }, + { + "epoch": 0.45561085246427996, + "grad_norm": 2.0697522163391113, + "learning_rate": 0.00019893740770682335, + "loss": 0.9262, + "step": 1419 + }, + { + "epoch": 0.4559319312891315, + "grad_norm": 1.6787400245666504, + "learning_rate": 0.00019893232652886306, + "loss": 1.028, + "step": 1420 + }, + { + "epoch": 0.456253010113983, + "grad_norm": 1.6536740064620972, + "learning_rate": 0.00019892723329629887, + "loss": 1.138, + "step": 1421 + }, + { + "epoch": 0.4565740889388345, + "grad_norm": 2.046743631362915, + "learning_rate": 0.00019892212800975135, + "loss": 0.9917, + "step": 1422 + }, + { + "epoch": 0.456895167763686, + "grad_norm": 1.4609099626541138, + "learning_rate": 0.00019891701066984262, + "loss": 0.9998, + "step": 1423 + }, + { + "epoch": 0.4572162465885375, + "grad_norm": 1.2826584577560425, + "learning_rate": 0.00019891188127719618, + "loss": 1.0006, + "step": 1424 + }, + { + "epoch": 0.457537325413389, + "grad_norm": 1.3913053274154663, + "learning_rate": 0.00019890673983243706, + "loss": 1.098, + "step": 1425 + }, + { + "epoch": 0.4578584042382405, + "grad_norm": 2.0255448818206787, + "learning_rate": 0.0001989015863361917, + "loss": 1.0888, + "step": 1426 + }, + { + "epoch": 0.458179483063092, + "grad_norm": 2.3133769035339355, + "learning_rate": 0.00019889642078908804, + "loss": 1.2081, + "step": 1427 + }, + { + "epoch": 0.45850056188794347, + "grad_norm": 1.3124561309814453, + "learning_rate": 0.00019889124319175547, + "loss": 1.125, + "step": 1428 + }, + { + "epoch": 0.45882164071279496, + "grad_norm": 1.5743858814239502, + "learning_rate": 0.0001988860535448249, + "loss": 1.0983, + "step": 1429 + }, + { + "epoch": 0.4591427195376465, + "grad_norm": 1.6619899272918701, + "learning_rate": 0.00019888085184892868, + "loss": 1.1139, + "step": 1430 + }, + { + "epoch": 0.459463798362498, + "grad_norm": 1.2634574174880981, + "learning_rate": 0.0001988756381047006, + "loss": 1.1412, + "step": 1431 + }, + { + "epoch": 0.4597848771873495, + "grad_norm": 1.1055835485458374, + "learning_rate": 0.00019887041231277593, + "loss": 0.7893, + "step": 1432 + }, + { + "epoch": 0.460105956012201, + "grad_norm": 1.8112157583236694, + "learning_rate": 0.0001988651744737914, + "loss": 0.8947, + "step": 1433 + }, + { + "epoch": 0.4604270348370525, + "grad_norm": 0.9498884081840515, + "learning_rate": 0.00019885992458838528, + "loss": 0.9129, + "step": 1434 + }, + { + "epoch": 0.460748113661904, + "grad_norm": 1.6810704469680786, + "learning_rate": 0.0001988546626571972, + "loss": 1.0324, + "step": 1435 + }, + { + "epoch": 0.4610691924867555, + "grad_norm": 1.1200361251831055, + "learning_rate": 0.00019884938868086835, + "loss": 1.1815, + "step": 1436 + }, + { + "epoch": 0.461390271311607, + "grad_norm": 1.532668113708496, + "learning_rate": 0.00019884410266004135, + "loss": 0.9087, + "step": 1437 + }, + { + "epoch": 0.4617113501364585, + "grad_norm": 3.874738931655884, + "learning_rate": 0.00019883880459536024, + "loss": 0.9593, + "step": 1438 + }, + { + "epoch": 0.46203242896131, + "grad_norm": 2.262633800506592, + "learning_rate": 0.00019883349448747062, + "loss": 0.9921, + "step": 1439 + }, + { + "epoch": 0.4623535077861615, + "grad_norm": 1.0738508701324463, + "learning_rate": 0.00019882817233701948, + "loss": 0.7347, + "step": 1440 + }, + { + "epoch": 0.462674586611013, + "grad_norm": 1.7245491743087769, + "learning_rate": 0.0001988228381446553, + "loss": 0.9579, + "step": 1441 + }, + { + "epoch": 0.4629956654358645, + "grad_norm": 1.3573378324508667, + "learning_rate": 0.00019881749191102808, + "loss": 0.9096, + "step": 1442 + }, + { + "epoch": 0.463316744260716, + "grad_norm": 1.2465156316757202, + "learning_rate": 0.0001988121336367892, + "loss": 0.9165, + "step": 1443 + }, + { + "epoch": 0.4636378230855675, + "grad_norm": 1.3146942853927612, + "learning_rate": 0.00019880676332259154, + "loss": 0.7766, + "step": 1444 + }, + { + "epoch": 0.463958901910419, + "grad_norm": 1.2299082279205322, + "learning_rate": 0.00019880138096908952, + "loss": 0.7833, + "step": 1445 + }, + { + "epoch": 0.4642799807352705, + "grad_norm": 1.622185230255127, + "learning_rate": 0.00019879598657693891, + "loss": 0.7363, + "step": 1446 + }, + { + "epoch": 0.464601059560122, + "grad_norm": 1.7409615516662598, + "learning_rate": 0.00019879058014679704, + "loss": 1.0281, + "step": 1447 + }, + { + "epoch": 0.46492213838497354, + "grad_norm": 1.8660942316055298, + "learning_rate": 0.00019878516167932261, + "loss": 0.8218, + "step": 1448 + }, + { + "epoch": 0.46524321720982503, + "grad_norm": 1.445410132408142, + "learning_rate": 0.0001987797311751759, + "loss": 0.7792, + "step": 1449 + }, + { + "epoch": 0.46556429603467653, + "grad_norm": 1.0770496129989624, + "learning_rate": 0.00019877428863501856, + "loss": 0.6364, + "step": 1450 + }, + { + "epoch": 0.465885374859528, + "grad_norm": 1.7615505456924438, + "learning_rate": 0.00019876883405951377, + "loss": 1.367, + "step": 1451 + }, + { + "epoch": 0.4662064536843795, + "grad_norm": 1.661313772201538, + "learning_rate": 0.00019876336744932614, + "loss": 1.1688, + "step": 1452 + }, + { + "epoch": 0.466527532509231, + "grad_norm": 1.480622410774231, + "learning_rate": 0.0001987578888051218, + "loss": 1.0435, + "step": 1453 + }, + { + "epoch": 0.4668486113340825, + "grad_norm": 1.6343483924865723, + "learning_rate": 0.00019875239812756825, + "loss": 0.9328, + "step": 1454 + }, + { + "epoch": 0.467169690158934, + "grad_norm": 2.5799641609191895, + "learning_rate": 0.00019874689541733457, + "loss": 0.8994, + "step": 1455 + }, + { + "epoch": 0.4674907689837855, + "grad_norm": 1.4654110670089722, + "learning_rate": 0.00019874138067509117, + "loss": 0.7419, + "step": 1456 + }, + { + "epoch": 0.467811847808637, + "grad_norm": 1.7656830549240112, + "learning_rate": 0.00019873585390151003, + "loss": 1.0638, + "step": 1457 + }, + { + "epoch": 0.46813292663348854, + "grad_norm": 1.733784794807434, + "learning_rate": 0.00019873031509726462, + "loss": 1.1878, + "step": 1458 + }, + { + "epoch": 0.46845400545834004, + "grad_norm": 2.345583915710449, + "learning_rate": 0.00019872476426302982, + "loss": 0.98, + "step": 1459 + }, + { + "epoch": 0.46877508428319153, + "grad_norm": 1.3415275812149048, + "learning_rate": 0.00019871920139948192, + "loss": 0.9307, + "step": 1460 + }, + { + "epoch": 0.46909616310804303, + "grad_norm": 1.6265960931777954, + "learning_rate": 0.0001987136265072988, + "loss": 1.0417, + "step": 1461 + }, + { + "epoch": 0.4694172419328945, + "grad_norm": 1.1737104654312134, + "learning_rate": 0.00019870803958715972, + "loss": 0.8448, + "step": 1462 + }, + { + "epoch": 0.469738320757746, + "grad_norm": 1.189310908317566, + "learning_rate": 0.0001987024406397454, + "loss": 1.006, + "step": 1463 + }, + { + "epoch": 0.4700593995825975, + "grad_norm": 1.5854594707489014, + "learning_rate": 0.00019869682966573813, + "loss": 0.9803, + "step": 1464 + }, + { + "epoch": 0.470380478407449, + "grad_norm": 1.1072514057159424, + "learning_rate": 0.00019869120666582153, + "loss": 1.0124, + "step": 1465 + }, + { + "epoch": 0.4707015572323005, + "grad_norm": 1.613070011138916, + "learning_rate": 0.00019868557164068074, + "loss": 1.2908, + "step": 1466 + }, + { + "epoch": 0.47102263605715206, + "grad_norm": 1.2394685745239258, + "learning_rate": 0.0001986799245910024, + "loss": 1.1003, + "step": 1467 + }, + { + "epoch": 0.47134371488200355, + "grad_norm": 1.047046422958374, + "learning_rate": 0.00019867426551747457, + "loss": 1.0153, + "step": 1468 + }, + { + "epoch": 0.47166479370685505, + "grad_norm": 1.3292659521102905, + "learning_rate": 0.0001986685944207868, + "loss": 0.9651, + "step": 1469 + }, + { + "epoch": 0.47198587253170654, + "grad_norm": 1.4986034631729126, + "learning_rate": 0.0001986629113016301, + "loss": 0.9747, + "step": 1470 + }, + { + "epoch": 0.47230695135655804, + "grad_norm": 2.4843838214874268, + "learning_rate": 0.00019865721616069696, + "loss": 1.0655, + "step": 1471 + }, + { + "epoch": 0.47262803018140953, + "grad_norm": 1.3794037103652954, + "learning_rate": 0.00019865150899868125, + "loss": 1.1716, + "step": 1472 + }, + { + "epoch": 0.472949109006261, + "grad_norm": 1.1681032180786133, + "learning_rate": 0.00019864578981627844, + "loss": 1.1593, + "step": 1473 + }, + { + "epoch": 0.4732701878311125, + "grad_norm": 1.135563850402832, + "learning_rate": 0.00019864005861418535, + "loss": 0.9522, + "step": 1474 + }, + { + "epoch": 0.473591266655964, + "grad_norm": 1.3052153587341309, + "learning_rate": 0.0001986343153931003, + "loss": 1.0636, + "step": 1475 + }, + { + "epoch": 0.47391234548081557, + "grad_norm": 2.6790149211883545, + "learning_rate": 0.00019862856015372317, + "loss": 1.0276, + "step": 1476 + }, + { + "epoch": 0.47423342430566706, + "grad_norm": 1.5842307806015015, + "learning_rate": 0.00019862279289675509, + "loss": 0.9003, + "step": 1477 + }, + { + "epoch": 0.47455450313051856, + "grad_norm": 1.1842899322509766, + "learning_rate": 0.0001986170136228989, + "loss": 0.8584, + "step": 1478 + }, + { + "epoch": 0.47487558195537005, + "grad_norm": 1.4418596029281616, + "learning_rate": 0.0001986112223328587, + "loss": 0.9339, + "step": 1479 + }, + { + "epoch": 0.47519666078022155, + "grad_norm": 1.6683169603347778, + "learning_rate": 0.00019860541902734022, + "loss": 0.9629, + "step": 1480 + }, + { + "epoch": 0.47551773960507304, + "grad_norm": 1.3546855449676514, + "learning_rate": 0.0001985996037070505, + "loss": 1.0198, + "step": 1481 + }, + { + "epoch": 0.47583881842992454, + "grad_norm": 1.2630248069763184, + "learning_rate": 0.00019859377637269815, + "loss": 0.9135, + "step": 1482 + }, + { + "epoch": 0.47615989725477603, + "grad_norm": 1.2437412738800049, + "learning_rate": 0.00019858793702499323, + "loss": 0.8743, + "step": 1483 + }, + { + "epoch": 0.4764809760796275, + "grad_norm": 2.26094388961792, + "learning_rate": 0.00019858208566464724, + "loss": 1.038, + "step": 1484 + }, + { + "epoch": 0.476802054904479, + "grad_norm": 1.2740654945373535, + "learning_rate": 0.00019857622229237313, + "loss": 1.0045, + "step": 1485 + }, + { + "epoch": 0.4771231337293306, + "grad_norm": 1.8857311010360718, + "learning_rate": 0.00019857034690888537, + "loss": 0.9932, + "step": 1486 + }, + { + "epoch": 0.47744421255418207, + "grad_norm": 1.1165155172348022, + "learning_rate": 0.00019856445951489982, + "loss": 0.8839, + "step": 1487 + }, + { + "epoch": 0.47776529137903356, + "grad_norm": 1.5677917003631592, + "learning_rate": 0.00019855856011113384, + "loss": 0.8075, + "step": 1488 + }, + { + "epoch": 0.47808637020388506, + "grad_norm": 1.1825371980667114, + "learning_rate": 0.00019855264869830629, + "loss": 0.875, + "step": 1489 + }, + { + "epoch": 0.47840744902873655, + "grad_norm": 1.572262167930603, + "learning_rate": 0.00019854672527713744, + "loss": 1.1585, + "step": 1490 + }, + { + "epoch": 0.47872852785358805, + "grad_norm": 2.2230706214904785, + "learning_rate": 0.00019854078984834903, + "loss": 0.9478, + "step": 1491 + }, + { + "epoch": 0.47904960667843954, + "grad_norm": 1.5785419940948486, + "learning_rate": 0.00019853484241266428, + "loss": 0.8647, + "step": 1492 + }, + { + "epoch": 0.47937068550329104, + "grad_norm": 0.8901344537734985, + "learning_rate": 0.00019852888297080786, + "loss": 0.7951, + "step": 1493 + }, + { + "epoch": 0.47969176432814253, + "grad_norm": 1.1752713918685913, + "learning_rate": 0.00019852291152350592, + "loss": 0.9361, + "step": 1494 + }, + { + "epoch": 0.4800128431529941, + "grad_norm": 1.196556568145752, + "learning_rate": 0.0001985169280714861, + "loss": 0.9662, + "step": 1495 + }, + { + "epoch": 0.4803339219778456, + "grad_norm": 1.6458885669708252, + "learning_rate": 0.0001985109326154774, + "loss": 0.932, + "step": 1496 + }, + { + "epoch": 0.4806550008026971, + "grad_norm": 1.7147083282470703, + "learning_rate": 0.00019850492515621038, + "loss": 0.8323, + "step": 1497 + }, + { + "epoch": 0.48097607962754857, + "grad_norm": 1.5965648889541626, + "learning_rate": 0.00019849890569441703, + "loss": 0.6222, + "step": 1498 + }, + { + "epoch": 0.48129715845240006, + "grad_norm": 0.8603938221931458, + "learning_rate": 0.00019849287423083078, + "loss": 0.4993, + "step": 1499 + }, + { + "epoch": 0.48161823727725156, + "grad_norm": 1.216637134552002, + "learning_rate": 0.00019848683076618658, + "loss": 0.606, + "step": 1500 + }, + { + "epoch": 0.48193931610210305, + "grad_norm": 1.512508511543274, + "learning_rate": 0.00019848077530122083, + "loss": 1.5987, + "step": 1501 + }, + { + "epoch": 0.48226039492695455, + "grad_norm": 1.3921582698822021, + "learning_rate": 0.00019847470783667127, + "loss": 1.3467, + "step": 1502 + }, + { + "epoch": 0.48258147375180604, + "grad_norm": 1.5864951610565186, + "learning_rate": 0.0001984686283732773, + "loss": 1.3512, + "step": 1503 + }, + { + "epoch": 0.4829025525766576, + "grad_norm": 1.5356676578521729, + "learning_rate": 0.00019846253691177966, + "loss": 0.8669, + "step": 1504 + }, + { + "epoch": 0.4832236314015091, + "grad_norm": 1.3852877616882324, + "learning_rate": 0.00019845643345292054, + "loss": 0.677, + "step": 1505 + }, + { + "epoch": 0.4835447102263606, + "grad_norm": 1.4612033367156982, + "learning_rate": 0.00019845031799744367, + "loss": 0.8081, + "step": 1506 + }, + { + "epoch": 0.4838657890512121, + "grad_norm": 1.5875556468963623, + "learning_rate": 0.0001984441905460942, + "loss": 0.783, + "step": 1507 + }, + { + "epoch": 0.4841868678760636, + "grad_norm": 2.032498359680176, + "learning_rate": 0.00019843805109961868, + "loss": 1.2169, + "step": 1508 + }, + { + "epoch": 0.48450794670091507, + "grad_norm": 1.468684196472168, + "learning_rate": 0.00019843189965876526, + "loss": 1.1707, + "step": 1509 + }, + { + "epoch": 0.48482902552576657, + "grad_norm": 1.416133165359497, + "learning_rate": 0.00019842573622428345, + "loss": 1.0083, + "step": 1510 + }, + { + "epoch": 0.48515010435061806, + "grad_norm": 1.3447717428207397, + "learning_rate": 0.0001984195607969242, + "loss": 1.158, + "step": 1511 + }, + { + "epoch": 0.48547118317546956, + "grad_norm": 0.9209278225898743, + "learning_rate": 0.00019841337337744004, + "loss": 0.8482, + "step": 1512 + }, + { + "epoch": 0.48579226200032105, + "grad_norm": 1.1930347681045532, + "learning_rate": 0.00019840717396658484, + "loss": 0.9346, + "step": 1513 + }, + { + "epoch": 0.4861133408251726, + "grad_norm": 2.087160587310791, + "learning_rate": 0.00019840096256511398, + "loss": 1.034, + "step": 1514 + }, + { + "epoch": 0.4864344196500241, + "grad_norm": 2.3745105266571045, + "learning_rate": 0.00019839473917378434, + "loss": 0.9559, + "step": 1515 + }, + { + "epoch": 0.4867554984748756, + "grad_norm": 1.2410439252853394, + "learning_rate": 0.00019838850379335417, + "loss": 1.1425, + "step": 1516 + }, + { + "epoch": 0.4870765772997271, + "grad_norm": 1.2628313302993774, + "learning_rate": 0.00019838225642458327, + "loss": 0.8775, + "step": 1517 + }, + { + "epoch": 0.4873976561245786, + "grad_norm": 1.2423824071884155, + "learning_rate": 0.00019837599706823284, + "loss": 1.1197, + "step": 1518 + }, + { + "epoch": 0.4877187349494301, + "grad_norm": 1.6348954439163208, + "learning_rate": 0.00019836972572506557, + "loss": 1.1318, + "step": 1519 + }, + { + "epoch": 0.4880398137742816, + "grad_norm": 1.3113266229629517, + "learning_rate": 0.00019836344239584564, + "loss": 0.8346, + "step": 1520 + }, + { + "epoch": 0.48836089259913307, + "grad_norm": 2.537024974822998, + "learning_rate": 0.00019835714708133862, + "loss": 0.9601, + "step": 1521 + }, + { + "epoch": 0.48868197142398456, + "grad_norm": 1.3241581916809082, + "learning_rate": 0.00019835083978231156, + "loss": 1.0883, + "step": 1522 + }, + { + "epoch": 0.4890030502488361, + "grad_norm": 1.2066631317138672, + "learning_rate": 0.00019834452049953297, + "loss": 0.8249, + "step": 1523 + }, + { + "epoch": 0.4893241290736876, + "grad_norm": 2.023279905319214, + "learning_rate": 0.0001983381892337729, + "loss": 1.1508, + "step": 1524 + }, + { + "epoch": 0.4896452078985391, + "grad_norm": 1.0849757194519043, + "learning_rate": 0.00019833184598580276, + "loss": 0.8601, + "step": 1525 + }, + { + "epoch": 0.4899662867233906, + "grad_norm": 1.6499979496002197, + "learning_rate": 0.0001983254907563955, + "loss": 1.0404, + "step": 1526 + }, + { + "epoch": 0.4902873655482421, + "grad_norm": 2.0686190128326416, + "learning_rate": 0.00019831912354632535, + "loss": 1.0121, + "step": 1527 + }, + { + "epoch": 0.4906084443730936, + "grad_norm": 1.2880290746688843, + "learning_rate": 0.0001983127443563683, + "loss": 0.9684, + "step": 1528 + }, + { + "epoch": 0.4909295231979451, + "grad_norm": 1.8285752534866333, + "learning_rate": 0.00019830635318730154, + "loss": 1.1573, + "step": 1529 + }, + { + "epoch": 0.4912506020227966, + "grad_norm": 1.0872652530670166, + "learning_rate": 0.00019829995003990388, + "loss": 0.9672, + "step": 1530 + }, + { + "epoch": 0.4915716808476481, + "grad_norm": 3.2847740650177, + "learning_rate": 0.00019829353491495545, + "loss": 1.0548, + "step": 1531 + }, + { + "epoch": 0.4918927596724996, + "grad_norm": 1.274139165878296, + "learning_rate": 0.00019828710781323792, + "loss": 1.1498, + "step": 1532 + }, + { + "epoch": 0.4922138384973511, + "grad_norm": 1.1564581394195557, + "learning_rate": 0.00019828066873553448, + "loss": 1.0376, + "step": 1533 + }, + { + "epoch": 0.4925349173222026, + "grad_norm": 2.3864920139312744, + "learning_rate": 0.00019827421768262967, + "loss": 1.2347, + "step": 1534 + }, + { + "epoch": 0.4928559961470541, + "grad_norm": 1.5429226160049438, + "learning_rate": 0.0001982677546553095, + "loss": 0.7993, + "step": 1535 + }, + { + "epoch": 0.4931770749719056, + "grad_norm": 1.7663393020629883, + "learning_rate": 0.00019826127965436152, + "loss": 1.1139, + "step": 1536 + }, + { + "epoch": 0.4934981537967571, + "grad_norm": 1.8667831420898438, + "learning_rate": 0.00019825479268057467, + "loss": 1.1442, + "step": 1537 + }, + { + "epoch": 0.4938192326216086, + "grad_norm": 1.3998355865478516, + "learning_rate": 0.0001982482937347394, + "loss": 0.9002, + "step": 1538 + }, + { + "epoch": 0.4941403114464601, + "grad_norm": 1.7938013076782227, + "learning_rate": 0.00019824178281764753, + "loss": 1.039, + "step": 1539 + }, + { + "epoch": 0.4944613902713116, + "grad_norm": 2.098247528076172, + "learning_rate": 0.00019823525993009243, + "loss": 0.729, + "step": 1540 + }, + { + "epoch": 0.49478246909616314, + "grad_norm": 1.4042150974273682, + "learning_rate": 0.0001982287250728689, + "loss": 1.1048, + "step": 1541 + }, + { + "epoch": 0.49510354792101463, + "grad_norm": 1.135709285736084, + "learning_rate": 0.00019822217824677315, + "loss": 0.8585, + "step": 1542 + }, + { + "epoch": 0.4954246267458661, + "grad_norm": 2.2327728271484375, + "learning_rate": 0.0001982156194526029, + "loss": 1.0654, + "step": 1543 + }, + { + "epoch": 0.4957457055707176, + "grad_norm": 1.6728448867797852, + "learning_rate": 0.0001982090486911574, + "loss": 0.8941, + "step": 1544 + }, + { + "epoch": 0.4960667843955691, + "grad_norm": 1.843428373336792, + "learning_rate": 0.0001982024659632372, + "loss": 1.0319, + "step": 1545 + }, + { + "epoch": 0.4963878632204206, + "grad_norm": 1.780278205871582, + "learning_rate": 0.00019819587126964437, + "loss": 0.9306, + "step": 1546 + }, + { + "epoch": 0.4967089420452721, + "grad_norm": 1.5725250244140625, + "learning_rate": 0.00019818926461118253, + "loss": 0.8823, + "step": 1547 + }, + { + "epoch": 0.4970300208701236, + "grad_norm": 2.787895679473877, + "learning_rate": 0.0001981826459886566, + "loss": 1.0951, + "step": 1548 + }, + { + "epoch": 0.4973510996949751, + "grad_norm": 0.9631800651550293, + "learning_rate": 0.00019817601540287306, + "loss": 0.637, + "step": 1549 + }, + { + "epoch": 0.4976721785198266, + "grad_norm": 1.3815464973449707, + "learning_rate": 0.0001981693728546399, + "loss": 0.7441, + "step": 1550 + }, + { + "epoch": 0.49799325734467814, + "grad_norm": 1.8321810960769653, + "learning_rate": 0.00019816271834476642, + "loss": 1.4642, + "step": 1551 + }, + { + "epoch": 0.49831433616952964, + "grad_norm": 1.4489206075668335, + "learning_rate": 0.00019815605187406345, + "loss": 1.2029, + "step": 1552 + }, + { + "epoch": 0.49863541499438113, + "grad_norm": 1.4845582246780396, + "learning_rate": 0.0001981493734433433, + "loss": 0.823, + "step": 1553 + }, + { + "epoch": 0.4989564938192326, + "grad_norm": 1.642909288406372, + "learning_rate": 0.0001981426830534197, + "loss": 0.8766, + "step": 1554 + }, + { + "epoch": 0.4992775726440841, + "grad_norm": 1.8939411640167236, + "learning_rate": 0.00019813598070510792, + "loss": 0.9463, + "step": 1555 + }, + { + "epoch": 0.4995986514689356, + "grad_norm": 1.7453148365020752, + "learning_rate": 0.0001981292663992245, + "loss": 0.8896, + "step": 1556 + }, + { + "epoch": 0.4999197302937871, + "grad_norm": 1.5144426822662354, + "learning_rate": 0.00019812254013658768, + "loss": 0.8909, + "step": 1557 + }, + { + "epoch": 0.5002408091186387, + "grad_norm": 1.3950881958007812, + "learning_rate": 0.00019811580191801697, + "loss": 0.9749, + "step": 1558 + }, + { + "epoch": 0.5005618879434901, + "grad_norm": 1.4966586828231812, + "learning_rate": 0.0001981090517443334, + "loss": 0.9999, + "step": 1559 + }, + { + "epoch": 0.5008829667683417, + "grad_norm": 1.356341004371643, + "learning_rate": 0.0001981022896163595, + "loss": 1.192, + "step": 1560 + }, + { + "epoch": 0.5012040455931931, + "grad_norm": 3.053656816482544, + "learning_rate": 0.00019809551553491916, + "loss": 1.1255, + "step": 1561 + }, + { + "epoch": 0.5015251244180446, + "grad_norm": 1.3365064859390259, + "learning_rate": 0.00019808872950083782, + "loss": 0.9588, + "step": 1562 + }, + { + "epoch": 0.5018462032428961, + "grad_norm": 0.789279580116272, + "learning_rate": 0.00019808193151494232, + "loss": 0.747, + "step": 1563 + }, + { + "epoch": 0.5021672820677476, + "grad_norm": 1.067087173461914, + "learning_rate": 0.000198075121578061, + "loss": 0.9783, + "step": 1564 + }, + { + "epoch": 0.5024883608925992, + "grad_norm": 1.0413140058517456, + "learning_rate": 0.00019806829969102357, + "loss": 0.8484, + "step": 1565 + }, + { + "epoch": 0.5028094397174506, + "grad_norm": 1.8259743452072144, + "learning_rate": 0.0001980614658546613, + "loss": 1.0746, + "step": 1566 + }, + { + "epoch": 0.5031305185423022, + "grad_norm": 1.282426118850708, + "learning_rate": 0.00019805462006980689, + "loss": 1.1119, + "step": 1567 + }, + { + "epoch": 0.5034515973671536, + "grad_norm": 1.075327754020691, + "learning_rate": 0.00019804776233729444, + "loss": 0.8866, + "step": 1568 + }, + { + "epoch": 0.5037726761920052, + "grad_norm": 1.3174536228179932, + "learning_rate": 0.0001980408926579596, + "loss": 0.9901, + "step": 1569 + }, + { + "epoch": 0.5040937550168566, + "grad_norm": 1.03889000415802, + "learning_rate": 0.00019803401103263933, + "loss": 0.9264, + "step": 1570 + }, + { + "epoch": 0.5044148338417082, + "grad_norm": 1.4712145328521729, + "learning_rate": 0.00019802711746217218, + "loss": 1.113, + "step": 1571 + }, + { + "epoch": 0.5047359126665596, + "grad_norm": 3.3879761695861816, + "learning_rate": 0.00019802021194739814, + "loss": 0.9667, + "step": 1572 + }, + { + "epoch": 0.5050569914914111, + "grad_norm": 1.116847276687622, + "learning_rate": 0.00019801329448915862, + "loss": 1.0384, + "step": 1573 + }, + { + "epoch": 0.5053780703162627, + "grad_norm": 1.0987874269485474, + "learning_rate": 0.00019800636508829643, + "loss": 1.0302, + "step": 1574 + }, + { + "epoch": 0.5056991491411141, + "grad_norm": 1.6565288305282593, + "learning_rate": 0.00019799942374565597, + "loss": 1.0317, + "step": 1575 + }, + { + "epoch": 0.5060202279659657, + "grad_norm": 1.2395991086959839, + "learning_rate": 0.00019799247046208297, + "loss": 1.1209, + "step": 1576 + }, + { + "epoch": 0.5063413067908171, + "grad_norm": 1.0488249063491821, + "learning_rate": 0.0001979855052384247, + "loss": 0.9093, + "step": 1577 + }, + { + "epoch": 0.5066623856156687, + "grad_norm": 1.3731625080108643, + "learning_rate": 0.00019797852807552983, + "loss": 0.9188, + "step": 1578 + }, + { + "epoch": 0.5069834644405201, + "grad_norm": 2.111989974975586, + "learning_rate": 0.00019797153897424852, + "loss": 1.098, + "step": 1579 + }, + { + "epoch": 0.5073045432653717, + "grad_norm": 1.2938238382339478, + "learning_rate": 0.00019796453793543238, + "loss": 0.8608, + "step": 1580 + }, + { + "epoch": 0.5076256220902231, + "grad_norm": 1.4208569526672363, + "learning_rate": 0.0001979575249599344, + "loss": 1.0857, + "step": 1581 + }, + { + "epoch": 0.5079467009150747, + "grad_norm": 1.5202668905258179, + "learning_rate": 0.00019795050004860917, + "loss": 1.2191, + "step": 1582 + }, + { + "epoch": 0.5082677797399262, + "grad_norm": 0.9358740448951721, + "learning_rate": 0.00019794346320231265, + "loss": 0.9756, + "step": 1583 + }, + { + "epoch": 0.5085888585647776, + "grad_norm": 1.4582595825195312, + "learning_rate": 0.00019793641442190221, + "loss": 1.0251, + "step": 1584 + }, + { + "epoch": 0.5089099373896292, + "grad_norm": 1.3542490005493164, + "learning_rate": 0.00019792935370823675, + "loss": 1.0413, + "step": 1585 + }, + { + "epoch": 0.5092310162144806, + "grad_norm": 0.8536126017570496, + "learning_rate": 0.00019792228106217658, + "loss": 0.7827, + "step": 1586 + }, + { + "epoch": 0.5095520950393322, + "grad_norm": 1.294661521911621, + "learning_rate": 0.00019791519648458352, + "loss": 0.8979, + "step": 1587 + }, + { + "epoch": 0.5098731738641836, + "grad_norm": 1.0872153043746948, + "learning_rate": 0.00019790809997632076, + "loss": 0.8247, + "step": 1588 + }, + { + "epoch": 0.5101942526890352, + "grad_norm": 1.6541205644607544, + "learning_rate": 0.000197900991538253, + "loss": 1.0276, + "step": 1589 + }, + { + "epoch": 0.5105153315138866, + "grad_norm": 1.2755903005599976, + "learning_rate": 0.00019789387117124637, + "loss": 0.9243, + "step": 1590 + }, + { + "epoch": 0.5108364103387382, + "grad_norm": 1.2465382814407349, + "learning_rate": 0.0001978867388761685, + "loss": 0.9222, + "step": 1591 + }, + { + "epoch": 0.5111574891635896, + "grad_norm": 0.9154629707336426, + "learning_rate": 0.00019787959465388842, + "loss": 0.8289, + "step": 1592 + }, + { + "epoch": 0.5114785679884412, + "grad_norm": 1.7428443431854248, + "learning_rate": 0.00019787243850527664, + "loss": 1.0268, + "step": 1593 + }, + { + "epoch": 0.5117996468132927, + "grad_norm": 1.0472004413604736, + "learning_rate": 0.0001978652704312051, + "loss": 0.8157, + "step": 1594 + }, + { + "epoch": 0.5121207256381441, + "grad_norm": 1.6456936597824097, + "learning_rate": 0.00019785809043254722, + "loss": 0.9551, + "step": 1595 + }, + { + "epoch": 0.5124418044629957, + "grad_norm": 1.1858004331588745, + "learning_rate": 0.00019785089851017787, + "loss": 0.9122, + "step": 1596 + }, + { + "epoch": 0.5127628832878471, + "grad_norm": 1.2532726526260376, + "learning_rate": 0.0001978436946649733, + "loss": 0.8158, + "step": 1597 + }, + { + "epoch": 0.5130839621126987, + "grad_norm": 0.9633054733276367, + "learning_rate": 0.00019783647889781136, + "loss": 0.7278, + "step": 1598 + }, + { + "epoch": 0.5134050409375501, + "grad_norm": 1.5852664709091187, + "learning_rate": 0.00019782925120957124, + "loss": 0.8357, + "step": 1599 + }, + { + "epoch": 0.5137261197624017, + "grad_norm": 0.9574192762374878, + "learning_rate": 0.0001978220116011336, + "loss": 0.5688, + "step": 1600 + }, + { + "epoch": 0.5140471985872531, + "grad_norm": 1.5146691799163818, + "learning_rate": 0.00019781476007338058, + "loss": 1.3743, + "step": 1601 + }, + { + "epoch": 0.5143682774121047, + "grad_norm": 1.5694231986999512, + "learning_rate": 0.00019780749662719573, + "loss": 1.6016, + "step": 1602 + }, + { + "epoch": 0.5146893562369562, + "grad_norm": 1.2235115766525269, + "learning_rate": 0.0001978002212634641, + "loss": 1.0506, + "step": 1603 + }, + { + "epoch": 0.5150104350618077, + "grad_norm": 1.2383902072906494, + "learning_rate": 0.0001977929339830722, + "loss": 0.7889, + "step": 1604 + }, + { + "epoch": 0.5153315138866592, + "grad_norm": 1.5607134103775024, + "learning_rate": 0.0001977856347869079, + "loss": 0.9399, + "step": 1605 + }, + { + "epoch": 0.5156525927115106, + "grad_norm": 1.2659744024276733, + "learning_rate": 0.00019777832367586063, + "loss": 0.7502, + "step": 1606 + }, + { + "epoch": 0.5159736715363622, + "grad_norm": 1.645847201347351, + "learning_rate": 0.00019777100065082118, + "loss": 1.0366, + "step": 1607 + }, + { + "epoch": 0.5162947503612136, + "grad_norm": 1.4110909700393677, + "learning_rate": 0.00019776366571268192, + "loss": 1.1166, + "step": 1608 + }, + { + "epoch": 0.5166158291860652, + "grad_norm": 1.3182811737060547, + "learning_rate": 0.00019775631886233654, + "loss": 1.0907, + "step": 1609 + }, + { + "epoch": 0.5169369080109166, + "grad_norm": 1.0148051977157593, + "learning_rate": 0.0001977489601006802, + "loss": 0.7937, + "step": 1610 + }, + { + "epoch": 0.5172579868357682, + "grad_norm": 1.499527096748352, + "learning_rate": 0.0001977415894286096, + "loss": 0.8182, + "step": 1611 + }, + { + "epoch": 0.5175790656606197, + "grad_norm": 1.6975308656692505, + "learning_rate": 0.0001977342068470228, + "loss": 0.9804, + "step": 1612 + }, + { + "epoch": 0.5179001444854712, + "grad_norm": 1.4909874200820923, + "learning_rate": 0.00019772681235681936, + "loss": 1.088, + "step": 1613 + }, + { + "epoch": 0.5182212233103227, + "grad_norm": 1.8054780960083008, + "learning_rate": 0.00019771940595890027, + "loss": 0.8292, + "step": 1614 + }, + { + "epoch": 0.5185423021351742, + "grad_norm": 1.4584299325942993, + "learning_rate": 0.000197711987654168, + "loss": 1.0235, + "step": 1615 + }, + { + "epoch": 0.5188633809600257, + "grad_norm": 1.4883002042770386, + "learning_rate": 0.0001977045574435264, + "loss": 0.9593, + "step": 1616 + }, + { + "epoch": 0.5191844597848772, + "grad_norm": 1.0827285051345825, + "learning_rate": 0.00019769711532788083, + "loss": 0.9392, + "step": 1617 + }, + { + "epoch": 0.5195055386097287, + "grad_norm": 1.358249306678772, + "learning_rate": 0.0001976896613081381, + "loss": 1.1373, + "step": 1618 + }, + { + "epoch": 0.5198266174345801, + "grad_norm": 1.4473522901535034, + "learning_rate": 0.0001976821953852065, + "loss": 0.9981, + "step": 1619 + }, + { + "epoch": 0.5201476962594317, + "grad_norm": 1.2284332513809204, + "learning_rate": 0.0001976747175599957, + "loss": 0.9899, + "step": 1620 + }, + { + "epoch": 0.5204687750842832, + "grad_norm": 1.0846779346466064, + "learning_rate": 0.0001976672278334168, + "loss": 1.0516, + "step": 1621 + }, + { + "epoch": 0.5207898539091347, + "grad_norm": 1.5218236446380615, + "learning_rate": 0.00019765972620638248, + "loss": 0.7433, + "step": 1622 + }, + { + "epoch": 0.5211109327339862, + "grad_norm": 1.1767224073410034, + "learning_rate": 0.00019765221267980675, + "loss": 0.9726, + "step": 1623 + }, + { + "epoch": 0.5214320115588377, + "grad_norm": 1.486958384513855, + "learning_rate": 0.00019764468725460508, + "loss": 0.8919, + "step": 1624 + }, + { + "epoch": 0.5217530903836892, + "grad_norm": 1.7597538232803345, + "learning_rate": 0.00019763714993169452, + "loss": 0.8817, + "step": 1625 + }, + { + "epoch": 0.5220741692085407, + "grad_norm": 1.96730637550354, + "learning_rate": 0.00019762960071199333, + "loss": 1.0347, + "step": 1626 + }, + { + "epoch": 0.5223952480333922, + "grad_norm": 1.458665370941162, + "learning_rate": 0.0001976220395964215, + "loss": 0.9896, + "step": 1627 + }, + { + "epoch": 0.5227163268582437, + "grad_norm": 1.2586889266967773, + "learning_rate": 0.00019761446658590024, + "loss": 0.9721, + "step": 1628 + }, + { + "epoch": 0.5230374056830952, + "grad_norm": 1.374340295791626, + "learning_rate": 0.00019760688168135232, + "loss": 1.1315, + "step": 1629 + }, + { + "epoch": 0.5233584845079468, + "grad_norm": 1.393900990486145, + "learning_rate": 0.00019759928488370193, + "loss": 1.0165, + "step": 1630 + }, + { + "epoch": 0.5236795633327982, + "grad_norm": 1.0543795824050903, + "learning_rate": 0.00019759167619387476, + "loss": 0.8592, + "step": 1631 + }, + { + "epoch": 0.5240006421576497, + "grad_norm": 0.9907365441322327, + "learning_rate": 0.00019758405561279784, + "loss": 1.095, + "step": 1632 + }, + { + "epoch": 0.5243217209825012, + "grad_norm": 0.9849667549133301, + "learning_rate": 0.00019757642314139977, + "loss": 0.8316, + "step": 1633 + }, + { + "epoch": 0.5246427998073527, + "grad_norm": 1.4609546661376953, + "learning_rate": 0.00019756877878061052, + "loss": 0.8168, + "step": 1634 + }, + { + "epoch": 0.5249638786322042, + "grad_norm": 1.2530486583709717, + "learning_rate": 0.0001975611225313615, + "loss": 0.922, + "step": 1635 + }, + { + "epoch": 0.5252849574570557, + "grad_norm": 1.1039314270019531, + "learning_rate": 0.00019755345439458565, + "loss": 0.9009, + "step": 1636 + }, + { + "epoch": 0.5256060362819072, + "grad_norm": 1.1519142389297485, + "learning_rate": 0.00019754577437121733, + "loss": 1.0119, + "step": 1637 + }, + { + "epoch": 0.5259271151067587, + "grad_norm": 1.89316725730896, + "learning_rate": 0.00019753808246219224, + "loss": 0.9892, + "step": 1638 + }, + { + "epoch": 0.5262481939316103, + "grad_norm": 1.0566115379333496, + "learning_rate": 0.00019753037866844771, + "loss": 0.8139, + "step": 1639 + }, + { + "epoch": 0.5265692727564617, + "grad_norm": 1.0403966903686523, + "learning_rate": 0.00019752266299092236, + "loss": 0.9116, + "step": 1640 + }, + { + "epoch": 0.5268903515813133, + "grad_norm": 1.4061709642410278, + "learning_rate": 0.00019751493543055632, + "loss": 1.0763, + "step": 1641 + }, + { + "epoch": 0.5272114304061647, + "grad_norm": 1.2321988344192505, + "learning_rate": 0.00019750719598829122, + "loss": 0.8309, + "step": 1642 + }, + { + "epoch": 0.5275325092310162, + "grad_norm": 1.538996696472168, + "learning_rate": 0.00019749944466507008, + "loss": 1.1677, + "step": 1643 + }, + { + "epoch": 0.5278535880558677, + "grad_norm": 1.1386488676071167, + "learning_rate": 0.00019749168146183731, + "loss": 0.9062, + "step": 1644 + }, + { + "epoch": 0.5281746668807192, + "grad_norm": 1.7029637098312378, + "learning_rate": 0.0001974839063795389, + "loss": 0.898, + "step": 1645 + }, + { + "epoch": 0.5284957457055707, + "grad_norm": 1.1630817651748657, + "learning_rate": 0.0001974761194191222, + "loss": 0.9107, + "step": 1646 + }, + { + "epoch": 0.5288168245304222, + "grad_norm": 0.9628080725669861, + "learning_rate": 0.00019746832058153602, + "loss": 0.6659, + "step": 1647 + }, + { + "epoch": 0.5291379033552737, + "grad_norm": 0.8993632197380066, + "learning_rate": 0.0001974605098677306, + "loss": 0.8283, + "step": 1648 + }, + { + "epoch": 0.5294589821801252, + "grad_norm": 3.56905198097229, + "learning_rate": 0.00019745268727865774, + "loss": 0.6945, + "step": 1649 + }, + { + "epoch": 0.5297800610049768, + "grad_norm": 1.1273267269134521, + "learning_rate": 0.00019744485281527049, + "loss": 0.5489, + "step": 1650 + }, + { + "epoch": 0.5301011398298282, + "grad_norm": 1.4737544059753418, + "learning_rate": 0.00019743700647852354, + "loss": 1.4845, + "step": 1651 + }, + { + "epoch": 0.5304222186546798, + "grad_norm": 1.4742531776428223, + "learning_rate": 0.00019742914826937288, + "loss": 1.327, + "step": 1652 + }, + { + "epoch": 0.5307432974795312, + "grad_norm": 1.2791469097137451, + "learning_rate": 0.00019742127818877606, + "loss": 1.2235, + "step": 1653 + }, + { + "epoch": 0.5310643763043827, + "grad_norm": 1.511093258857727, + "learning_rate": 0.000197413396237692, + "loss": 0.8866, + "step": 1654 + }, + { + "epoch": 0.5313854551292342, + "grad_norm": 2.150474786758423, + "learning_rate": 0.00019740550241708108, + "loss": 0.9557, + "step": 1655 + }, + { + "epoch": 0.5317065339540857, + "grad_norm": 1.3363398313522339, + "learning_rate": 0.0001973975967279052, + "loss": 0.6729, + "step": 1656 + }, + { + "epoch": 0.5320276127789372, + "grad_norm": 1.4328546524047852, + "learning_rate": 0.0001973896791711275, + "loss": 0.6497, + "step": 1657 + }, + { + "epoch": 0.5323486916037887, + "grad_norm": 1.1948676109313965, + "learning_rate": 0.0001973817497477129, + "loss": 0.6963, + "step": 1658 + }, + { + "epoch": 0.5326697704286403, + "grad_norm": 1.3898074626922607, + "learning_rate": 0.00019737380845862745, + "loss": 1.0177, + "step": 1659 + }, + { + "epoch": 0.5329908492534917, + "grad_norm": 1.250869631767273, + "learning_rate": 0.0001973658553048388, + "loss": 0.9914, + "step": 1660 + }, + { + "epoch": 0.5333119280783433, + "grad_norm": 1.9206534624099731, + "learning_rate": 0.00019735789028731604, + "loss": 0.9322, + "step": 1661 + }, + { + "epoch": 0.5336330069031947, + "grad_norm": 1.8212529420852661, + "learning_rate": 0.00019734991340702966, + "loss": 1.1671, + "step": 1662 + }, + { + "epoch": 0.5339540857280463, + "grad_norm": 1.4000660181045532, + "learning_rate": 0.00019734192466495162, + "loss": 1.0022, + "step": 1663 + }, + { + "epoch": 0.5342751645528977, + "grad_norm": 0.9797595143318176, + "learning_rate": 0.0001973339240620553, + "loss": 0.8335, + "step": 1664 + }, + { + "epoch": 0.5345962433777492, + "grad_norm": 1.2020220756530762, + "learning_rate": 0.0001973259115993156, + "loss": 0.9649, + "step": 1665 + }, + { + "epoch": 0.5349173222026007, + "grad_norm": 1.1489737033843994, + "learning_rate": 0.00019731788727770885, + "loss": 0.9856, + "step": 1666 + }, + { + "epoch": 0.5352384010274522, + "grad_norm": 1.017920970916748, + "learning_rate": 0.00019730985109821266, + "loss": 0.899, + "step": 1667 + }, + { + "epoch": 0.5355594798523038, + "grad_norm": 0.8471775054931641, + "learning_rate": 0.0001973018030618063, + "loss": 0.88, + "step": 1668 + }, + { + "epoch": 0.5358805586771552, + "grad_norm": 1.4469013214111328, + "learning_rate": 0.0001972937431694704, + "loss": 1.0509, + "step": 1669 + }, + { + "epoch": 0.5362016375020068, + "grad_norm": 1.3785357475280762, + "learning_rate": 0.00019728567142218703, + "loss": 1.0469, + "step": 1670 + }, + { + "epoch": 0.5365227163268582, + "grad_norm": 1.0268610715866089, + "learning_rate": 0.00019727758782093967, + "loss": 1.0102, + "step": 1671 + }, + { + "epoch": 0.5368437951517098, + "grad_norm": 1.2795389890670776, + "learning_rate": 0.00019726949236671332, + "loss": 1.0537, + "step": 1672 + }, + { + "epoch": 0.5371648739765612, + "grad_norm": 3.1554412841796875, + "learning_rate": 0.00019726138506049438, + "loss": 0.9041, + "step": 1673 + }, + { + "epoch": 0.5374859528014128, + "grad_norm": 1.8737775087356567, + "learning_rate": 0.00019725326590327066, + "loss": 1.0927, + "step": 1674 + }, + { + "epoch": 0.5378070316262642, + "grad_norm": 1.8332324028015137, + "learning_rate": 0.00019724513489603155, + "loss": 1.0715, + "step": 1675 + }, + { + "epoch": 0.5381281104511157, + "grad_norm": 1.0385593175888062, + "learning_rate": 0.00019723699203976766, + "loss": 0.8541, + "step": 1676 + }, + { + "epoch": 0.5384491892759673, + "grad_norm": 1.0988675355911255, + "learning_rate": 0.00019722883733547128, + "loss": 0.7933, + "step": 1677 + }, + { + "epoch": 0.5387702681008187, + "grad_norm": 1.472822666168213, + "learning_rate": 0.00019722067078413599, + "loss": 0.9275, + "step": 1678 + }, + { + "epoch": 0.5390913469256703, + "grad_norm": 1.4651199579238892, + "learning_rate": 0.00019721249238675688, + "loss": 1.2088, + "step": 1679 + }, + { + "epoch": 0.5394124257505217, + "grad_norm": 1.5309169292449951, + "learning_rate": 0.00019720430214433042, + "loss": 1.0605, + "step": 1680 + }, + { + "epoch": 0.5397335045753733, + "grad_norm": 1.445311427116394, + "learning_rate": 0.00019719610005785465, + "loss": 1.0896, + "step": 1681 + }, + { + "epoch": 0.5400545834002247, + "grad_norm": 1.6236193180084229, + "learning_rate": 0.00019718788612832887, + "loss": 1.0223, + "step": 1682 + }, + { + "epoch": 0.5403756622250763, + "grad_norm": 1.451526165008545, + "learning_rate": 0.00019717966035675397, + "loss": 0.8866, + "step": 1683 + }, + { + "epoch": 0.5406967410499277, + "grad_norm": 1.2759616374969482, + "learning_rate": 0.00019717142274413223, + "loss": 1.0889, + "step": 1684 + }, + { + "epoch": 0.5410178198747793, + "grad_norm": 1.0535154342651367, + "learning_rate": 0.0001971631732914674, + "loss": 0.9059, + "step": 1685 + }, + { + "epoch": 0.5413388986996308, + "grad_norm": 1.0445899963378906, + "learning_rate": 0.0001971549119997646, + "loss": 0.9921, + "step": 1686 + }, + { + "epoch": 0.5416599775244823, + "grad_norm": 1.2191705703735352, + "learning_rate": 0.00019714663887003054, + "loss": 1.0312, + "step": 1687 + }, + { + "epoch": 0.5419810563493338, + "grad_norm": 1.2853853702545166, + "learning_rate": 0.00019713835390327316, + "loss": 1.0743, + "step": 1688 + }, + { + "epoch": 0.5423021351741852, + "grad_norm": 1.3771874904632568, + "learning_rate": 0.000197130057100502, + "loss": 1.033, + "step": 1689 + }, + { + "epoch": 0.5426232139990368, + "grad_norm": 1.2575607299804688, + "learning_rate": 0.00019712174846272805, + "loss": 0.94, + "step": 1690 + }, + { + "epoch": 0.5429442928238882, + "grad_norm": 1.1774635314941406, + "learning_rate": 0.00019711342799096361, + "loss": 0.9817, + "step": 1691 + }, + { + "epoch": 0.5432653716487398, + "grad_norm": 1.3301347494125366, + "learning_rate": 0.00019710509568622258, + "loss": 1.0584, + "step": 1692 + }, + { + "epoch": 0.5435864504735912, + "grad_norm": 1.1408923864364624, + "learning_rate": 0.00019709675154952017, + "loss": 0.9083, + "step": 1693 + }, + { + "epoch": 0.5439075292984428, + "grad_norm": 1.0724045038223267, + "learning_rate": 0.0001970883955818731, + "loss": 0.7918, + "step": 1694 + }, + { + "epoch": 0.5442286081232943, + "grad_norm": 0.9542874693870544, + "learning_rate": 0.00019708002778429955, + "loss": 0.9377, + "step": 1695 + }, + { + "epoch": 0.5445496869481458, + "grad_norm": 1.1520448923110962, + "learning_rate": 0.00019707164815781908, + "loss": 0.6503, + "step": 1696 + }, + { + "epoch": 0.5448707657729973, + "grad_norm": 1.0709972381591797, + "learning_rate": 0.00019706325670345275, + "loss": 0.905, + "step": 1697 + }, + { + "epoch": 0.5451918445978488, + "grad_norm": 0.7395738959312439, + "learning_rate": 0.000197054853422223, + "loss": 0.6726, + "step": 1698 + }, + { + "epoch": 0.5455129234227003, + "grad_norm": 1.2998228073120117, + "learning_rate": 0.00019704643831515374, + "loss": 0.8171, + "step": 1699 + }, + { + "epoch": 0.5458340022475517, + "grad_norm": 1.2186784744262695, + "learning_rate": 0.00019703801138327038, + "loss": 0.6297, + "step": 1700 + }, + { + "epoch": 0.5461550810724033, + "grad_norm": 1.2121597528457642, + "learning_rate": 0.00019702957262759965, + "loss": 1.4739, + "step": 1701 + }, + { + "epoch": 0.5464761598972547, + "grad_norm": 1.483771800994873, + "learning_rate": 0.00019702112204916984, + "loss": 1.3041, + "step": 1702 + }, + { + "epoch": 0.5467972387221063, + "grad_norm": 1.4804165363311768, + "learning_rate": 0.0001970126596490106, + "loss": 0.8875, + "step": 1703 + }, + { + "epoch": 0.5471183175469578, + "grad_norm": 1.211633324623108, + "learning_rate": 0.00019700418542815306, + "loss": 0.8732, + "step": 1704 + }, + { + "epoch": 0.5474393963718093, + "grad_norm": 1.4509685039520264, + "learning_rate": 0.00019699569938762973, + "loss": 0.7624, + "step": 1705 + }, + { + "epoch": 0.5477604751966608, + "grad_norm": 1.4379287958145142, + "learning_rate": 0.00019698720152847468, + "loss": 0.6996, + "step": 1706 + }, + { + "epoch": 0.5480815540215123, + "grad_norm": 1.7644935846328735, + "learning_rate": 0.00019697869185172331, + "loss": 1.1261, + "step": 1707 + }, + { + "epoch": 0.5484026328463638, + "grad_norm": 1.4107887744903564, + "learning_rate": 0.00019697017035841252, + "loss": 1.1591, + "step": 1708 + }, + { + "epoch": 0.5487237116712153, + "grad_norm": 1.2489043474197388, + "learning_rate": 0.0001969616370495806, + "loss": 0.8075, + "step": 1709 + }, + { + "epoch": 0.5490447904960668, + "grad_norm": 1.4318604469299316, + "learning_rate": 0.00019695309192626734, + "loss": 0.9552, + "step": 1710 + }, + { + "epoch": 0.5493658693209182, + "grad_norm": 1.043524146080017, + "learning_rate": 0.0001969445349895139, + "loss": 0.7759, + "step": 1711 + }, + { + "epoch": 0.5496869481457698, + "grad_norm": 1.1414694786071777, + "learning_rate": 0.00019693596624036292, + "loss": 1.0484, + "step": 1712 + }, + { + "epoch": 0.5500080269706212, + "grad_norm": 1.1262508630752563, + "learning_rate": 0.00019692738567985853, + "loss": 0.924, + "step": 1713 + }, + { + "epoch": 0.5503291057954728, + "grad_norm": 1.4722039699554443, + "learning_rate": 0.0001969187933090462, + "loss": 0.7755, + "step": 1714 + }, + { + "epoch": 0.5506501846203243, + "grad_norm": 1.0937949419021606, + "learning_rate": 0.00019691018912897286, + "loss": 0.8581, + "step": 1715 + }, + { + "epoch": 0.5509712634451758, + "grad_norm": 1.4399571418762207, + "learning_rate": 0.00019690157314068696, + "loss": 0.9408, + "step": 1716 + }, + { + "epoch": 0.5512923422700273, + "grad_norm": 1.3758063316345215, + "learning_rate": 0.0001968929453452383, + "loss": 1.0267, + "step": 1717 + }, + { + "epoch": 0.5516134210948788, + "grad_norm": 1.2351738214492798, + "learning_rate": 0.00019688430574367819, + "loss": 1.1119, + "step": 1718 + }, + { + "epoch": 0.5519344999197303, + "grad_norm": 0.9020370244979858, + "learning_rate": 0.00019687565433705926, + "loss": 0.8939, + "step": 1719 + }, + { + "epoch": 0.5522555787445818, + "grad_norm": 1.680139183998108, + "learning_rate": 0.00019686699112643572, + "loss": 1.1864, + "step": 1720 + }, + { + "epoch": 0.5525766575694333, + "grad_norm": 1.5821527242660522, + "learning_rate": 0.0001968583161128631, + "loss": 1.125, + "step": 1721 + }, + { + "epoch": 0.5528977363942847, + "grad_norm": 1.2951277494430542, + "learning_rate": 0.00019684962929739853, + "loss": 1.0259, + "step": 1722 + }, + { + "epoch": 0.5532188152191363, + "grad_norm": 1.182592749595642, + "learning_rate": 0.00019684093068110038, + "loss": 1.1474, + "step": 1723 + }, + { + "epoch": 0.5535398940439878, + "grad_norm": 1.3489532470703125, + "learning_rate": 0.00019683222026502858, + "loss": 0.774, + "step": 1724 + }, + { + "epoch": 0.5538609728688393, + "grad_norm": 1.1987816095352173, + "learning_rate": 0.00019682349805024446, + "loss": 1.1013, + "step": 1725 + }, + { + "epoch": 0.5541820516936908, + "grad_norm": 1.347397804260254, + "learning_rate": 0.0001968147640378108, + "loss": 1.1095, + "step": 1726 + }, + { + "epoch": 0.5545031305185423, + "grad_norm": 1.4341198205947876, + "learning_rate": 0.00019680601822879182, + "loss": 1.2611, + "step": 1727 + }, + { + "epoch": 0.5548242093433938, + "grad_norm": 1.5567268133163452, + "learning_rate": 0.00019679726062425316, + "loss": 1.0499, + "step": 1728 + }, + { + "epoch": 0.5551452881682453, + "grad_norm": 0.9341502785682678, + "learning_rate": 0.00019678849122526187, + "loss": 0.9607, + "step": 1729 + }, + { + "epoch": 0.5554663669930968, + "grad_norm": 1.2237082719802856, + "learning_rate": 0.00019677971003288655, + "loss": 1.0517, + "step": 1730 + }, + { + "epoch": 0.5557874458179483, + "grad_norm": 0.9689716696739197, + "learning_rate": 0.00019677091704819715, + "loss": 1.0166, + "step": 1731 + }, + { + "epoch": 0.5561085246427998, + "grad_norm": 1.32294762134552, + "learning_rate": 0.000196762112272265, + "loss": 0.9483, + "step": 1732 + }, + { + "epoch": 0.5564296034676514, + "grad_norm": 1.5692658424377441, + "learning_rate": 0.00019675329570616298, + "loss": 0.8919, + "step": 1733 + }, + { + "epoch": 0.5567506822925028, + "grad_norm": 1.022190809249878, + "learning_rate": 0.0001967444673509654, + "loss": 0.7758, + "step": 1734 + }, + { + "epoch": 0.5570717611173543, + "grad_norm": 0.9672703742980957, + "learning_rate": 0.00019673562720774792, + "loss": 0.9181, + "step": 1735 + }, + { + "epoch": 0.5573928399422058, + "grad_norm": 1.6587204933166504, + "learning_rate": 0.0001967267752775877, + "loss": 0.8566, + "step": 1736 + }, + { + "epoch": 0.5577139187670573, + "grad_norm": 1.575669765472412, + "learning_rate": 0.0001967179115615633, + "loss": 0.9348, + "step": 1737 + }, + { + "epoch": 0.5580349975919088, + "grad_norm": 1.2742935419082642, + "learning_rate": 0.00019670903606075474, + "loss": 0.6976, + "step": 1738 + }, + { + "epoch": 0.5583560764167603, + "grad_norm": 2.927837610244751, + "learning_rate": 0.00019670014877624353, + "loss": 0.9914, + "step": 1739 + }, + { + "epoch": 0.5586771552416118, + "grad_norm": 1.1376343965530396, + "learning_rate": 0.00019669124970911247, + "loss": 0.6877, + "step": 1740 + }, + { + "epoch": 0.5589982340664633, + "grad_norm": 1.5874276161193848, + "learning_rate": 0.00019668233886044597, + "loss": 1.0347, + "step": 1741 + }, + { + "epoch": 0.5593193128913149, + "grad_norm": 1.3938146829605103, + "learning_rate": 0.0001966734162313297, + "loss": 0.8869, + "step": 1742 + }, + { + "epoch": 0.5596403917161663, + "grad_norm": 2.021955728530884, + "learning_rate": 0.00019666448182285094, + "loss": 0.9766, + "step": 1743 + }, + { + "epoch": 0.5599614705410179, + "grad_norm": 1.3332651853561401, + "learning_rate": 0.00019665553563609825, + "loss": 0.9548, + "step": 1744 + }, + { + "epoch": 0.5602825493658693, + "grad_norm": 1.0142797231674194, + "learning_rate": 0.00019664657767216176, + "loss": 0.8287, + "step": 1745 + }, + { + "epoch": 0.5606036281907208, + "grad_norm": 1.369992733001709, + "learning_rate": 0.00019663760793213296, + "loss": 1.0265, + "step": 1746 + }, + { + "epoch": 0.5609247070155723, + "grad_norm": 1.5672804117202759, + "learning_rate": 0.0001966286264171047, + "loss": 0.7512, + "step": 1747 + }, + { + "epoch": 0.5612457858404238, + "grad_norm": 1.7840815782546997, + "learning_rate": 0.00019661963312817148, + "loss": 0.985, + "step": 1748 + }, + { + "epoch": 0.5615668646652753, + "grad_norm": 0.9597679972648621, + "learning_rate": 0.00019661062806642903, + "loss": 0.5063, + "step": 1749 + }, + { + "epoch": 0.5618879434901268, + "grad_norm": 1.5060789585113525, + "learning_rate": 0.00019660161123297458, + "loss": 0.5703, + "step": 1750 + }, + { + "epoch": 0.5622090223149784, + "grad_norm": 1.2468721866607666, + "learning_rate": 0.00019659258262890683, + "loss": 1.4691, + "step": 1751 + }, + { + "epoch": 0.5625301011398298, + "grad_norm": 1.2596136331558228, + "learning_rate": 0.00019658354225532589, + "loss": 0.9999, + "step": 1752 + }, + { + "epoch": 0.5628511799646814, + "grad_norm": 1.5096850395202637, + "learning_rate": 0.00019657449011333328, + "loss": 1.04, + "step": 1753 + }, + { + "epoch": 0.5631722587895328, + "grad_norm": 1.6973052024841309, + "learning_rate": 0.00019656542620403203, + "loss": 0.8056, + "step": 1754 + }, + { + "epoch": 0.5634933376143844, + "grad_norm": 1.484620213508606, + "learning_rate": 0.00019655635052852647, + "loss": 0.7742, + "step": 1755 + }, + { + "epoch": 0.5638144164392358, + "grad_norm": 1.2450261116027832, + "learning_rate": 0.0001965472630879225, + "loss": 0.9274, + "step": 1756 + }, + { + "epoch": 0.5641354952640873, + "grad_norm": 1.8217682838439941, + "learning_rate": 0.0001965381638833274, + "loss": 0.8686, + "step": 1757 + }, + { + "epoch": 0.5644565740889388, + "grad_norm": 1.1573996543884277, + "learning_rate": 0.00019652905291584984, + "loss": 0.9089, + "step": 1758 + }, + { + "epoch": 0.5647776529137903, + "grad_norm": 1.1910955905914307, + "learning_rate": 0.0001965199301866, + "loss": 1.1458, + "step": 1759 + }, + { + "epoch": 0.5650987317386419, + "grad_norm": 1.2282229661941528, + "learning_rate": 0.00019651079569668945, + "loss": 0.9382, + "step": 1760 + }, + { + "epoch": 0.5654198105634933, + "grad_norm": 1.6961151361465454, + "learning_rate": 0.00019650164944723115, + "loss": 1.1206, + "step": 1761 + }, + { + "epoch": 0.5657408893883449, + "grad_norm": 1.1346982717514038, + "learning_rate": 0.00019649249143933962, + "loss": 0.9793, + "step": 1762 + }, + { + "epoch": 0.5660619682131963, + "grad_norm": 1.3887813091278076, + "learning_rate": 0.00019648332167413067, + "loss": 0.9796, + "step": 1763 + }, + { + "epoch": 0.5663830470380479, + "grad_norm": 0.7095258831977844, + "learning_rate": 0.0001964741401527217, + "loss": 0.7365, + "step": 1764 + }, + { + "epoch": 0.5667041258628993, + "grad_norm": 1.4679083824157715, + "learning_rate": 0.00019646494687623135, + "loss": 0.8576, + "step": 1765 + }, + { + "epoch": 0.5670252046877509, + "grad_norm": 1.2272289991378784, + "learning_rate": 0.00019645574184577982, + "loss": 0.9592, + "step": 1766 + }, + { + "epoch": 0.5673462835126023, + "grad_norm": 0.9056873917579651, + "learning_rate": 0.00019644652506248874, + "loss": 0.9373, + "step": 1767 + }, + { + "epoch": 0.5676673623374539, + "grad_norm": 1.2756919860839844, + "learning_rate": 0.00019643729652748113, + "loss": 0.9717, + "step": 1768 + }, + { + "epoch": 0.5679884411623053, + "grad_norm": 1.2208895683288574, + "learning_rate": 0.00019642805624188147, + "loss": 1.0495, + "step": 1769 + }, + { + "epoch": 0.5683095199871568, + "grad_norm": 1.9911493062973022, + "learning_rate": 0.00019641880420681566, + "loss": 1.0649, + "step": 1770 + }, + { + "epoch": 0.5686305988120084, + "grad_norm": 1.385181188583374, + "learning_rate": 0.00019640954042341103, + "loss": 1.0059, + "step": 1771 + }, + { + "epoch": 0.5689516776368598, + "grad_norm": 1.642540693283081, + "learning_rate": 0.0001964002648927963, + "loss": 0.924, + "step": 1772 + }, + { + "epoch": 0.5692727564617114, + "grad_norm": 1.2365972995758057, + "learning_rate": 0.00019639097761610174, + "loss": 0.7404, + "step": 1773 + }, + { + "epoch": 0.5695938352865628, + "grad_norm": 1.8943426609039307, + "learning_rate": 0.00019638167859445895, + "loss": 0.9449, + "step": 1774 + }, + { + "epoch": 0.5699149141114144, + "grad_norm": 1.3321115970611572, + "learning_rate": 0.000196372367829001, + "loss": 0.9601, + "step": 1775 + }, + { + "epoch": 0.5702359929362658, + "grad_norm": 1.592492938041687, + "learning_rate": 0.0001963630453208623, + "loss": 0.9763, + "step": 1776 + }, + { + "epoch": 0.5705570717611174, + "grad_norm": 1.3213098049163818, + "learning_rate": 0.00019635371107117888, + "loss": 1.3418, + "step": 1777 + }, + { + "epoch": 0.5708781505859688, + "grad_norm": 1.128093957901001, + "learning_rate": 0.000196344365081088, + "loss": 0.8828, + "step": 1778 + }, + { + "epoch": 0.5711992294108204, + "grad_norm": 1.2909680604934692, + "learning_rate": 0.0001963350073517285, + "loss": 1.1179, + "step": 1779 + }, + { + "epoch": 0.5715203082356719, + "grad_norm": 1.7713595628738403, + "learning_rate": 0.00019632563788424053, + "loss": 1.0826, + "step": 1780 + }, + { + "epoch": 0.5718413870605233, + "grad_norm": 1.069727897644043, + "learning_rate": 0.00019631625667976583, + "loss": 0.659, + "step": 1781 + }, + { + "epoch": 0.5721624658853749, + "grad_norm": 1.3080742359161377, + "learning_rate": 0.00019630686373944738, + "loss": 0.8356, + "step": 1782 + }, + { + "epoch": 0.5724835447102263, + "grad_norm": 1.5017932653427124, + "learning_rate": 0.0001962974590644297, + "loss": 0.9553, + "step": 1783 + }, + { + "epoch": 0.5728046235350779, + "grad_norm": 2.8345601558685303, + "learning_rate": 0.00019628804265585877, + "loss": 0.984, + "step": 1784 + }, + { + "epoch": 0.5731257023599293, + "grad_norm": 0.9238687753677368, + "learning_rate": 0.00019627861451488189, + "loss": 0.8865, + "step": 1785 + }, + { + "epoch": 0.5734467811847809, + "grad_norm": 1.3167964220046997, + "learning_rate": 0.0001962691746426479, + "loss": 0.8951, + "step": 1786 + }, + { + "epoch": 0.5737678600096323, + "grad_norm": 0.8535014986991882, + "learning_rate": 0.00019625972304030697, + "loss": 0.7118, + "step": 1787 + }, + { + "epoch": 0.5740889388344839, + "grad_norm": 1.6030869483947754, + "learning_rate": 0.00019625025970901078, + "loss": 0.9116, + "step": 1788 + }, + { + "epoch": 0.5744100176593354, + "grad_norm": 1.7325750589370728, + "learning_rate": 0.0001962407846499124, + "loss": 0.9291, + "step": 1789 + }, + { + "epoch": 0.5747310964841869, + "grad_norm": 1.3394780158996582, + "learning_rate": 0.00019623129786416635, + "loss": 0.9792, + "step": 1790 + }, + { + "epoch": 0.5750521753090384, + "grad_norm": 1.3643033504486084, + "learning_rate": 0.00019622179935292855, + "loss": 0.8923, + "step": 1791 + }, + { + "epoch": 0.5753732541338898, + "grad_norm": 1.2005494832992554, + "learning_rate": 0.00019621228911735636, + "loss": 1.0839, + "step": 1792 + }, + { + "epoch": 0.5756943329587414, + "grad_norm": 1.9481810331344604, + "learning_rate": 0.0001962027671586086, + "loss": 1.0381, + "step": 1793 + }, + { + "epoch": 0.5760154117835928, + "grad_norm": 1.1878541707992554, + "learning_rate": 0.00019619323347784548, + "loss": 1.005, + "step": 1794 + }, + { + "epoch": 0.5763364906084444, + "grad_norm": 1.2770967483520508, + "learning_rate": 0.00019618368807622862, + "loss": 1.008, + "step": 1795 + }, + { + "epoch": 0.5766575694332958, + "grad_norm": 0.9496734738349915, + "learning_rate": 0.00019617413095492114, + "loss": 0.7313, + "step": 1796 + }, + { + "epoch": 0.5769786482581474, + "grad_norm": 1.6152045726776123, + "learning_rate": 0.00019616456211508752, + "loss": 0.9678, + "step": 1797 + }, + { + "epoch": 0.5772997270829989, + "grad_norm": 1.1719512939453125, + "learning_rate": 0.0001961549815578937, + "loss": 0.735, + "step": 1798 + }, + { + "epoch": 0.5776208059078504, + "grad_norm": 0.9691303372383118, + "learning_rate": 0.0001961453892845071, + "loss": 0.6029, + "step": 1799 + }, + { + "epoch": 0.5779418847327019, + "grad_norm": 1.4163908958435059, + "learning_rate": 0.0001961357852960964, + "loss": 0.6984, + "step": 1800 + }, + { + "epoch": 0.5782629635575534, + "grad_norm": 2.1804752349853516, + "learning_rate": 0.0001961261695938319, + "loss": 1.5497, + "step": 1801 + }, + { + "epoch": 0.5785840423824049, + "grad_norm": 1.3419125080108643, + "learning_rate": 0.0001961165421788852, + "loss": 1.3835, + "step": 1802 + }, + { + "epoch": 0.5789051212072563, + "grad_norm": 1.8257540464401245, + "learning_rate": 0.0001961069030524294, + "loss": 0.9603, + "step": 1803 + }, + { + "epoch": 0.5792262000321079, + "grad_norm": 1.763144612312317, + "learning_rate": 0.00019609725221563897, + "loss": 0.9591, + "step": 1804 + }, + { + "epoch": 0.5795472788569593, + "grad_norm": 1.1090623140335083, + "learning_rate": 0.00019608758966968988, + "loss": 0.8613, + "step": 1805 + }, + { + "epoch": 0.5798683576818109, + "grad_norm": 1.6234127283096313, + "learning_rate": 0.00019607791541575943, + "loss": 0.8793, + "step": 1806 + }, + { + "epoch": 0.5801894365066624, + "grad_norm": 1.1503936052322388, + "learning_rate": 0.0001960682294550264, + "loss": 0.6553, + "step": 1807 + }, + { + "epoch": 0.5805105153315139, + "grad_norm": 1.3944035768508911, + "learning_rate": 0.00019605853178867105, + "loss": 0.682, + "step": 1808 + }, + { + "epoch": 0.5808315941563654, + "grad_norm": 1.1830666065216064, + "learning_rate": 0.00019604882241787498, + "loss": 0.9909, + "step": 1809 + }, + { + "epoch": 0.5811526729812169, + "grad_norm": 1.4636564254760742, + "learning_rate": 0.00019603910134382123, + "loss": 1.0726, + "step": 1810 + }, + { + "epoch": 0.5814737518060684, + "grad_norm": 1.203645944595337, + "learning_rate": 0.0001960293685676943, + "loss": 0.9043, + "step": 1811 + }, + { + "epoch": 0.5817948306309199, + "grad_norm": 1.30366051197052, + "learning_rate": 0.00019601962409068012, + "loss": 0.9162, + "step": 1812 + }, + { + "epoch": 0.5821159094557714, + "grad_norm": 2.1224048137664795, + "learning_rate": 0.000196009867913966, + "loss": 1.1189, + "step": 1813 + }, + { + "epoch": 0.5824369882806228, + "grad_norm": 0.9786831140518188, + "learning_rate": 0.00019600010003874069, + "loss": 0.8792, + "step": 1814 + }, + { + "epoch": 0.5827580671054744, + "grad_norm": 1.0438469648361206, + "learning_rate": 0.00019599032046619438, + "loss": 1.1121, + "step": 1815 + }, + { + "epoch": 0.583079145930326, + "grad_norm": 1.4787214994430542, + "learning_rate": 0.0001959805291975187, + "loss": 1.1414, + "step": 1816 + }, + { + "epoch": 0.5834002247551774, + "grad_norm": 1.3728911876678467, + "learning_rate": 0.00019597072623390668, + "loss": 0.8714, + "step": 1817 + }, + { + "epoch": 0.5837213035800289, + "grad_norm": 1.0536571741104126, + "learning_rate": 0.00019596091157655278, + "loss": 0.9525, + "step": 1818 + }, + { + "epoch": 0.5840423824048804, + "grad_norm": 1.2520835399627686, + "learning_rate": 0.0001959510852266529, + "loss": 0.7777, + "step": 1819 + }, + { + "epoch": 0.5843634612297319, + "grad_norm": 0.8757473826408386, + "learning_rate": 0.0001959412471854043, + "loss": 1.0104, + "step": 1820 + }, + { + "epoch": 0.5846845400545834, + "grad_norm": 1.4845868349075317, + "learning_rate": 0.00019593139745400576, + "loss": 0.9476, + "step": 1821 + }, + { + "epoch": 0.5850056188794349, + "grad_norm": 2.418626546859741, + "learning_rate": 0.00019592153603365743, + "loss": 1.0727, + "step": 1822 + }, + { + "epoch": 0.5853266977042864, + "grad_norm": 1.0435357093811035, + "learning_rate": 0.0001959116629255609, + "loss": 1.0303, + "step": 1823 + }, + { + "epoch": 0.5856477765291379, + "grad_norm": 1.3769514560699463, + "learning_rate": 0.0001959017781309192, + "loss": 1.0692, + "step": 1824 + }, + { + "epoch": 0.5859688553539895, + "grad_norm": 1.0408692359924316, + "learning_rate": 0.0001958918816509367, + "loss": 0.903, + "step": 1825 + }, + { + "epoch": 0.5862899341788409, + "grad_norm": 1.2975987195968628, + "learning_rate": 0.0001958819734868193, + "loss": 0.9121, + "step": 1826 + }, + { + "epoch": 0.5866110130036924, + "grad_norm": 1.479382872581482, + "learning_rate": 0.00019587205363977427, + "loss": 0.9408, + "step": 1827 + }, + { + "epoch": 0.5869320918285439, + "grad_norm": 1.200589656829834, + "learning_rate": 0.00019586212211101037, + "loss": 1.0283, + "step": 1828 + }, + { + "epoch": 0.5872531706533954, + "grad_norm": 1.039357304573059, + "learning_rate": 0.0001958521789017376, + "loss": 0.8903, + "step": 1829 + }, + { + "epoch": 0.5875742494782469, + "grad_norm": 1.5455893278121948, + "learning_rate": 0.0001958422240131676, + "loss": 1.0474, + "step": 1830 + }, + { + "epoch": 0.5878953283030984, + "grad_norm": 1.092637538909912, + "learning_rate": 0.00019583225744651333, + "loss": 0.9873, + "step": 1831 + }, + { + "epoch": 0.5882164071279499, + "grad_norm": 2.400148868560791, + "learning_rate": 0.00019582227920298916, + "loss": 1.2347, + "step": 1832 + }, + { + "epoch": 0.5885374859528014, + "grad_norm": 1.1216742992401123, + "learning_rate": 0.00019581228928381098, + "loss": 1.0158, + "step": 1833 + }, + { + "epoch": 0.5888585647776529, + "grad_norm": 1.167275309562683, + "learning_rate": 0.00019580228769019593, + "loss": 0.9061, + "step": 1834 + }, + { + "epoch": 0.5891796436025044, + "grad_norm": 1.382562518119812, + "learning_rate": 0.00019579227442336278, + "loss": 1.216, + "step": 1835 + }, + { + "epoch": 0.589500722427356, + "grad_norm": 1.2429423332214355, + "learning_rate": 0.0001957822494845315, + "loss": 0.9084, + "step": 1836 + }, + { + "epoch": 0.5898218012522074, + "grad_norm": 1.1769795417785645, + "learning_rate": 0.00019577221287492367, + "loss": 0.9287, + "step": 1837 + }, + { + "epoch": 0.590142880077059, + "grad_norm": 1.2153514623641968, + "learning_rate": 0.00019576216459576222, + "loss": 1.0391, + "step": 1838 + }, + { + "epoch": 0.5904639589019104, + "grad_norm": 1.096490740776062, + "learning_rate": 0.00019575210464827149, + "loss": 0.9332, + "step": 1839 + }, + { + "epoch": 0.5907850377267619, + "grad_norm": 1.9070253372192383, + "learning_rate": 0.00019574203303367727, + "loss": 0.859, + "step": 1840 + }, + { + "epoch": 0.5911061165516134, + "grad_norm": 1.2111899852752686, + "learning_rate": 0.00019573194975320673, + "loss": 1.0296, + "step": 1841 + }, + { + "epoch": 0.5914271953764649, + "grad_norm": 1.3868889808654785, + "learning_rate": 0.00019572185480808848, + "loss": 1.0413, + "step": 1842 + }, + { + "epoch": 0.5917482742013164, + "grad_norm": 1.646368384361267, + "learning_rate": 0.00019571174819955263, + "loss": 0.9224, + "step": 1843 + }, + { + "epoch": 0.5920693530261679, + "grad_norm": 1.1916215419769287, + "learning_rate": 0.00019570162992883054, + "loss": 0.9388, + "step": 1844 + }, + { + "epoch": 0.5923904318510195, + "grad_norm": 1.1193400621414185, + "learning_rate": 0.00019569149999715515, + "loss": 0.8399, + "step": 1845 + }, + { + "epoch": 0.5927115106758709, + "grad_norm": 1.161150574684143, + "learning_rate": 0.00019568135840576076, + "loss": 0.8907, + "step": 1846 + }, + { + "epoch": 0.5930325895007225, + "grad_norm": 1.9916490316390991, + "learning_rate": 0.00019567120515588308, + "loss": 0.9989, + "step": 1847 + }, + { + "epoch": 0.5933536683255739, + "grad_norm": 1.4825630187988281, + "learning_rate": 0.0001956610402487592, + "loss": 0.6702, + "step": 1848 + }, + { + "epoch": 0.5936747471504255, + "grad_norm": 1.547219157218933, + "learning_rate": 0.0001956508636856278, + "loss": 0.6974, + "step": 1849 + }, + { + "epoch": 0.5939958259752769, + "grad_norm": 0.9746522307395935, + "learning_rate": 0.00019564067546772878, + "loss": 0.499, + "step": 1850 + }, + { + "epoch": 0.5943169048001284, + "grad_norm": 1.3846455812454224, + "learning_rate": 0.00019563047559630357, + "loss": 1.4075, + "step": 1851 + }, + { + "epoch": 0.5946379836249799, + "grad_norm": 1.398285984992981, + "learning_rate": 0.00019562026407259495, + "loss": 0.8492, + "step": 1852 + }, + { + "epoch": 0.5949590624498314, + "grad_norm": 3.702108860015869, + "learning_rate": 0.00019561004089784723, + "loss": 1.2278, + "step": 1853 + }, + { + "epoch": 0.595280141274683, + "grad_norm": 1.2834221124649048, + "learning_rate": 0.00019559980607330605, + "loss": 0.7047, + "step": 1854 + }, + { + "epoch": 0.5956012200995344, + "grad_norm": 1.3995305299758911, + "learning_rate": 0.00019558955960021849, + "loss": 0.7136, + "step": 1855 + }, + { + "epoch": 0.595922298924386, + "grad_norm": 1.1465208530426025, + "learning_rate": 0.00019557930147983302, + "loss": 0.651, + "step": 1856 + }, + { + "epoch": 0.5962433777492374, + "grad_norm": 1.3999861478805542, + "learning_rate": 0.00019556903171339963, + "loss": 0.7879, + "step": 1857 + }, + { + "epoch": 0.596564456574089, + "grad_norm": 1.0851597785949707, + "learning_rate": 0.00019555875030216954, + "loss": 0.8221, + "step": 1858 + }, + { + "epoch": 0.5968855353989404, + "grad_norm": 1.2108211517333984, + "learning_rate": 0.00019554845724739566, + "loss": 0.7746, + "step": 1859 + }, + { + "epoch": 0.597206614223792, + "grad_norm": 1.5989631414413452, + "learning_rate": 0.00019553815255033205, + "loss": 1.028, + "step": 1860 + }, + { + "epoch": 0.5975276930486434, + "grad_norm": 1.233514666557312, + "learning_rate": 0.00019552783621223436, + "loss": 0.6234, + "step": 1861 + }, + { + "epoch": 0.5978487718734949, + "grad_norm": 0.9018953442573547, + "learning_rate": 0.0001955175082343596, + "loss": 0.824, + "step": 1862 + }, + { + "epoch": 0.5981698506983465, + "grad_norm": 1.028957486152649, + "learning_rate": 0.00019550716861796623, + "loss": 0.9231, + "step": 1863 + }, + { + "epoch": 0.5984909295231979, + "grad_norm": 1.2545114755630493, + "learning_rate": 0.00019549681736431404, + "loss": 0.8434, + "step": 1864 + }, + { + "epoch": 0.5988120083480495, + "grad_norm": 1.197217583656311, + "learning_rate": 0.00019548645447466431, + "loss": 0.9775, + "step": 1865 + }, + { + "epoch": 0.5991330871729009, + "grad_norm": 1.3030047416687012, + "learning_rate": 0.00019547607995027978, + "loss": 0.8244, + "step": 1866 + }, + { + "epoch": 0.5994541659977525, + "grad_norm": 1.4685194492340088, + "learning_rate": 0.00019546569379242444, + "loss": 0.9339, + "step": 1867 + }, + { + "epoch": 0.5997752448226039, + "grad_norm": 1.9783920049667358, + "learning_rate": 0.00019545529600236398, + "loss": 0.9437, + "step": 1868 + }, + { + "epoch": 0.6000963236474555, + "grad_norm": 1.3374611139297485, + "learning_rate": 0.00019544488658136523, + "loss": 1.0672, + "step": 1869 + }, + { + "epoch": 0.6004174024723069, + "grad_norm": 1.4614479541778564, + "learning_rate": 0.0001954344655306965, + "loss": 0.9309, + "step": 1870 + }, + { + "epoch": 0.6007384812971585, + "grad_norm": 1.359053373336792, + "learning_rate": 0.0001954240328516277, + "loss": 1.0011, + "step": 1871 + }, + { + "epoch": 0.60105956012201, + "grad_norm": 1.4997376203536987, + "learning_rate": 0.00019541358854542991, + "loss": 1.0507, + "step": 1872 + }, + { + "epoch": 0.6013806389468614, + "grad_norm": 1.5839393138885498, + "learning_rate": 0.0001954031326133758, + "loss": 1.1981, + "step": 1873 + }, + { + "epoch": 0.601701717771713, + "grad_norm": 1.4018974304199219, + "learning_rate": 0.00019539266505673938, + "loss": 1.0872, + "step": 1874 + }, + { + "epoch": 0.6020227965965644, + "grad_norm": 1.35053551197052, + "learning_rate": 0.00019538218587679605, + "loss": 0.8378, + "step": 1875 + }, + { + "epoch": 0.602343875421416, + "grad_norm": 0.7986770272254944, + "learning_rate": 0.0001953716950748227, + "loss": 0.6296, + "step": 1876 + }, + { + "epoch": 0.6026649542462674, + "grad_norm": 1.5917302370071411, + "learning_rate": 0.0001953611926520976, + "loss": 1.0359, + "step": 1877 + }, + { + "epoch": 0.602986033071119, + "grad_norm": 1.197108507156372, + "learning_rate": 0.00019535067860990046, + "loss": 1.1222, + "step": 1878 + }, + { + "epoch": 0.6033071118959704, + "grad_norm": 1.6555898189544678, + "learning_rate": 0.00019534015294951233, + "loss": 1.0129, + "step": 1879 + }, + { + "epoch": 0.603628190720822, + "grad_norm": 0.971400797367096, + "learning_rate": 0.00019532961567221576, + "loss": 0.9535, + "step": 1880 + }, + { + "epoch": 0.6039492695456735, + "grad_norm": 1.5631428956985474, + "learning_rate": 0.0001953190667792947, + "loss": 1.2075, + "step": 1881 + }, + { + "epoch": 0.604270348370525, + "grad_norm": 1.193997859954834, + "learning_rate": 0.0001953085062720345, + "loss": 0.9112, + "step": 1882 + }, + { + "epoch": 0.6045914271953765, + "grad_norm": 1.317486047744751, + "learning_rate": 0.00019529793415172192, + "loss": 1.02, + "step": 1883 + }, + { + "epoch": 0.604912506020228, + "grad_norm": 0.9995319247245789, + "learning_rate": 0.00019528735041964509, + "loss": 0.8154, + "step": 1884 + }, + { + "epoch": 0.6052335848450795, + "grad_norm": 1.0479660034179688, + "learning_rate": 0.00019527675507709366, + "loss": 0.8572, + "step": 1885 + }, + { + "epoch": 0.6055546636699309, + "grad_norm": 1.4373043775558472, + "learning_rate": 0.00019526614812535864, + "loss": 0.9219, + "step": 1886 + }, + { + "epoch": 0.6058757424947825, + "grad_norm": 1.3879246711730957, + "learning_rate": 0.00019525552956573244, + "loss": 0.9512, + "step": 1887 + }, + { + "epoch": 0.6061968213196339, + "grad_norm": 1.2492396831512451, + "learning_rate": 0.0001952448993995089, + "loss": 1.0225, + "step": 1888 + }, + { + "epoch": 0.6065179001444855, + "grad_norm": 1.0773907899856567, + "learning_rate": 0.00019523425762798329, + "loss": 0.7924, + "step": 1889 + }, + { + "epoch": 0.6068389789693369, + "grad_norm": 1.1691362857818604, + "learning_rate": 0.00019522360425245226, + "loss": 0.8436, + "step": 1890 + }, + { + "epoch": 0.6071600577941885, + "grad_norm": 1.218471646308899, + "learning_rate": 0.00019521293927421388, + "loss": 1.0175, + "step": 1891 + }, + { + "epoch": 0.60748113661904, + "grad_norm": 0.9042808413505554, + "learning_rate": 0.00019520226269456768, + "loss": 0.9039, + "step": 1892 + }, + { + "epoch": 0.6078022154438915, + "grad_norm": 1.3849420547485352, + "learning_rate": 0.00019519157451481454, + "loss": 1.1497, + "step": 1893 + }, + { + "epoch": 0.608123294268743, + "grad_norm": 1.3135268688201904, + "learning_rate": 0.0001951808747362568, + "loss": 0.8948, + "step": 1894 + }, + { + "epoch": 0.6084443730935944, + "grad_norm": 0.94450443983078, + "learning_rate": 0.0001951701633601982, + "loss": 0.9394, + "step": 1895 + }, + { + "epoch": 0.608765451918446, + "grad_norm": 1.5512750148773193, + "learning_rate": 0.00019515944038794384, + "loss": 0.9455, + "step": 1896 + }, + { + "epoch": 0.6090865307432974, + "grad_norm": 1.3148894309997559, + "learning_rate": 0.00019514870582080032, + "loss": 0.8146, + "step": 1897 + }, + { + "epoch": 0.609407609568149, + "grad_norm": 1.836193323135376, + "learning_rate": 0.00019513795966007562, + "loss": 0.5957, + "step": 1898 + }, + { + "epoch": 0.6097286883930004, + "grad_norm": 1.1311513185501099, + "learning_rate": 0.00019512720190707913, + "loss": 0.9068, + "step": 1899 + }, + { + "epoch": 0.610049767217852, + "grad_norm": 0.6564053893089294, + "learning_rate": 0.00019511643256312164, + "loss": 0.4583, + "step": 1900 + }, + { + "epoch": 0.6103708460427035, + "grad_norm": 1.3620964288711548, + "learning_rate": 0.00019510565162951537, + "loss": 1.3662, + "step": 1901 + }, + { + "epoch": 0.610691924867555, + "grad_norm": 1.2974669933319092, + "learning_rate": 0.0001950948591075739, + "loss": 1.2901, + "step": 1902 + }, + { + "epoch": 0.6110130036924065, + "grad_norm": 1.4451632499694824, + "learning_rate": 0.00019508405499861232, + "loss": 0.9189, + "step": 1903 + }, + { + "epoch": 0.611334082517258, + "grad_norm": 1.583996057510376, + "learning_rate": 0.00019507323930394708, + "loss": 0.9118, + "step": 1904 + }, + { + "epoch": 0.6116551613421095, + "grad_norm": 1.4750548601150513, + "learning_rate": 0.00019506241202489602, + "loss": 0.7096, + "step": 1905 + }, + { + "epoch": 0.611976240166961, + "grad_norm": 1.2632248401641846, + "learning_rate": 0.00019505157316277837, + "loss": 0.9424, + "step": 1906 + }, + { + "epoch": 0.6122973189918125, + "grad_norm": 1.392075538635254, + "learning_rate": 0.00019504072271891488, + "loss": 0.9846, + "step": 1907 + }, + { + "epoch": 0.6126183978166639, + "grad_norm": 1.0871883630752563, + "learning_rate": 0.00019502986069462762, + "loss": 0.9791, + "step": 1908 + }, + { + "epoch": 0.6129394766415155, + "grad_norm": 1.2287681102752686, + "learning_rate": 0.00019501898709124008, + "loss": 1.0328, + "step": 1909 + }, + { + "epoch": 0.613260555466367, + "grad_norm": 1.0907347202301025, + "learning_rate": 0.00019500810191007718, + "loss": 0.9497, + "step": 1910 + }, + { + "epoch": 0.6135816342912185, + "grad_norm": 1.0309808254241943, + "learning_rate": 0.00019499720515246525, + "loss": 0.7238, + "step": 1911 + }, + { + "epoch": 0.61390271311607, + "grad_norm": 1.0022937059402466, + "learning_rate": 0.00019498629681973206, + "loss": 0.8948, + "step": 1912 + }, + { + "epoch": 0.6142237919409215, + "grad_norm": 1.440856695175171, + "learning_rate": 0.00019497537691320668, + "loss": 1.0733, + "step": 1913 + }, + { + "epoch": 0.614544870765773, + "grad_norm": 1.5727375745773315, + "learning_rate": 0.00019496444543421975, + "loss": 1.0399, + "step": 1914 + }, + { + "epoch": 0.6148659495906245, + "grad_norm": 1.1591044664382935, + "learning_rate": 0.0001949535023841032, + "loss": 0.9664, + "step": 1915 + }, + { + "epoch": 0.615187028415476, + "grad_norm": 1.1390159130096436, + "learning_rate": 0.0001949425477641904, + "loss": 0.9239, + "step": 1916 + }, + { + "epoch": 0.6155081072403275, + "grad_norm": 1.00704026222229, + "learning_rate": 0.00019493158157581615, + "loss": 0.9421, + "step": 1917 + }, + { + "epoch": 0.615829186065179, + "grad_norm": 2.458767890930176, + "learning_rate": 0.00019492060382031662, + "loss": 1.1242, + "step": 1918 + }, + { + "epoch": 0.6161502648900306, + "grad_norm": 1.4069280624389648, + "learning_rate": 0.00019490961449902946, + "loss": 1.0579, + "step": 1919 + }, + { + "epoch": 0.616471343714882, + "grad_norm": 0.9519532322883606, + "learning_rate": 0.00019489861361329366, + "loss": 0.7231, + "step": 1920 + }, + { + "epoch": 0.6167924225397335, + "grad_norm": 1.2751188278198242, + "learning_rate": 0.00019488760116444966, + "loss": 1.1183, + "step": 1921 + }, + { + "epoch": 0.617113501364585, + "grad_norm": 1.6662013530731201, + "learning_rate": 0.00019487657715383926, + "loss": 1.1126, + "step": 1922 + }, + { + "epoch": 0.6174345801894365, + "grad_norm": 1.7743055820465088, + "learning_rate": 0.00019486554158280574, + "loss": 0.945, + "step": 1923 + }, + { + "epoch": 0.617755659014288, + "grad_norm": 1.3609472513198853, + "learning_rate": 0.00019485449445269377, + "loss": 0.9343, + "step": 1924 + }, + { + "epoch": 0.6180767378391395, + "grad_norm": 2.255979061126709, + "learning_rate": 0.00019484343576484933, + "loss": 0.9, + "step": 1925 + }, + { + "epoch": 0.618397816663991, + "grad_norm": 1.3037607669830322, + "learning_rate": 0.00019483236552061994, + "loss": 1.059, + "step": 1926 + }, + { + "epoch": 0.6187188954888425, + "grad_norm": 1.691819190979004, + "learning_rate": 0.00019482128372135446, + "loss": 1.0162, + "step": 1927 + }, + { + "epoch": 0.6190399743136941, + "grad_norm": 1.8937206268310547, + "learning_rate": 0.0001948101903684032, + "loss": 0.9981, + "step": 1928 + }, + { + "epoch": 0.6193610531385455, + "grad_norm": 1.4838215112686157, + "learning_rate": 0.00019479908546311781, + "loss": 1.0377, + "step": 1929 + }, + { + "epoch": 0.619682131963397, + "grad_norm": 1.2339526414871216, + "learning_rate": 0.00019478796900685146, + "loss": 0.911, + "step": 1930 + }, + { + "epoch": 0.6200032107882485, + "grad_norm": 1.4416377544403076, + "learning_rate": 0.0001947768410009586, + "loss": 0.9814, + "step": 1931 + }, + { + "epoch": 0.6203242896131, + "grad_norm": 1.8542779684066772, + "learning_rate": 0.0001947657014467951, + "loss": 1.0792, + "step": 1932 + }, + { + "epoch": 0.6206453684379515, + "grad_norm": 1.3977333307266235, + "learning_rate": 0.00019475455034571838, + "loss": 0.9371, + "step": 1933 + }, + { + "epoch": 0.620966447262803, + "grad_norm": 1.1695326566696167, + "learning_rate": 0.0001947433876990871, + "loss": 0.9352, + "step": 1934 + }, + { + "epoch": 0.6212875260876545, + "grad_norm": 1.1949793100357056, + "learning_rate": 0.00019473221350826142, + "loss": 0.8381, + "step": 1935 + }, + { + "epoch": 0.621608604912506, + "grad_norm": 1.197929859161377, + "learning_rate": 0.0001947210277746029, + "loss": 1.0588, + "step": 1936 + }, + { + "epoch": 0.6219296837373576, + "grad_norm": 0.9528883695602417, + "learning_rate": 0.00019470983049947444, + "loss": 0.7636, + "step": 1937 + }, + { + "epoch": 0.622250762562209, + "grad_norm": 0.9562763571739197, + "learning_rate": 0.00019469862168424042, + "loss": 0.9649, + "step": 1938 + }, + { + "epoch": 0.6225718413870606, + "grad_norm": 1.5887722969055176, + "learning_rate": 0.0001946874013302666, + "loss": 0.9054, + "step": 1939 + }, + { + "epoch": 0.622892920211912, + "grad_norm": 1.0492193698883057, + "learning_rate": 0.0001946761694389202, + "loss": 0.9068, + "step": 1940 + }, + { + "epoch": 0.6232139990367636, + "grad_norm": 0.9907318353652954, + "learning_rate": 0.00019466492601156966, + "loss": 0.8565, + "step": 1941 + }, + { + "epoch": 0.623535077861615, + "grad_norm": 1.7006665468215942, + "learning_rate": 0.00019465367104958505, + "loss": 1.0636, + "step": 1942 + }, + { + "epoch": 0.6238561566864665, + "grad_norm": 1.3816338777542114, + "learning_rate": 0.00019464240455433775, + "loss": 0.9676, + "step": 1943 + }, + { + "epoch": 0.624177235511318, + "grad_norm": 1.1604748964309692, + "learning_rate": 0.00019463112652720054, + "loss": 0.8379, + "step": 1944 + }, + { + "epoch": 0.6244983143361695, + "grad_norm": 5.007503509521484, + "learning_rate": 0.00019461983696954758, + "loss": 1.1482, + "step": 1945 + }, + { + "epoch": 0.6248193931610211, + "grad_norm": 1.0724717378616333, + "learning_rate": 0.00019460853588275454, + "loss": 0.7847, + "step": 1946 + }, + { + "epoch": 0.6251404719858725, + "grad_norm": 1.2051833868026733, + "learning_rate": 0.00019459722326819838, + "loss": 0.9865, + "step": 1947 + }, + { + "epoch": 0.6254615508107241, + "grad_norm": 1.1510910987854004, + "learning_rate": 0.00019458589912725748, + "loss": 0.689, + "step": 1948 + }, + { + "epoch": 0.6257826296355755, + "grad_norm": 0.7677122354507446, + "learning_rate": 0.0001945745634613117, + "loss": 0.7075, + "step": 1949 + }, + { + "epoch": 0.6261037084604271, + "grad_norm": 1.6661738157272339, + "learning_rate": 0.00019456321627174221, + "loss": 0.6606, + "step": 1950 + }, + { + "epoch": 0.6264247872852785, + "grad_norm": 1.3030189275741577, + "learning_rate": 0.0001945518575599317, + "loss": 1.3519, + "step": 1951 + }, + { + "epoch": 0.6267458661101301, + "grad_norm": 1.27639901638031, + "learning_rate": 0.00019454048732726412, + "loss": 1.4878, + "step": 1952 + }, + { + "epoch": 0.6270669449349815, + "grad_norm": 1.197721004486084, + "learning_rate": 0.00019452910557512496, + "loss": 1.0872, + "step": 1953 + }, + { + "epoch": 0.627388023759833, + "grad_norm": 1.3855386972427368, + "learning_rate": 0.000194517712304901, + "loss": 0.8866, + "step": 1954 + }, + { + "epoch": 0.6277091025846845, + "grad_norm": 1.5599814653396606, + "learning_rate": 0.00019450630751798048, + "loss": 0.8322, + "step": 1955 + }, + { + "epoch": 0.628030181409536, + "grad_norm": 1.247623324394226, + "learning_rate": 0.0001944948912157531, + "loss": 0.4998, + "step": 1956 + }, + { + "epoch": 0.6283512602343876, + "grad_norm": 1.6704400777816772, + "learning_rate": 0.00019448346339960982, + "loss": 0.739, + "step": 1957 + }, + { + "epoch": 0.628672339059239, + "grad_norm": 1.241674542427063, + "learning_rate": 0.00019447202407094316, + "loss": 0.7236, + "step": 1958 + }, + { + "epoch": 0.6289934178840906, + "grad_norm": 1.0023155212402344, + "learning_rate": 0.0001944605732311469, + "loss": 0.8875, + "step": 1959 + }, + { + "epoch": 0.629314496708942, + "grad_norm": 1.710057020187378, + "learning_rate": 0.00019444911088161636, + "loss": 1.1843, + "step": 1960 + }, + { + "epoch": 0.6296355755337936, + "grad_norm": 1.0903862714767456, + "learning_rate": 0.00019443763702374812, + "loss": 0.9194, + "step": 1961 + }, + { + "epoch": 0.629956654358645, + "grad_norm": 0.8305079340934753, + "learning_rate": 0.00019442615165894027, + "loss": 0.9536, + "step": 1962 + }, + { + "epoch": 0.6302777331834966, + "grad_norm": 13.161450386047363, + "learning_rate": 0.00019441465478859228, + "loss": 0.9715, + "step": 1963 + }, + { + "epoch": 0.630598812008348, + "grad_norm": 1.4573770761489868, + "learning_rate": 0.000194403146414105, + "loss": 1.056, + "step": 1964 + }, + { + "epoch": 0.6309198908331995, + "grad_norm": 2.9237208366394043, + "learning_rate": 0.00019439162653688065, + "loss": 0.8055, + "step": 1965 + }, + { + "epoch": 0.6312409696580511, + "grad_norm": 1.0383076667785645, + "learning_rate": 0.00019438009515832297, + "loss": 0.938, + "step": 1966 + }, + { + "epoch": 0.6315620484829025, + "grad_norm": 1.4683743715286255, + "learning_rate": 0.00019436855227983695, + "loss": 1.0683, + "step": 1967 + }, + { + "epoch": 0.6318831273077541, + "grad_norm": 1.3468014001846313, + "learning_rate": 0.00019435699790282908, + "loss": 1.1329, + "step": 1968 + }, + { + "epoch": 0.6322042061326055, + "grad_norm": 1.5651507377624512, + "learning_rate": 0.00019434543202870725, + "loss": 0.8679, + "step": 1969 + }, + { + "epoch": 0.6325252849574571, + "grad_norm": 1.5266989469528198, + "learning_rate": 0.0001943338546588807, + "loss": 1.2249, + "step": 1970 + }, + { + "epoch": 0.6328463637823085, + "grad_norm": 3.1390273571014404, + "learning_rate": 0.0001943222657947601, + "loss": 1.2727, + "step": 1971 + }, + { + "epoch": 0.6331674426071601, + "grad_norm": 1.983047366142273, + "learning_rate": 0.00019431066543775752, + "loss": 0.8392, + "step": 1972 + }, + { + "epoch": 0.6334885214320115, + "grad_norm": 1.197851538658142, + "learning_rate": 0.00019429905358928646, + "loss": 1.0931, + "step": 1973 + }, + { + "epoch": 0.6338096002568631, + "grad_norm": 1.3977620601654053, + "learning_rate": 0.00019428743025076174, + "loss": 1.1318, + "step": 1974 + }, + { + "epoch": 0.6341306790817146, + "grad_norm": 1.217929720878601, + "learning_rate": 0.00019427579542359965, + "loss": 1.1208, + "step": 1975 + }, + { + "epoch": 0.634451757906566, + "grad_norm": 1.4628900289535522, + "learning_rate": 0.00019426414910921787, + "loss": 1.0285, + "step": 1976 + }, + { + "epoch": 0.6347728367314176, + "grad_norm": 1.353716254234314, + "learning_rate": 0.00019425249130903543, + "loss": 0.9842, + "step": 1977 + }, + { + "epoch": 0.635093915556269, + "grad_norm": 1.2540056705474854, + "learning_rate": 0.00019424082202447283, + "loss": 0.9581, + "step": 1978 + }, + { + "epoch": 0.6354149943811206, + "grad_norm": 1.8885056972503662, + "learning_rate": 0.0001942291412569519, + "loss": 0.9603, + "step": 1979 + }, + { + "epoch": 0.635736073205972, + "grad_norm": 1.4361056089401245, + "learning_rate": 0.00019421744900789597, + "loss": 0.9598, + "step": 1980 + }, + { + "epoch": 0.6360571520308236, + "grad_norm": 0.9510266780853271, + "learning_rate": 0.00019420574527872968, + "loss": 0.8859, + "step": 1981 + }, + { + "epoch": 0.636378230855675, + "grad_norm": 1.5164238214492798, + "learning_rate": 0.00019419403007087907, + "loss": 1.045, + "step": 1982 + }, + { + "epoch": 0.6366993096805266, + "grad_norm": 1.3843255043029785, + "learning_rate": 0.0001941823033857716, + "loss": 0.9721, + "step": 1983 + }, + { + "epoch": 0.6370203885053781, + "grad_norm": 1.692832112312317, + "learning_rate": 0.0001941705652248362, + "loss": 0.8765, + "step": 1984 + }, + { + "epoch": 0.6373414673302296, + "grad_norm": 1.3866074085235596, + "learning_rate": 0.00019415881558950302, + "loss": 1.0165, + "step": 1985 + }, + { + "epoch": 0.6376625461550811, + "grad_norm": 0.9165050983428955, + "learning_rate": 0.0001941470544812038, + "loss": 1.0291, + "step": 1986 + }, + { + "epoch": 0.6379836249799326, + "grad_norm": 0.8897827863693237, + "learning_rate": 0.0001941352819013716, + "loss": 0.7903, + "step": 1987 + }, + { + "epoch": 0.6383047038047841, + "grad_norm": 1.3819725513458252, + "learning_rate": 0.00019412349785144078, + "loss": 1.0681, + "step": 1988 + }, + { + "epoch": 0.6386257826296355, + "grad_norm": 1.1194961071014404, + "learning_rate": 0.00019411170233284727, + "loss": 0.8758, + "step": 1989 + }, + { + "epoch": 0.6389468614544871, + "grad_norm": 0.9944826364517212, + "learning_rate": 0.00019409989534702833, + "loss": 0.8176, + "step": 1990 + }, + { + "epoch": 0.6392679402793385, + "grad_norm": 1.1443473100662231, + "learning_rate": 0.00019408807689542257, + "loss": 1.0307, + "step": 1991 + }, + { + "epoch": 0.6395890191041901, + "grad_norm": 0.8666215538978577, + "learning_rate": 0.00019407624697947003, + "loss": 0.8433, + "step": 1992 + }, + { + "epoch": 0.6399100979290416, + "grad_norm": 1.3199023008346558, + "learning_rate": 0.00019406440560061216, + "loss": 1.0904, + "step": 1993 + }, + { + "epoch": 0.6402311767538931, + "grad_norm": 0.9692159295082092, + "learning_rate": 0.0001940525527602918, + "loss": 0.8195, + "step": 1994 + }, + { + "epoch": 0.6405522555787446, + "grad_norm": 1.1354212760925293, + "learning_rate": 0.00019404068845995317, + "loss": 0.8017, + "step": 1995 + }, + { + "epoch": 0.6408733344035961, + "grad_norm": 1.7543773651123047, + "learning_rate": 0.0001940288127010419, + "loss": 0.8184, + "step": 1996 + }, + { + "epoch": 0.6411944132284476, + "grad_norm": 1.025387167930603, + "learning_rate": 0.00019401692548500502, + "loss": 0.7647, + "step": 1997 + }, + { + "epoch": 0.641515492053299, + "grad_norm": 0.7941446304321289, + "learning_rate": 0.00019400502681329098, + "loss": 0.7321, + "step": 1998 + }, + { + "epoch": 0.6418365708781506, + "grad_norm": 1.0787677764892578, + "learning_rate": 0.00019399311668734956, + "loss": 0.6183, + "step": 1999 + }, + { + "epoch": 0.642157649703002, + "grad_norm": 0.9851073026657104, + "learning_rate": 0.00019398119510863197, + "loss": 0.6669, + "step": 2000 + }, + { + "epoch": 0.6424787285278536, + "grad_norm": 1.7059520483016968, + "learning_rate": 0.00019396926207859084, + "loss": 1.4499, + "step": 2001 + }, + { + "epoch": 0.6427998073527051, + "grad_norm": 1.7769484519958496, + "learning_rate": 0.00019395731759868018, + "loss": 1.2898, + "step": 2002 + }, + { + "epoch": 0.6431208861775566, + "grad_norm": 1.4100226163864136, + "learning_rate": 0.00019394536167035534, + "loss": 0.8129, + "step": 2003 + }, + { + "epoch": 0.6434419650024081, + "grad_norm": 1.3856101036071777, + "learning_rate": 0.00019393339429507318, + "loss": 0.6282, + "step": 2004 + }, + { + "epoch": 0.6437630438272596, + "grad_norm": 1.2996238470077515, + "learning_rate": 0.00019392141547429183, + "loss": 0.7415, + "step": 2005 + }, + { + "epoch": 0.6440841226521111, + "grad_norm": 1.237810730934143, + "learning_rate": 0.0001939094252094709, + "loss": 0.5929, + "step": 2006 + }, + { + "epoch": 0.6444052014769626, + "grad_norm": 1.1693778038024902, + "learning_rate": 0.00019389742350207141, + "loss": 0.8008, + "step": 2007 + }, + { + "epoch": 0.6447262803018141, + "grad_norm": 1.4258911609649658, + "learning_rate": 0.00019388541035355564, + "loss": 1.2064, + "step": 2008 + }, + { + "epoch": 0.6450473591266656, + "grad_norm": 1.4066739082336426, + "learning_rate": 0.00019387338576538744, + "loss": 1.0647, + "step": 2009 + }, + { + "epoch": 0.6453684379515171, + "grad_norm": 1.1812559366226196, + "learning_rate": 0.0001938613497390319, + "loss": 0.9243, + "step": 2010 + }, + { + "epoch": 0.6456895167763685, + "grad_norm": 5.421741962432861, + "learning_rate": 0.0001938493022759556, + "loss": 0.9313, + "step": 2011 + }, + { + "epoch": 0.6460105956012201, + "grad_norm": 1.2386350631713867, + "learning_rate": 0.0001938372433776265, + "loss": 0.6525, + "step": 2012 + }, + { + "epoch": 0.6463316744260716, + "grad_norm": 1.1228787899017334, + "learning_rate": 0.00019382517304551396, + "loss": 0.9062, + "step": 2013 + }, + { + "epoch": 0.6466527532509231, + "grad_norm": 0.9291990399360657, + "learning_rate": 0.00019381309128108865, + "loss": 1.1112, + "step": 2014 + }, + { + "epoch": 0.6469738320757746, + "grad_norm": 0.8409721851348877, + "learning_rate": 0.00019380099808582278, + "loss": 1.0773, + "step": 2015 + }, + { + "epoch": 0.6472949109006261, + "grad_norm": 1.0756123065948486, + "learning_rate": 0.0001937888934611898, + "loss": 0.9227, + "step": 2016 + }, + { + "epoch": 0.6476159897254776, + "grad_norm": 1.1562964916229248, + "learning_rate": 0.0001937767774086646, + "loss": 0.9476, + "step": 2017 + }, + { + "epoch": 0.6479370685503291, + "grad_norm": 1.2570315599441528, + "learning_rate": 0.00019376464992972356, + "loss": 0.8856, + "step": 2018 + }, + { + "epoch": 0.6482581473751806, + "grad_norm": 1.1748182773590088, + "learning_rate": 0.0001937525110258444, + "loss": 0.8097, + "step": 2019 + }, + { + "epoch": 0.648579226200032, + "grad_norm": 0.8975158333778381, + "learning_rate": 0.00019374036069850608, + "loss": 0.9826, + "step": 2020 + }, + { + "epoch": 0.6489003050248836, + "grad_norm": 1.5920442342758179, + "learning_rate": 0.00019372819894918915, + "loss": 1.1392, + "step": 2021 + }, + { + "epoch": 0.6492213838497352, + "grad_norm": 0.9668254256248474, + "learning_rate": 0.00019371602577937554, + "loss": 0.8112, + "step": 2022 + }, + { + "epoch": 0.6495424626745866, + "grad_norm": 1.265324592590332, + "learning_rate": 0.0001937038411905484, + "loss": 0.9334, + "step": 2023 + }, + { + "epoch": 0.6498635414994381, + "grad_norm": 1.3954192399978638, + "learning_rate": 0.0001936916451841925, + "loss": 0.9908, + "step": 2024 + }, + { + "epoch": 0.6501846203242896, + "grad_norm": 1.23148775100708, + "learning_rate": 0.0001936794377617938, + "loss": 1.0773, + "step": 2025 + }, + { + "epoch": 0.6505056991491411, + "grad_norm": 1.0222960710525513, + "learning_rate": 0.00019366721892483978, + "loss": 0.8985, + "step": 2026 + }, + { + "epoch": 0.6508267779739926, + "grad_norm": 0.9243746995925903, + "learning_rate": 0.00019365498867481923, + "loss": 0.6689, + "step": 2027 + }, + { + "epoch": 0.6511478567988441, + "grad_norm": 0.9078730344772339, + "learning_rate": 0.00019364274701322244, + "loss": 0.7267, + "step": 2028 + }, + { + "epoch": 0.6514689356236956, + "grad_norm": 1.5736430883407593, + "learning_rate": 0.00019363049394154094, + "loss": 0.994, + "step": 2029 + }, + { + "epoch": 0.6517900144485471, + "grad_norm": 0.8982104659080505, + "learning_rate": 0.0001936182294612678, + "loss": 0.838, + "step": 2030 + }, + { + "epoch": 0.6521110932733987, + "grad_norm": 1.2022709846496582, + "learning_rate": 0.00019360595357389735, + "loss": 1.2414, + "step": 2031 + }, + { + "epoch": 0.6524321720982501, + "grad_norm": 0.9198827147483826, + "learning_rate": 0.00019359366628092539, + "loss": 0.7564, + "step": 2032 + }, + { + "epoch": 0.6527532509231017, + "grad_norm": 1.3141475915908813, + "learning_rate": 0.00019358136758384912, + "loss": 1.3289, + "step": 2033 + }, + { + "epoch": 0.6530743297479531, + "grad_norm": 1.4782156944274902, + "learning_rate": 0.00019356905748416702, + "loss": 0.9758, + "step": 2034 + }, + { + "epoch": 0.6533954085728046, + "grad_norm": 1.2020496129989624, + "learning_rate": 0.00019355673598337914, + "loss": 0.8308, + "step": 2035 + }, + { + "epoch": 0.6537164873976561, + "grad_norm": 1.2560274600982666, + "learning_rate": 0.00019354440308298675, + "loss": 0.7407, + "step": 2036 + }, + { + "epoch": 0.6540375662225076, + "grad_norm": 0.8934011459350586, + "learning_rate": 0.00019353205878449258, + "loss": 0.865, + "step": 2037 + }, + { + "epoch": 0.6543586450473591, + "grad_norm": 0.9289383888244629, + "learning_rate": 0.0001935197030894008, + "loss": 0.7145, + "step": 2038 + }, + { + "epoch": 0.6546797238722106, + "grad_norm": 1.2418136596679688, + "learning_rate": 0.00019350733599921683, + "loss": 0.8111, + "step": 2039 + }, + { + "epoch": 0.6550008026970622, + "grad_norm": 0.9828119277954102, + "learning_rate": 0.00019349495751544763, + "loss": 0.9877, + "step": 2040 + }, + { + "epoch": 0.6553218815219136, + "grad_norm": 0.8384512662887573, + "learning_rate": 0.00019348256763960145, + "loss": 0.7265, + "step": 2041 + }, + { + "epoch": 0.6556429603467652, + "grad_norm": 1.1183538436889648, + "learning_rate": 0.000193470166373188, + "loss": 0.8691, + "step": 2042 + }, + { + "epoch": 0.6559640391716166, + "grad_norm": 1.0585153102874756, + "learning_rate": 0.00019345775371771824, + "loss": 0.9087, + "step": 2043 + }, + { + "epoch": 0.6562851179964682, + "grad_norm": 1.3051196336746216, + "learning_rate": 0.0001934453296747047, + "loss": 0.8324, + "step": 2044 + }, + { + "epoch": 0.6566061968213196, + "grad_norm": 1.0037448406219482, + "learning_rate": 0.00019343289424566122, + "loss": 0.7462, + "step": 2045 + }, + { + "epoch": 0.6569272756461711, + "grad_norm": 1.1593952178955078, + "learning_rate": 0.00019342044743210295, + "loss": 0.8161, + "step": 2046 + }, + { + "epoch": 0.6572483544710226, + "grad_norm": 0.9699090123176575, + "learning_rate": 0.00019340798923554657, + "loss": 0.8817, + "step": 2047 + }, + { + "epoch": 0.6575694332958741, + "grad_norm": 2.443246841430664, + "learning_rate": 0.00019339551965751002, + "loss": 0.888, + "step": 2048 + }, + { + "epoch": 0.6578905121207257, + "grad_norm": 0.9894551634788513, + "learning_rate": 0.00019338303869951269, + "loss": 0.6925, + "step": 2049 + }, + { + "epoch": 0.6582115909455771, + "grad_norm": 1.5795120000839233, + "learning_rate": 0.00019337054636307536, + "loss": 0.6042, + "step": 2050 + }, + { + "epoch": 0.6585326697704287, + "grad_norm": 1.1168063879013062, + "learning_rate": 0.00019335804264972018, + "loss": 1.449, + "step": 2051 + }, + { + "epoch": 0.6588537485952801, + "grad_norm": 1.1965980529785156, + "learning_rate": 0.0001933455275609707, + "loss": 1.1259, + "step": 2052 + }, + { + "epoch": 0.6591748274201317, + "grad_norm": 1.1799712181091309, + "learning_rate": 0.0001933330010983518, + "loss": 1.0329, + "step": 2053 + }, + { + "epoch": 0.6594959062449831, + "grad_norm": 1.4881231784820557, + "learning_rate": 0.00019332046326338986, + "loss": 0.8752, + "step": 2054 + }, + { + "epoch": 0.6598169850698347, + "grad_norm": 1.203697919845581, + "learning_rate": 0.00019330791405761252, + "loss": 0.7875, + "step": 2055 + }, + { + "epoch": 0.6601380638946861, + "grad_norm": 1.563508152961731, + "learning_rate": 0.00019329535348254893, + "loss": 0.907, + "step": 2056 + }, + { + "epoch": 0.6604591427195377, + "grad_norm": 1.3101892471313477, + "learning_rate": 0.00019328278153972947, + "loss": 0.9096, + "step": 2057 + }, + { + "epoch": 0.6607802215443892, + "grad_norm": 1.4064226150512695, + "learning_rate": 0.00019327019823068604, + "loss": 1.0788, + "step": 2058 + }, + { + "epoch": 0.6611013003692406, + "grad_norm": 1.2184844017028809, + "learning_rate": 0.00019325760355695188, + "loss": 0.9344, + "step": 2059 + }, + { + "epoch": 0.6614223791940922, + "grad_norm": 1.2525115013122559, + "learning_rate": 0.0001932449975200616, + "loss": 0.9809, + "step": 2060 + }, + { + "epoch": 0.6617434580189436, + "grad_norm": 1.2751420736312866, + "learning_rate": 0.00019323238012155123, + "loss": 0.8502, + "step": 2061 + }, + { + "epoch": 0.6620645368437952, + "grad_norm": 1.271164059638977, + "learning_rate": 0.00019321975136295813, + "loss": 0.9252, + "step": 2062 + }, + { + "epoch": 0.6623856156686466, + "grad_norm": 0.9167636632919312, + "learning_rate": 0.0001932071112458211, + "loss": 0.9382, + "step": 2063 + }, + { + "epoch": 0.6627066944934982, + "grad_norm": 1.027632474899292, + "learning_rate": 0.0001931944597716803, + "loss": 0.941, + "step": 2064 + }, + { + "epoch": 0.6630277733183496, + "grad_norm": 0.8215566873550415, + "learning_rate": 0.00019318179694207725, + "loss": 0.8984, + "step": 2065 + }, + { + "epoch": 0.6633488521432012, + "grad_norm": 1.2999035120010376, + "learning_rate": 0.0001931691227585549, + "loss": 0.8938, + "step": 2066 + }, + { + "epoch": 0.6636699309680526, + "grad_norm": 0.8892626762390137, + "learning_rate": 0.00019315643722265757, + "loss": 0.711, + "step": 2067 + }, + { + "epoch": 0.6639910097929042, + "grad_norm": 1.4477237462997437, + "learning_rate": 0.0001931437403359309, + "loss": 1.0521, + "step": 2068 + }, + { + "epoch": 0.6643120886177557, + "grad_norm": 1.3851264715194702, + "learning_rate": 0.00019313103209992204, + "loss": 1.0395, + "step": 2069 + }, + { + "epoch": 0.6646331674426071, + "grad_norm": 1.1687705516815186, + "learning_rate": 0.0001931183125161794, + "loss": 0.8819, + "step": 2070 + }, + { + "epoch": 0.6649542462674587, + "grad_norm": 1.4307971000671387, + "learning_rate": 0.00019310558158625285, + "loss": 1.012, + "step": 2071 + }, + { + "epoch": 0.6652753250923101, + "grad_norm": 1.1840988397598267, + "learning_rate": 0.00019309283931169356, + "loss": 1.034, + "step": 2072 + }, + { + "epoch": 0.6655964039171617, + "grad_norm": 1.1295652389526367, + "learning_rate": 0.00019308008569405422, + "loss": 0.968, + "step": 2073 + }, + { + "epoch": 0.6659174827420131, + "grad_norm": 1.238215684890747, + "learning_rate": 0.0001930673207348888, + "loss": 0.9026, + "step": 2074 + }, + { + "epoch": 0.6662385615668647, + "grad_norm": 1.0732163190841675, + "learning_rate": 0.00019305454443575262, + "loss": 1.0993, + "step": 2075 + }, + { + "epoch": 0.6665596403917161, + "grad_norm": 1.2432470321655273, + "learning_rate": 0.00019304175679820247, + "loss": 1.2107, + "step": 2076 + }, + { + "epoch": 0.6668807192165677, + "grad_norm": 1.0395501852035522, + "learning_rate": 0.0001930289578237965, + "loss": 0.8751, + "step": 2077 + }, + { + "epoch": 0.6672017980414192, + "grad_norm": 1.7850005626678467, + "learning_rate": 0.00019301614751409416, + "loss": 0.9662, + "step": 2078 + }, + { + "epoch": 0.6675228768662707, + "grad_norm": 1.5475821495056152, + "learning_rate": 0.0001930033258706564, + "loss": 1.056, + "step": 2079 + }, + { + "epoch": 0.6678439556911222, + "grad_norm": 0.9385414123535156, + "learning_rate": 0.00019299049289504553, + "loss": 0.8348, + "step": 2080 + }, + { + "epoch": 0.6681650345159736, + "grad_norm": 1.2614946365356445, + "learning_rate": 0.00019297764858882514, + "loss": 0.8467, + "step": 2081 + }, + { + "epoch": 0.6684861133408252, + "grad_norm": 1.07332444190979, + "learning_rate": 0.00019296479295356035, + "loss": 0.9718, + "step": 2082 + }, + { + "epoch": 0.6688071921656766, + "grad_norm": 2.2811384201049805, + "learning_rate": 0.00019295192599081746, + "loss": 1.1172, + "step": 2083 + }, + { + "epoch": 0.6691282709905282, + "grad_norm": 0.9308247566223145, + "learning_rate": 0.00019293904770216437, + "loss": 0.878, + "step": 2084 + }, + { + "epoch": 0.6694493498153796, + "grad_norm": 1.3048980236053467, + "learning_rate": 0.00019292615808917026, + "loss": 0.9988, + "step": 2085 + }, + { + "epoch": 0.6697704286402312, + "grad_norm": 2.0807573795318604, + "learning_rate": 0.00019291325715340563, + "loss": 0.9486, + "step": 2086 + }, + { + "epoch": 0.6700915074650827, + "grad_norm": 1.4993385076522827, + "learning_rate": 0.00019290034489644246, + "loss": 0.9338, + "step": 2087 + }, + { + "epoch": 0.6704125862899342, + "grad_norm": 1.1639915704727173, + "learning_rate": 0.00019288742131985407, + "loss": 0.9569, + "step": 2088 + }, + { + "epoch": 0.6707336651147857, + "grad_norm": 1.270796775817871, + "learning_rate": 0.00019287448642521513, + "loss": 0.904, + "step": 2089 + }, + { + "epoch": 0.6710547439396372, + "grad_norm": 1.7701547145843506, + "learning_rate": 0.00019286154021410173, + "loss": 1.0951, + "step": 2090 + }, + { + "epoch": 0.6713758227644887, + "grad_norm": 1.2145819664001465, + "learning_rate": 0.00019284858268809137, + "loss": 0.7358, + "step": 2091 + }, + { + "epoch": 0.6716969015893401, + "grad_norm": 1.4250010251998901, + "learning_rate": 0.00019283561384876284, + "loss": 1.0274, + "step": 2092 + }, + { + "epoch": 0.6720179804141917, + "grad_norm": 1.232686996459961, + "learning_rate": 0.00019282263369769633, + "loss": 1.0309, + "step": 2093 + }, + { + "epoch": 0.6723390592390431, + "grad_norm": 1.1875118017196655, + "learning_rate": 0.00019280964223647348, + "loss": 0.789, + "step": 2094 + }, + { + "epoch": 0.6726601380638947, + "grad_norm": 1.035929560661316, + "learning_rate": 0.00019279663946667727, + "loss": 0.8858, + "step": 2095 + }, + { + "epoch": 0.6729812168887462, + "grad_norm": 1.323173999786377, + "learning_rate": 0.000192783625389892, + "loss": 1.0185, + "step": 2096 + }, + { + "epoch": 0.6733022957135977, + "grad_norm": 1.4377621412277222, + "learning_rate": 0.00019277060000770342, + "loss": 0.6274, + "step": 2097 + }, + { + "epoch": 0.6736233745384492, + "grad_norm": 0.6943148374557495, + "learning_rate": 0.00019275756332169867, + "loss": 0.5938, + "step": 2098 + }, + { + "epoch": 0.6739444533633007, + "grad_norm": 1.0975568294525146, + "learning_rate": 0.00019274451533346615, + "loss": 0.5224, + "step": 2099 + }, + { + "epoch": 0.6742655321881522, + "grad_norm": 0.7381674647331238, + "learning_rate": 0.00019273145604459577, + "loss": 0.5018, + "step": 2100 + }, + { + "epoch": 0.6745866110130037, + "grad_norm": 1.5068893432617188, + "learning_rate": 0.00019271838545667876, + "loss": 1.3901, + "step": 2101 + }, + { + "epoch": 0.6749076898378552, + "grad_norm": 4.2163472175598145, + "learning_rate": 0.0001927053035713077, + "loss": 0.9407, + "step": 2102 + }, + { + "epoch": 0.6752287686627066, + "grad_norm": 1.432925820350647, + "learning_rate": 0.00019269221039007665, + "loss": 0.9272, + "step": 2103 + }, + { + "epoch": 0.6755498474875582, + "grad_norm": 1.374459981918335, + "learning_rate": 0.0001926791059145809, + "loss": 0.7463, + "step": 2104 + }, + { + "epoch": 0.6758709263124097, + "grad_norm": 1.4353713989257812, + "learning_rate": 0.0001926659901464172, + "loss": 0.8202, + "step": 2105 + }, + { + "epoch": 0.6761920051372612, + "grad_norm": 1.1444305181503296, + "learning_rate": 0.00019265286308718372, + "loss": 0.7957, + "step": 2106 + }, + { + "epoch": 0.6765130839621127, + "grad_norm": 1.4592642784118652, + "learning_rate": 0.00019263972473847993, + "loss": 0.6475, + "step": 2107 + }, + { + "epoch": 0.6768341627869642, + "grad_norm": 1.0431547164916992, + "learning_rate": 0.00019262657510190666, + "loss": 0.8323, + "step": 2108 + }, + { + "epoch": 0.6771552416118157, + "grad_norm": 1.0479865074157715, + "learning_rate": 0.00019261341417906621, + "loss": 0.9267, + "step": 2109 + }, + { + "epoch": 0.6774763204366672, + "grad_norm": 1.3619253635406494, + "learning_rate": 0.00019260024197156214, + "loss": 1.0108, + "step": 2110 + }, + { + "epoch": 0.6777973992615187, + "grad_norm": 1.088303565979004, + "learning_rate": 0.0001925870584809995, + "loss": 0.9458, + "step": 2111 + }, + { + "epoch": 0.6781184780863702, + "grad_norm": 1.064591646194458, + "learning_rate": 0.00019257386370898457, + "loss": 0.8121, + "step": 2112 + }, + { + "epoch": 0.6784395569112217, + "grad_norm": 0.9795963764190674, + "learning_rate": 0.00019256065765712522, + "loss": 1.006, + "step": 2113 + }, + { + "epoch": 0.6787606357360733, + "grad_norm": 1.218254804611206, + "learning_rate": 0.00019254744032703048, + "loss": 0.866, + "step": 2114 + }, + { + "epoch": 0.6790817145609247, + "grad_norm": 0.9186314940452576, + "learning_rate": 0.00019253421172031086, + "loss": 0.922, + "step": 2115 + }, + { + "epoch": 0.6794027933857762, + "grad_norm": 1.187469244003296, + "learning_rate": 0.00019252097183857823, + "loss": 0.7671, + "step": 2116 + }, + { + "epoch": 0.6797238722106277, + "grad_norm": 1.1949058771133423, + "learning_rate": 0.0001925077206834458, + "loss": 0.9525, + "step": 2117 + }, + { + "epoch": 0.6800449510354792, + "grad_norm": 1.0139573812484741, + "learning_rate": 0.00019249445825652824, + "loss": 0.8317, + "step": 2118 + }, + { + "epoch": 0.6803660298603307, + "grad_norm": 1.2550780773162842, + "learning_rate": 0.0001924811845594415, + "loss": 1.0096, + "step": 2119 + }, + { + "epoch": 0.6806871086851822, + "grad_norm": 1.0419883728027344, + "learning_rate": 0.00019246789959380295, + "loss": 0.9382, + "step": 2120 + }, + { + "epoch": 0.6810081875100337, + "grad_norm": 0.8842172622680664, + "learning_rate": 0.00019245460336123134, + "loss": 1.0352, + "step": 2121 + }, + { + "epoch": 0.6813292663348852, + "grad_norm": 1.3533904552459717, + "learning_rate": 0.00019244129586334672, + "loss": 1.1023, + "step": 2122 + }, + { + "epoch": 0.6816503451597368, + "grad_norm": 1.0904252529144287, + "learning_rate": 0.0001924279771017706, + "loss": 1.0112, + "step": 2123 + }, + { + "epoch": 0.6819714239845882, + "grad_norm": 1.2503504753112793, + "learning_rate": 0.00019241464707812585, + "loss": 0.8125, + "step": 2124 + }, + { + "epoch": 0.6822925028094398, + "grad_norm": 1.4106786251068115, + "learning_rate": 0.0001924013057940367, + "loss": 0.9185, + "step": 2125 + }, + { + "epoch": 0.6826135816342912, + "grad_norm": 1.1263327598571777, + "learning_rate": 0.0001923879532511287, + "loss": 0.9063, + "step": 2126 + }, + { + "epoch": 0.6829346604591428, + "grad_norm": 2.7052721977233887, + "learning_rate": 0.00019237458945102882, + "loss": 0.9509, + "step": 2127 + }, + { + "epoch": 0.6832557392839942, + "grad_norm": 1.1038817167282104, + "learning_rate": 0.00019236121439536542, + "loss": 0.9669, + "step": 2128 + }, + { + "epoch": 0.6835768181088457, + "grad_norm": 1.2589917182922363, + "learning_rate": 0.00019234782808576824, + "loss": 0.9313, + "step": 2129 + }, + { + "epoch": 0.6838978969336972, + "grad_norm": 1.6602294445037842, + "learning_rate": 0.0001923344305238683, + "loss": 1.0642, + "step": 2130 + }, + { + "epoch": 0.6842189757585487, + "grad_norm": 0.957186758518219, + "learning_rate": 0.00019232102171129811, + "loss": 0.7523, + "step": 2131 + }, + { + "epoch": 0.6845400545834002, + "grad_norm": 0.8682358264923096, + "learning_rate": 0.00019230760164969143, + "loss": 0.8184, + "step": 2132 + }, + { + "epoch": 0.6848611334082517, + "grad_norm": 1.2894452810287476, + "learning_rate": 0.0001922941703406835, + "loss": 0.7959, + "step": 2133 + }, + { + "epoch": 0.6851822122331033, + "grad_norm": 0.9780199527740479, + "learning_rate": 0.00019228072778591088, + "loss": 0.865, + "step": 2134 + }, + { + "epoch": 0.6855032910579547, + "grad_norm": 1.7117539644241333, + "learning_rate": 0.0001922672739870115, + "loss": 0.9489, + "step": 2135 + }, + { + "epoch": 0.6858243698828063, + "grad_norm": 1.3718947172164917, + "learning_rate": 0.00019225380894562463, + "loss": 1.0358, + "step": 2136 + }, + { + "epoch": 0.6861454487076577, + "grad_norm": 1.2108441591262817, + "learning_rate": 0.00019224033266339102, + "loss": 1.023, + "step": 2137 + }, + { + "epoch": 0.6864665275325093, + "grad_norm": 1.9689425230026245, + "learning_rate": 0.00019222684514195264, + "loss": 0.8403, + "step": 2138 + }, + { + "epoch": 0.6867876063573607, + "grad_norm": 1.626488208770752, + "learning_rate": 0.00019221334638295294, + "loss": 0.889, + "step": 2139 + }, + { + "epoch": 0.6871086851822122, + "grad_norm": 1.0145803689956665, + "learning_rate": 0.0001921998363880367, + "loss": 0.9659, + "step": 2140 + }, + { + "epoch": 0.6874297640070637, + "grad_norm": 2.1822826862335205, + "learning_rate": 0.00019218631515885006, + "loss": 0.8542, + "step": 2141 + }, + { + "epoch": 0.6877508428319152, + "grad_norm": 0.9943452477455139, + "learning_rate": 0.00019217278269704056, + "loss": 0.8499, + "step": 2142 + }, + { + "epoch": 0.6880719216567668, + "grad_norm": 0.675469696521759, + "learning_rate": 0.00019215923900425707, + "loss": 0.6101, + "step": 2143 + }, + { + "epoch": 0.6883930004816182, + "grad_norm": 1.052608609199524, + "learning_rate": 0.00019214568408214985, + "loss": 0.8342, + "step": 2144 + }, + { + "epoch": 0.6887140793064698, + "grad_norm": 0.9476191997528076, + "learning_rate": 0.00019213211793237057, + "loss": 0.8063, + "step": 2145 + }, + { + "epoch": 0.6890351581313212, + "grad_norm": 1.0476568937301636, + "learning_rate": 0.00019211854055657215, + "loss": 0.6778, + "step": 2146 + }, + { + "epoch": 0.6893562369561728, + "grad_norm": 1.4577487707138062, + "learning_rate": 0.00019210495195640895, + "loss": 0.7882, + "step": 2147 + }, + { + "epoch": 0.6896773157810242, + "grad_norm": 1.045127034187317, + "learning_rate": 0.0001920913521335368, + "loss": 0.6774, + "step": 2148 + }, + { + "epoch": 0.6899983946058758, + "grad_norm": 0.8924066424369812, + "learning_rate": 0.00019207774108961272, + "loss": 0.764, + "step": 2149 + }, + { + "epoch": 0.6903194734307272, + "grad_norm": 1.1722183227539062, + "learning_rate": 0.00019206411882629517, + "loss": 0.7897, + "step": 2150 + }, + { + "epoch": 0.6906405522555787, + "grad_norm": 1.1900309324264526, + "learning_rate": 0.00019205048534524406, + "loss": 1.2467, + "step": 2151 + }, + { + "epoch": 0.6909616310804303, + "grad_norm": 1.3887277841567993, + "learning_rate": 0.00019203684064812045, + "loss": 1.2293, + "step": 2152 + }, + { + "epoch": 0.6912827099052817, + "grad_norm": 1.1636091470718384, + "learning_rate": 0.00019202318473658705, + "loss": 0.8659, + "step": 2153 + }, + { + "epoch": 0.6916037887301333, + "grad_norm": 1.2683323621749878, + "learning_rate": 0.0001920095176123077, + "loss": 0.7881, + "step": 2154 + }, + { + "epoch": 0.6919248675549847, + "grad_norm": 1.227537989616394, + "learning_rate": 0.00019199583927694772, + "loss": 0.841, + "step": 2155 + }, + { + "epoch": 0.6922459463798363, + "grad_norm": 1.9995452165603638, + "learning_rate": 0.00019198214973217378, + "loss": 0.8185, + "step": 2156 + }, + { + "epoch": 0.6925670252046877, + "grad_norm": 1.2101178169250488, + "learning_rate": 0.00019196844897965393, + "loss": 0.5357, + "step": 2157 + }, + { + "epoch": 0.6928881040295393, + "grad_norm": 0.990922749042511, + "learning_rate": 0.00019195473702105748, + "loss": 0.8727, + "step": 2158 + }, + { + "epoch": 0.6932091828543907, + "grad_norm": 1.5191935300827026, + "learning_rate": 0.0001919410138580553, + "loss": 1.1395, + "step": 2159 + }, + { + "epoch": 0.6935302616792423, + "grad_norm": 1.074205994606018, + "learning_rate": 0.00019192727949231945, + "loss": 1.1821, + "step": 2160 + }, + { + "epoch": 0.6938513405040938, + "grad_norm": 1.0748862028121948, + "learning_rate": 0.00019191353392552344, + "loss": 0.9226, + "step": 2161 + }, + { + "epoch": 0.6941724193289452, + "grad_norm": 1.0452769994735718, + "learning_rate": 0.00019189977715934213, + "loss": 0.8429, + "step": 2162 + }, + { + "epoch": 0.6944934981537968, + "grad_norm": 1.0717366933822632, + "learning_rate": 0.00019188600919545174, + "loss": 0.8875, + "step": 2163 + }, + { + "epoch": 0.6948145769786482, + "grad_norm": 0.7801916599273682, + "learning_rate": 0.00019187223003552985, + "loss": 0.8721, + "step": 2164 + }, + { + "epoch": 0.6951356558034998, + "grad_norm": 1.0750560760498047, + "learning_rate": 0.0001918584396812554, + "loss": 1.103, + "step": 2165 + }, + { + "epoch": 0.6954567346283512, + "grad_norm": 1.0144506692886353, + "learning_rate": 0.00019184463813430873, + "loss": 1.0441, + "step": 2166 + }, + { + "epoch": 0.6957778134532028, + "grad_norm": 1.0422799587249756, + "learning_rate": 0.00019183082539637146, + "loss": 0.8684, + "step": 2167 + }, + { + "epoch": 0.6960988922780542, + "grad_norm": 1.1190441846847534, + "learning_rate": 0.0001918170014691267, + "loss": 0.8669, + "step": 2168 + }, + { + "epoch": 0.6964199711029058, + "grad_norm": 1.2634443044662476, + "learning_rate": 0.0001918031663542588, + "loss": 0.8319, + "step": 2169 + }, + { + "epoch": 0.6967410499277573, + "grad_norm": 1.256981611251831, + "learning_rate": 0.0001917893200534536, + "loss": 0.9238, + "step": 2170 + }, + { + "epoch": 0.6970621287526088, + "grad_norm": 1.5736911296844482, + "learning_rate": 0.00019177546256839812, + "loss": 1.1809, + "step": 2171 + }, + { + "epoch": 0.6973832075774603, + "grad_norm": 0.890072762966156, + "learning_rate": 0.00019176159390078094, + "loss": 0.8873, + "step": 2172 + }, + { + "epoch": 0.6977042864023117, + "grad_norm": 0.9746190309524536, + "learning_rate": 0.00019174771405229186, + "loss": 0.8685, + "step": 2173 + }, + { + "epoch": 0.6980253652271633, + "grad_norm": 1.4396299123764038, + "learning_rate": 0.00019173382302462214, + "loss": 0.9388, + "step": 2174 + }, + { + "epoch": 0.6983464440520147, + "grad_norm": 1.3588879108428955, + "learning_rate": 0.00019171992081946435, + "loss": 1.0749, + "step": 2175 + }, + { + "epoch": 0.6986675228768663, + "grad_norm": 2.2067346572875977, + "learning_rate": 0.0001917060074385124, + "loss": 1.0014, + "step": 2176 + }, + { + "epoch": 0.6989886017017177, + "grad_norm": 1.2330677509307861, + "learning_rate": 0.00019169208288346166, + "loss": 1.1379, + "step": 2177 + }, + { + "epoch": 0.6993096805265693, + "grad_norm": 1.0044240951538086, + "learning_rate": 0.0001916781471560087, + "loss": 0.9738, + "step": 2178 + }, + { + "epoch": 0.6996307593514208, + "grad_norm": 1.3933671712875366, + "learning_rate": 0.00019166420025785164, + "loss": 1.0522, + "step": 2179 + }, + { + "epoch": 0.6999518381762723, + "grad_norm": 0.9893519282341003, + "learning_rate": 0.0001916502421906898, + "loss": 0.8907, + "step": 2180 + }, + { + "epoch": 0.7002729170011238, + "grad_norm": 1.0318580865859985, + "learning_rate": 0.00019163627295622397, + "loss": 0.9829, + "step": 2181 + }, + { + "epoch": 0.7005939958259753, + "grad_norm": 1.4720815420150757, + "learning_rate": 0.00019162229255615624, + "loss": 0.9374, + "step": 2182 + }, + { + "epoch": 0.7009150746508268, + "grad_norm": 1.006734013557434, + "learning_rate": 0.00019160830099219006, + "loss": 1.04, + "step": 2183 + }, + { + "epoch": 0.7012361534756782, + "grad_norm": 2.0803046226501465, + "learning_rate": 0.00019159429826603032, + "loss": 1.1813, + "step": 2184 + }, + { + "epoch": 0.7015572323005298, + "grad_norm": 1.770952820777893, + "learning_rate": 0.00019158028437938317, + "loss": 1.0018, + "step": 2185 + }, + { + "epoch": 0.7018783111253812, + "grad_norm": 1.0838485956192017, + "learning_rate": 0.00019156625933395614, + "loss": 1.0851, + "step": 2186 + }, + { + "epoch": 0.7021993899502328, + "grad_norm": 1.384687066078186, + "learning_rate": 0.00019155222313145816, + "loss": 0.9151, + "step": 2187 + }, + { + "epoch": 0.7025204687750842, + "grad_norm": 0.8908778429031372, + "learning_rate": 0.0001915381757735995, + "loss": 0.9169, + "step": 2188 + }, + { + "epoch": 0.7028415475999358, + "grad_norm": 0.814580500125885, + "learning_rate": 0.00019152411726209176, + "loss": 0.5335, + "step": 2189 + }, + { + "epoch": 0.7031626264247873, + "grad_norm": 1.491808533668518, + "learning_rate": 0.000191510047598648, + "loss": 0.9649, + "step": 2190 + }, + { + "epoch": 0.7034837052496388, + "grad_norm": 1.308821678161621, + "learning_rate": 0.0001914959667849825, + "loss": 0.9609, + "step": 2191 + }, + { + "epoch": 0.7038047840744903, + "grad_norm": 1.6235363483428955, + "learning_rate": 0.00019148187482281097, + "loss": 0.8551, + "step": 2192 + }, + { + "epoch": 0.7041258628993418, + "grad_norm": 1.9721722602844238, + "learning_rate": 0.0001914677717138505, + "loss": 0.8588, + "step": 2193 + }, + { + "epoch": 0.7044469417241933, + "grad_norm": 1.485297679901123, + "learning_rate": 0.00019145365745981948, + "loss": 0.7772, + "step": 2194 + }, + { + "epoch": 0.7047680205490447, + "grad_norm": 1.1313917636871338, + "learning_rate": 0.00019143953206243776, + "loss": 0.8873, + "step": 2195 + }, + { + "epoch": 0.7050890993738963, + "grad_norm": 1.1025351285934448, + "learning_rate": 0.00019142539552342638, + "loss": 0.8683, + "step": 2196 + }, + { + "epoch": 0.7054101781987477, + "grad_norm": 1.7644602060317993, + "learning_rate": 0.0001914112478445079, + "loss": 0.6767, + "step": 2197 + }, + { + "epoch": 0.7057312570235993, + "grad_norm": 0.9576209783554077, + "learning_rate": 0.00019139708902740613, + "loss": 0.6193, + "step": 2198 + }, + { + "epoch": 0.7060523358484508, + "grad_norm": 1.4080398082733154, + "learning_rate": 0.0001913829190738463, + "loss": 0.7909, + "step": 2199 + }, + { + "epoch": 0.7063734146733023, + "grad_norm": 1.04661226272583, + "learning_rate": 0.000191368737985555, + "loss": 0.6024, + "step": 2200 + }, + { + "epoch": 0.7066944934981538, + "grad_norm": 1.448823094367981, + "learning_rate": 0.0001913545457642601, + "loss": 1.4655, + "step": 2201 + }, + { + "epoch": 0.7070155723230053, + "grad_norm": 1.4213786125183105, + "learning_rate": 0.0001913403424116909, + "loss": 1.2999, + "step": 2202 + }, + { + "epoch": 0.7073366511478568, + "grad_norm": 1.3486974239349365, + "learning_rate": 0.00019132612792957808, + "loss": 0.946, + "step": 2203 + }, + { + "epoch": 0.7076577299727083, + "grad_norm": 1.5810649394989014, + "learning_rate": 0.00019131190231965356, + "loss": 0.8893, + "step": 2204 + }, + { + "epoch": 0.7079788087975598, + "grad_norm": 1.3420324325561523, + "learning_rate": 0.00019129766558365074, + "loss": 0.7434, + "step": 2205 + }, + { + "epoch": 0.7082998876224113, + "grad_norm": 1.3447771072387695, + "learning_rate": 0.0001912834177233043, + "loss": 0.8466, + "step": 2206 + }, + { + "epoch": 0.7086209664472628, + "grad_norm": 1.1809029579162598, + "learning_rate": 0.0001912691587403503, + "loss": 0.6462, + "step": 2207 + }, + { + "epoch": 0.7089420452721144, + "grad_norm": 1.2314188480377197, + "learning_rate": 0.00019125488863652615, + "loss": 0.5693, + "step": 2208 + }, + { + "epoch": 0.7092631240969658, + "grad_norm": 1.04121994972229, + "learning_rate": 0.00019124060741357063, + "loss": 0.8847, + "step": 2209 + }, + { + "epoch": 0.7095842029218173, + "grad_norm": 1.0673465728759766, + "learning_rate": 0.00019122631507322387, + "loss": 1.0044, + "step": 2210 + }, + { + "epoch": 0.7099052817466688, + "grad_norm": 0.6472902894020081, + "learning_rate": 0.0001912120116172273, + "loss": 0.7363, + "step": 2211 + }, + { + "epoch": 0.7102263605715203, + "grad_norm": 0.9594241380691528, + "learning_rate": 0.00019119769704732382, + "loss": 0.7698, + "step": 2212 + }, + { + "epoch": 0.7105474393963718, + "grad_norm": 1.1029056310653687, + "learning_rate": 0.0001911833713652576, + "loss": 1.025, + "step": 2213 + }, + { + "epoch": 0.7108685182212233, + "grad_norm": 1.1992616653442383, + "learning_rate": 0.00019116903457277413, + "loss": 1.0412, + "step": 2214 + }, + { + "epoch": 0.7111895970460748, + "grad_norm": 1.4310470819473267, + "learning_rate": 0.00019115468667162038, + "loss": 0.8216, + "step": 2215 + }, + { + "epoch": 0.7115106758709263, + "grad_norm": 1.5409244298934937, + "learning_rate": 0.00019114032766354453, + "loss": 0.5844, + "step": 2216 + }, + { + "epoch": 0.7118317546957779, + "grad_norm": 1.4199776649475098, + "learning_rate": 0.00019112595755029624, + "loss": 1.0082, + "step": 2217 + }, + { + "epoch": 0.7121528335206293, + "grad_norm": 1.1927727460861206, + "learning_rate": 0.0001911115763336264, + "loss": 0.8184, + "step": 2218 + }, + { + "epoch": 0.7124739123454809, + "grad_norm": 0.9337337017059326, + "learning_rate": 0.0001910971840152874, + "loss": 0.8842, + "step": 2219 + }, + { + "epoch": 0.7127949911703323, + "grad_norm": 0.9035426378250122, + "learning_rate": 0.0001910827805970328, + "loss": 0.8288, + "step": 2220 + }, + { + "epoch": 0.7131160699951838, + "grad_norm": 1.3903597593307495, + "learning_rate": 0.00019106836608061772, + "loss": 0.8678, + "step": 2221 + }, + { + "epoch": 0.7134371488200353, + "grad_norm": 1.2221095561981201, + "learning_rate": 0.00019105394046779845, + "loss": 0.7565, + "step": 2222 + }, + { + "epoch": 0.7137582276448868, + "grad_norm": 1.1277964115142822, + "learning_rate": 0.00019103950376033276, + "loss": 0.9622, + "step": 2223 + }, + { + "epoch": 0.7140793064697383, + "grad_norm": 1.453009843826294, + "learning_rate": 0.00019102505595997965, + "loss": 0.9864, + "step": 2224 + }, + { + "epoch": 0.7144003852945898, + "grad_norm": 1.2029697895050049, + "learning_rate": 0.00019101059706849957, + "loss": 1.1135, + "step": 2225 + }, + { + "epoch": 0.7147214641194414, + "grad_norm": 1.6069939136505127, + "learning_rate": 0.00019099612708765434, + "loss": 1.078, + "step": 2226 + }, + { + "epoch": 0.7150425429442928, + "grad_norm": 1.1098419427871704, + "learning_rate": 0.000190981646019207, + "loss": 0.8924, + "step": 2227 + }, + { + "epoch": 0.7153636217691444, + "grad_norm": 1.2886768579483032, + "learning_rate": 0.0001909671538649221, + "loss": 1.105, + "step": 2228 + }, + { + "epoch": 0.7156847005939958, + "grad_norm": 0.9400178790092468, + "learning_rate": 0.00019095265062656544, + "loss": 1.1208, + "step": 2229 + }, + { + "epoch": 0.7160057794188474, + "grad_norm": 1.2319140434265137, + "learning_rate": 0.00019093813630590418, + "loss": 0.8575, + "step": 2230 + }, + { + "epoch": 0.7163268582436988, + "grad_norm": 1.095289945602417, + "learning_rate": 0.00019092361090470688, + "loss": 0.842, + "step": 2231 + }, + { + "epoch": 0.7166479370685503, + "grad_norm": 1.421386957168579, + "learning_rate": 0.00019090907442474334, + "loss": 1.0803, + "step": 2232 + }, + { + "epoch": 0.7169690158934018, + "grad_norm": 1.5871707201004028, + "learning_rate": 0.00019089452686778488, + "loss": 1.0677, + "step": 2233 + }, + { + "epoch": 0.7172900947182533, + "grad_norm": 1.591778039932251, + "learning_rate": 0.00019087996823560402, + "loss": 1.0679, + "step": 2234 + }, + { + "epoch": 0.7176111735431049, + "grad_norm": 2.42740797996521, + "learning_rate": 0.0001908653985299747, + "loss": 1.0548, + "step": 2235 + }, + { + "epoch": 0.7179322523679563, + "grad_norm": 1.0320085287094116, + "learning_rate": 0.0001908508177526722, + "loss": 0.7861, + "step": 2236 + }, + { + "epoch": 0.7182533311928079, + "grad_norm": 1.18021821975708, + "learning_rate": 0.00019083622590547312, + "loss": 0.8066, + "step": 2237 + }, + { + "epoch": 0.7185744100176593, + "grad_norm": 1.0343506336212158, + "learning_rate": 0.00019082162299015546, + "loss": 0.8142, + "step": 2238 + }, + { + "epoch": 0.7188954888425109, + "grad_norm": 1.4337462186813354, + "learning_rate": 0.00019080700900849851, + "loss": 0.9664, + "step": 2239 + }, + { + "epoch": 0.7192165676673623, + "grad_norm": 1.5604829788208008, + "learning_rate": 0.000190792383962283, + "loss": 1.024, + "step": 2240 + }, + { + "epoch": 0.7195376464922139, + "grad_norm": 1.152066707611084, + "learning_rate": 0.00019077774785329087, + "loss": 1.0221, + "step": 2241 + }, + { + "epoch": 0.7198587253170653, + "grad_norm": 1.9510457515716553, + "learning_rate": 0.00019076310068330554, + "loss": 1.0129, + "step": 2242 + }, + { + "epoch": 0.7201798041419168, + "grad_norm": 1.1222991943359375, + "learning_rate": 0.0001907484424541117, + "loss": 0.8, + "step": 2243 + }, + { + "epoch": 0.7205008829667684, + "grad_norm": 1.8768889904022217, + "learning_rate": 0.00019073377316749542, + "loss": 0.8479, + "step": 2244 + }, + { + "epoch": 0.7208219617916198, + "grad_norm": 1.251123309135437, + "learning_rate": 0.00019071909282524413, + "loss": 1.0588, + "step": 2245 + }, + { + "epoch": 0.7211430406164714, + "grad_norm": 1.2855134010314941, + "learning_rate": 0.0001907044014291465, + "loss": 0.9369, + "step": 2246 + }, + { + "epoch": 0.7214641194413228, + "grad_norm": 0.9821279644966125, + "learning_rate": 0.0001906896989809927, + "loss": 0.6617, + "step": 2247 + }, + { + "epoch": 0.7217851982661744, + "grad_norm": 1.674775242805481, + "learning_rate": 0.00019067498548257423, + "loss": 0.7747, + "step": 2248 + }, + { + "epoch": 0.7221062770910258, + "grad_norm": 1.1248822212219238, + "learning_rate": 0.00019066026093568378, + "loss": 0.739, + "step": 2249 + }, + { + "epoch": 0.7224273559158774, + "grad_norm": 0.8824935555458069, + "learning_rate": 0.00019064552534211554, + "loss": 0.5349, + "step": 2250 + }, + { + "epoch": 0.7227484347407288, + "grad_norm": 1.197925329208374, + "learning_rate": 0.000190630778703665, + "loss": 1.3444, + "step": 2251 + }, + { + "epoch": 0.7230695135655804, + "grad_norm": 0.9842731356620789, + "learning_rate": 0.00019061602102212898, + "loss": 1.3156, + "step": 2252 + }, + { + "epoch": 0.7233905923904318, + "grad_norm": 1.0988041162490845, + "learning_rate": 0.0001906012522993057, + "loss": 1.1763, + "step": 2253 + }, + { + "epoch": 0.7237116712152833, + "grad_norm": 1.067678451538086, + "learning_rate": 0.0001905864725369946, + "loss": 0.8682, + "step": 2254 + }, + { + "epoch": 0.7240327500401349, + "grad_norm": 1.3331475257873535, + "learning_rate": 0.00019057168173699664, + "loss": 0.7538, + "step": 2255 + }, + { + "epoch": 0.7243538288649863, + "grad_norm": 1.1442729234695435, + "learning_rate": 0.00019055687990111398, + "loss": 0.7896, + "step": 2256 + }, + { + "epoch": 0.7246749076898379, + "grad_norm": 1.0878592729568481, + "learning_rate": 0.0001905420670311502, + "loss": 0.5776, + "step": 2257 + }, + { + "epoch": 0.7249959865146893, + "grad_norm": 1.124906301498413, + "learning_rate": 0.00019052724312891014, + "loss": 0.8746, + "step": 2258 + }, + { + "epoch": 0.7253170653395409, + "grad_norm": 1.1294090747833252, + "learning_rate": 0.00019051240819620014, + "loss": 1.0163, + "step": 2259 + }, + { + "epoch": 0.7256381441643923, + "grad_norm": 1.1215919256210327, + "learning_rate": 0.0001904975622348278, + "loss": 1.042, + "step": 2260 + }, + { + "epoch": 0.7259592229892439, + "grad_norm": 0.8851989507675171, + "learning_rate": 0.00019048270524660196, + "loss": 0.6864, + "step": 2261 + }, + { + "epoch": 0.7262803018140953, + "grad_norm": 0.9922202229499817, + "learning_rate": 0.00019046783723333297, + "loss": 0.7434, + "step": 2262 + }, + { + "epoch": 0.7266013806389469, + "grad_norm": 1.0879603624343872, + "learning_rate": 0.00019045295819683242, + "loss": 0.7117, + "step": 2263 + }, + { + "epoch": 0.7269224594637984, + "grad_norm": 1.3650341033935547, + "learning_rate": 0.0001904380681389133, + "loss": 0.9001, + "step": 2264 + }, + { + "epoch": 0.7272435382886498, + "grad_norm": 1.5964879989624023, + "learning_rate": 0.00019042316706138987, + "loss": 1.0102, + "step": 2265 + }, + { + "epoch": 0.7275646171135014, + "grad_norm": 0.8338374495506287, + "learning_rate": 0.00019040825496607786, + "loss": 0.8816, + "step": 2266 + }, + { + "epoch": 0.7278856959383528, + "grad_norm": 1.0702760219573975, + "learning_rate": 0.00019039333185479418, + "loss": 0.8459, + "step": 2267 + }, + { + "epoch": 0.7282067747632044, + "grad_norm": 0.976639449596405, + "learning_rate": 0.0001903783977293572, + "loss": 0.8101, + "step": 2268 + }, + { + "epoch": 0.7285278535880558, + "grad_norm": 1.2155683040618896, + "learning_rate": 0.00019036345259158667, + "loss": 1.0607, + "step": 2269 + }, + { + "epoch": 0.7288489324129074, + "grad_norm": 1.2647230625152588, + "learning_rate": 0.0001903484964433035, + "loss": 1.1008, + "step": 2270 + }, + { + "epoch": 0.7291700112377588, + "grad_norm": 0.9674025177955627, + "learning_rate": 0.0001903335292863301, + "loss": 0.7245, + "step": 2271 + }, + { + "epoch": 0.7294910900626104, + "grad_norm": 1.2419253587722778, + "learning_rate": 0.00019031855112249015, + "loss": 0.8682, + "step": 2272 + }, + { + "epoch": 0.7298121688874619, + "grad_norm": 0.9692038297653198, + "learning_rate": 0.00019030356195360874, + "loss": 0.9075, + "step": 2273 + }, + { + "epoch": 0.7301332477123134, + "grad_norm": 1.1425732374191284, + "learning_rate": 0.0001902885617815122, + "loss": 0.7757, + "step": 2274 + }, + { + "epoch": 0.7304543265371649, + "grad_norm": 0.9863192439079285, + "learning_rate": 0.0001902735506080283, + "loss": 0.8154, + "step": 2275 + }, + { + "epoch": 0.7307754053620164, + "grad_norm": 0.9807154536247253, + "learning_rate": 0.00019025852843498607, + "loss": 0.9582, + "step": 2276 + }, + { + "epoch": 0.7310964841868679, + "grad_norm": 1.2582796812057495, + "learning_rate": 0.00019024349526421594, + "loss": 1.009, + "step": 2277 + }, + { + "epoch": 0.7314175630117193, + "grad_norm": 1.0475612878799438, + "learning_rate": 0.00019022845109754966, + "loss": 0.997, + "step": 2278 + }, + { + "epoch": 0.7317386418365709, + "grad_norm": 1.6267993450164795, + "learning_rate": 0.00019021339593682028, + "loss": 0.9519, + "step": 2279 + }, + { + "epoch": 0.7320597206614223, + "grad_norm": 1.2485898733139038, + "learning_rate": 0.00019019832978386228, + "loss": 0.8931, + "step": 2280 + }, + { + "epoch": 0.7323807994862739, + "grad_norm": 0.9349825382232666, + "learning_rate": 0.0001901832526405114, + "loss": 0.8948, + "step": 2281 + }, + { + "epoch": 0.7327018783111254, + "grad_norm": 1.5776710510253906, + "learning_rate": 0.00019016816450860474, + "loss": 0.9504, + "step": 2282 + }, + { + "epoch": 0.7330229571359769, + "grad_norm": 0.9056895971298218, + "learning_rate": 0.0001901530653899807, + "loss": 0.8712, + "step": 2283 + }, + { + "epoch": 0.7333440359608284, + "grad_norm": 1.3263038396835327, + "learning_rate": 0.00019013795528647912, + "loss": 0.8974, + "step": 2284 + }, + { + "epoch": 0.7336651147856799, + "grad_norm": 1.0501210689544678, + "learning_rate": 0.00019012283419994115, + "loss": 0.888, + "step": 2285 + }, + { + "epoch": 0.7339861936105314, + "grad_norm": 1.9327582120895386, + "learning_rate": 0.00019010770213220916, + "loss": 1.0191, + "step": 2286 + }, + { + "epoch": 0.7343072724353829, + "grad_norm": 0.9051433205604553, + "learning_rate": 0.000190092559085127, + "loss": 0.736, + "step": 2287 + }, + { + "epoch": 0.7346283512602344, + "grad_norm": 2.308243989944458, + "learning_rate": 0.00019007740506053983, + "loss": 0.9449, + "step": 2288 + }, + { + "epoch": 0.7349494300850858, + "grad_norm": 0.9226866960525513, + "learning_rate": 0.00019006224006029406, + "loss": 0.9529, + "step": 2289 + }, + { + "epoch": 0.7352705089099374, + "grad_norm": 1.1376844644546509, + "learning_rate": 0.0001900470640862375, + "loss": 0.8272, + "step": 2290 + }, + { + "epoch": 0.7355915877347889, + "grad_norm": 1.475791335105896, + "learning_rate": 0.00019003187714021938, + "loss": 1.1438, + "step": 2291 + }, + { + "epoch": 0.7359126665596404, + "grad_norm": 0.9544022679328918, + "learning_rate": 0.00019001667922409008, + "loss": 0.8274, + "step": 2292 + }, + { + "epoch": 0.7362337453844919, + "grad_norm": 1.031445860862732, + "learning_rate": 0.00019000147033970148, + "loss": 0.7776, + "step": 2293 + }, + { + "epoch": 0.7365548242093434, + "grad_norm": 0.8652722835540771, + "learning_rate": 0.00018998625048890672, + "loss": 0.7621, + "step": 2294 + }, + { + "epoch": 0.7368759030341949, + "grad_norm": 1.1719671487808228, + "learning_rate": 0.0001899710196735603, + "loss": 0.9207, + "step": 2295 + }, + { + "epoch": 0.7371969818590464, + "grad_norm": 1.2041524648666382, + "learning_rate": 0.00018995577789551803, + "loss": 0.8044, + "step": 2296 + }, + { + "epoch": 0.7375180606838979, + "grad_norm": 1.1736308336257935, + "learning_rate": 0.0001899405251566371, + "loss": 0.8288, + "step": 2297 + }, + { + "epoch": 0.7378391395087494, + "grad_norm": 0.8900883793830872, + "learning_rate": 0.000189925261458776, + "loss": 0.7487, + "step": 2298 + }, + { + "epoch": 0.7381602183336009, + "grad_norm": 0.7392171621322632, + "learning_rate": 0.00018990998680379456, + "loss": 0.6899, + "step": 2299 + }, + { + "epoch": 0.7384812971584525, + "grad_norm": 1.4253010749816895, + "learning_rate": 0.00018989470119355398, + "loss": 0.6706, + "step": 2300 + }, + { + "epoch": 0.7388023759833039, + "grad_norm": 0.8190407752990723, + "learning_rate": 0.0001898794046299167, + "loss": 1.2835, + "step": 2301 + }, + { + "epoch": 0.7391234548081554, + "grad_norm": 1.0139261484146118, + "learning_rate": 0.00018986409711474665, + "loss": 0.9747, + "step": 2302 + }, + { + "epoch": 0.7394445336330069, + "grad_norm": 1.0037243366241455, + "learning_rate": 0.00018984877864990888, + "loss": 0.7472, + "step": 2303 + }, + { + "epoch": 0.7397656124578584, + "grad_norm": 1.3445838689804077, + "learning_rate": 0.00018983344923727003, + "loss": 0.8011, + "step": 2304 + }, + { + "epoch": 0.7400866912827099, + "grad_norm": 1.1861538887023926, + "learning_rate": 0.00018981810887869785, + "loss": 0.5261, + "step": 2305 + }, + { + "epoch": 0.7404077701075614, + "grad_norm": 1.1027861833572388, + "learning_rate": 0.00018980275757606157, + "loss": 0.6695, + "step": 2306 + }, + { + "epoch": 0.7407288489324129, + "grad_norm": 1.280778169631958, + "learning_rate": 0.0001897873953312317, + "loss": 0.7354, + "step": 2307 + }, + { + "epoch": 0.7410499277572644, + "grad_norm": 1.1856905221939087, + "learning_rate": 0.00018977202214608, + "loss": 1.0707, + "step": 2308 + }, + { + "epoch": 0.7413710065821159, + "grad_norm": 0.8002036809921265, + "learning_rate": 0.00018975663802247976, + "loss": 0.763, + "step": 2309 + }, + { + "epoch": 0.7416920854069674, + "grad_norm": 1.3091520071029663, + "learning_rate": 0.0001897412429623054, + "loss": 1.1745, + "step": 2310 + }, + { + "epoch": 0.742013164231819, + "grad_norm": 1.0086097717285156, + "learning_rate": 0.00018972583696743285, + "loss": 1.039, + "step": 2311 + }, + { + "epoch": 0.7423342430566704, + "grad_norm": 1.0016741752624512, + "learning_rate": 0.00018971042003973924, + "loss": 0.9677, + "step": 2312 + }, + { + "epoch": 0.742655321881522, + "grad_norm": 1.2029709815979004, + "learning_rate": 0.000189694992181103, + "loss": 1.062, + "step": 2313 + }, + { + "epoch": 0.7429764007063734, + "grad_norm": 0.854852020740509, + "learning_rate": 0.00018967955339340407, + "loss": 0.8209, + "step": 2314 + }, + { + "epoch": 0.7432974795312249, + "grad_norm": 0.892218828201294, + "learning_rate": 0.00018966410367852362, + "loss": 1.0757, + "step": 2315 + }, + { + "epoch": 0.7436185583560764, + "grad_norm": 0.8785187005996704, + "learning_rate": 0.00018964864303834406, + "loss": 0.9317, + "step": 2316 + }, + { + "epoch": 0.7439396371809279, + "grad_norm": 0.999880850315094, + "learning_rate": 0.0001896331714747493, + "loss": 0.8786, + "step": 2317 + }, + { + "epoch": 0.7442607160057794, + "grad_norm": 1.373669147491455, + "learning_rate": 0.0001896176889896245, + "loss": 1.0008, + "step": 2318 + }, + { + "epoch": 0.7445817948306309, + "grad_norm": 1.0708602666854858, + "learning_rate": 0.0001896021955848561, + "loss": 1.0163, + "step": 2319 + }, + { + "epoch": 0.7449028736554825, + "grad_norm": 1.0174452066421509, + "learning_rate": 0.00018958669126233199, + "loss": 0.9382, + "step": 2320 + }, + { + "epoch": 0.7452239524803339, + "grad_norm": 1.2005442380905151, + "learning_rate": 0.0001895711760239413, + "loss": 0.8355, + "step": 2321 + }, + { + "epoch": 0.7455450313051855, + "grad_norm": 1.3386541604995728, + "learning_rate": 0.0001895556498715745, + "loss": 0.8852, + "step": 2322 + }, + { + "epoch": 0.7458661101300369, + "grad_norm": 1.0639350414276123, + "learning_rate": 0.0001895401128071234, + "loss": 0.9128, + "step": 2323 + }, + { + "epoch": 0.7461871889548884, + "grad_norm": 0.9678146243095398, + "learning_rate": 0.00018952456483248119, + "loss": 0.9178, + "step": 2324 + }, + { + "epoch": 0.7465082677797399, + "grad_norm": 1.260991096496582, + "learning_rate": 0.00018950900594954227, + "loss": 0.9372, + "step": 2325 + }, + { + "epoch": 0.7468293466045914, + "grad_norm": 1.2211194038391113, + "learning_rate": 0.00018949343616020252, + "loss": 0.8576, + "step": 2326 + }, + { + "epoch": 0.7471504254294429, + "grad_norm": 1.1809611320495605, + "learning_rate": 0.00018947785546635904, + "loss": 0.9619, + "step": 2327 + }, + { + "epoch": 0.7474715042542944, + "grad_norm": 2.355696678161621, + "learning_rate": 0.00018946226386991027, + "loss": 1.1663, + "step": 2328 + }, + { + "epoch": 0.747792583079146, + "grad_norm": 1.0631635189056396, + "learning_rate": 0.000189446661372756, + "loss": 0.8316, + "step": 2329 + }, + { + "epoch": 0.7481136619039974, + "grad_norm": 1.7369928359985352, + "learning_rate": 0.0001894310479767974, + "loss": 0.8755, + "step": 2330 + }, + { + "epoch": 0.748434740728849, + "grad_norm": 2.2223386764526367, + "learning_rate": 0.0001894154236839368, + "loss": 1.1898, + "step": 2331 + }, + { + "epoch": 0.7487558195537004, + "grad_norm": 1.2930457592010498, + "learning_rate": 0.00018939978849607814, + "loss": 0.8362, + "step": 2332 + }, + { + "epoch": 0.749076898378552, + "grad_norm": 0.9054352045059204, + "learning_rate": 0.0001893841424151264, + "loss": 0.8712, + "step": 2333 + }, + { + "epoch": 0.7493979772034034, + "grad_norm": 1.637449026107788, + "learning_rate": 0.000189368485442988, + "loss": 1.0388, + "step": 2334 + }, + { + "epoch": 0.749719056028255, + "grad_norm": 1.306647777557373, + "learning_rate": 0.00018935281758157078, + "loss": 0.8911, + "step": 2335 + }, + { + "epoch": 0.7500401348531064, + "grad_norm": 1.2520787715911865, + "learning_rate": 0.00018933713883278376, + "loss": 0.8615, + "step": 2336 + }, + { + "epoch": 0.7503612136779579, + "grad_norm": 1.5709820985794067, + "learning_rate": 0.0001893214491985374, + "loss": 0.8583, + "step": 2337 + }, + { + "epoch": 0.7506822925028095, + "grad_norm": 1.1838668584823608, + "learning_rate": 0.00018930574868074334, + "loss": 0.9393, + "step": 2338 + }, + { + "epoch": 0.7510033713276609, + "grad_norm": 1.7159479856491089, + "learning_rate": 0.0001892900372813147, + "loss": 0.7978, + "step": 2339 + }, + { + "epoch": 0.7513244501525125, + "grad_norm": 0.8659077882766724, + "learning_rate": 0.00018927431500216586, + "loss": 0.8576, + "step": 2340 + }, + { + "epoch": 0.7516455289773639, + "grad_norm": 1.2940974235534668, + "learning_rate": 0.00018925858184521256, + "loss": 0.7817, + "step": 2341 + }, + { + "epoch": 0.7519666078022155, + "grad_norm": 1.3620824813842773, + "learning_rate": 0.0001892428378123718, + "loss": 1.0294, + "step": 2342 + }, + { + "epoch": 0.7522876866270669, + "grad_norm": 1.067583680152893, + "learning_rate": 0.00018922708290556198, + "loss": 0.6811, + "step": 2343 + }, + { + "epoch": 0.7526087654519185, + "grad_norm": 1.2058768272399902, + "learning_rate": 0.0001892113171267027, + "loss": 0.8882, + "step": 2344 + }, + { + "epoch": 0.7529298442767699, + "grad_norm": 1.3467442989349365, + "learning_rate": 0.0001891955404777151, + "loss": 0.6371, + "step": 2345 + }, + { + "epoch": 0.7532509231016215, + "grad_norm": 0.8499478697776794, + "learning_rate": 0.00018917975296052142, + "loss": 0.7051, + "step": 2346 + }, + { + "epoch": 0.753572001926473, + "grad_norm": 1.8910988569259644, + "learning_rate": 0.00018916395457704534, + "loss": 0.7791, + "step": 2347 + }, + { + "epoch": 0.7538930807513244, + "grad_norm": 1.0912384986877441, + "learning_rate": 0.00018914814532921187, + "loss": 0.6608, + "step": 2348 + }, + { + "epoch": 0.754214159576176, + "grad_norm": 0.564804196357727, + "learning_rate": 0.00018913232521894732, + "loss": 0.4543, + "step": 2349 + }, + { + "epoch": 0.7545352384010274, + "grad_norm": 0.8918449878692627, + "learning_rate": 0.00018911649424817933, + "loss": 0.4786, + "step": 2350 + }, + { + "epoch": 0.754856317225879, + "grad_norm": 1.0368508100509644, + "learning_rate": 0.0001891006524188368, + "loss": 1.2446, + "step": 2351 + }, + { + "epoch": 0.7551773960507304, + "grad_norm": 1.0669269561767578, + "learning_rate": 0.00018908479973285005, + "loss": 1.1619, + "step": 2352 + }, + { + "epoch": 0.755498474875582, + "grad_norm": 1.34662663936615, + "learning_rate": 0.00018906893619215066, + "loss": 0.9378, + "step": 2353 + }, + { + "epoch": 0.7558195537004334, + "grad_norm": 1.1142168045043945, + "learning_rate": 0.0001890530617986716, + "loss": 0.6657, + "step": 2354 + }, + { + "epoch": 0.756140632525285, + "grad_norm": 1.00096595287323, + "learning_rate": 0.00018903717655434707, + "loss": 0.6454, + "step": 2355 + }, + { + "epoch": 0.7564617113501365, + "grad_norm": 1.4639570713043213, + "learning_rate": 0.00018902128046111266, + "loss": 0.6425, + "step": 2356 + }, + { + "epoch": 0.756782790174988, + "grad_norm": 1.1114808320999146, + "learning_rate": 0.00018900537352090524, + "loss": 0.7002, + "step": 2357 + }, + { + "epoch": 0.7571038689998395, + "grad_norm": 1.290031909942627, + "learning_rate": 0.00018898945573566308, + "loss": 0.8956, + "step": 2358 + }, + { + "epoch": 0.7574249478246909, + "grad_norm": 1.2316958904266357, + "learning_rate": 0.00018897352710732564, + "loss": 0.8771, + "step": 2359 + }, + { + "epoch": 0.7577460266495425, + "grad_norm": 1.4026559591293335, + "learning_rate": 0.00018895758763783383, + "loss": 1.0492, + "step": 2360 + }, + { + "epoch": 0.7580671054743939, + "grad_norm": 0.8219192624092102, + "learning_rate": 0.00018894163732912977, + "loss": 0.8172, + "step": 2361 + }, + { + "epoch": 0.7583881842992455, + "grad_norm": 0.9561658501625061, + "learning_rate": 0.000188925676183157, + "loss": 0.8669, + "step": 2362 + }, + { + "epoch": 0.7587092631240969, + "grad_norm": 0.9221644997596741, + "learning_rate": 0.00018890970420186033, + "loss": 0.701, + "step": 2363 + }, + { + "epoch": 0.7590303419489485, + "grad_norm": 1.0757882595062256, + "learning_rate": 0.0001888937213871859, + "loss": 0.7451, + "step": 2364 + }, + { + "epoch": 0.7593514207738, + "grad_norm": 1.2633696794509888, + "learning_rate": 0.00018887772774108116, + "loss": 0.9686, + "step": 2365 + }, + { + "epoch": 0.7596724995986515, + "grad_norm": 1.1361249685287476, + "learning_rate": 0.0001888617232654949, + "loss": 1.032, + "step": 2366 + }, + { + "epoch": 0.759993578423503, + "grad_norm": 1.2562459707260132, + "learning_rate": 0.00018884570796237718, + "loss": 0.9559, + "step": 2367 + }, + { + "epoch": 0.7603146572483545, + "grad_norm": 0.9838060140609741, + "learning_rate": 0.00018882968183367947, + "loss": 0.9249, + "step": 2368 + }, + { + "epoch": 0.760635736073206, + "grad_norm": 0.8353525996208191, + "learning_rate": 0.00018881364488135448, + "loss": 0.8962, + "step": 2369 + }, + { + "epoch": 0.7609568148980574, + "grad_norm": 1.24613618850708, + "learning_rate": 0.00018879759710735622, + "loss": 0.927, + "step": 2370 + }, + { + "epoch": 0.761277893722909, + "grad_norm": 1.1252332925796509, + "learning_rate": 0.00018878153851364013, + "loss": 0.9016, + "step": 2371 + }, + { + "epoch": 0.7615989725477604, + "grad_norm": 0.9512665271759033, + "learning_rate": 0.00018876546910216288, + "loss": 0.9949, + "step": 2372 + }, + { + "epoch": 0.761920051372612, + "grad_norm": 1.5389331579208374, + "learning_rate": 0.00018874938887488248, + "loss": 1.0535, + "step": 2373 + }, + { + "epoch": 0.7622411301974634, + "grad_norm": 0.9745280146598816, + "learning_rate": 0.00018873329783375824, + "loss": 0.9594, + "step": 2374 + }, + { + "epoch": 0.762562209022315, + "grad_norm": 1.1034287214279175, + "learning_rate": 0.0001887171959807508, + "loss": 0.9506, + "step": 2375 + }, + { + "epoch": 0.7628832878471665, + "grad_norm": 1.1382795572280884, + "learning_rate": 0.00018870108331782217, + "loss": 0.771, + "step": 2376 + }, + { + "epoch": 0.763204366672018, + "grad_norm": 1.042029619216919, + "learning_rate": 0.0001886849598469356, + "loss": 0.8624, + "step": 2377 + }, + { + "epoch": 0.7635254454968695, + "grad_norm": 1.0116221904754639, + "learning_rate": 0.00018866882557005567, + "loss": 0.9996, + "step": 2378 + }, + { + "epoch": 0.763846524321721, + "grad_norm": 0.6933526992797852, + "learning_rate": 0.00018865268048914828, + "loss": 0.657, + "step": 2379 + }, + { + "epoch": 0.7641676031465725, + "grad_norm": 1.2314203977584839, + "learning_rate": 0.0001886365246061807, + "loss": 1.1146, + "step": 2380 + }, + { + "epoch": 0.764488681971424, + "grad_norm": 1.3230317831039429, + "learning_rate": 0.00018862035792312147, + "loss": 0.8496, + "step": 2381 + }, + { + "epoch": 0.7648097607962755, + "grad_norm": 0.9279382824897766, + "learning_rate": 0.00018860418044194045, + "loss": 0.8664, + "step": 2382 + }, + { + "epoch": 0.7651308396211269, + "grad_norm": 1.3693788051605225, + "learning_rate": 0.00018858799216460881, + "loss": 0.935, + "step": 2383 + }, + { + "epoch": 0.7654519184459785, + "grad_norm": 1.4639575481414795, + "learning_rate": 0.00018857179309309901, + "loss": 0.9289, + "step": 2384 + }, + { + "epoch": 0.76577299727083, + "grad_norm": 1.192893147468567, + "learning_rate": 0.00018855558322938493, + "loss": 0.841, + "step": 2385 + }, + { + "epoch": 0.7660940760956815, + "grad_norm": 2.4525952339172363, + "learning_rate": 0.0001885393625754416, + "loss": 0.9004, + "step": 2386 + }, + { + "epoch": 0.766415154920533, + "grad_norm": 0.7626288533210754, + "learning_rate": 0.00018852313113324552, + "loss": 0.5966, + "step": 2387 + }, + { + "epoch": 0.7667362337453845, + "grad_norm": 1.5377343893051147, + "learning_rate": 0.00018850688890477445, + "loss": 0.9392, + "step": 2388 + }, + { + "epoch": 0.767057312570236, + "grad_norm": 1.555207371711731, + "learning_rate": 0.00018849063589200743, + "loss": 1.0675, + "step": 2389 + }, + { + "epoch": 0.7673783913950875, + "grad_norm": 1.0154179334640503, + "learning_rate": 0.00018847437209692486, + "loss": 0.6848, + "step": 2390 + }, + { + "epoch": 0.767699470219939, + "grad_norm": 1.9315353631973267, + "learning_rate": 0.0001884580975215084, + "loss": 0.8662, + "step": 2391 + }, + { + "epoch": 0.7680205490447904, + "grad_norm": 1.4244552850723267, + "learning_rate": 0.0001884418121677411, + "loss": 1.0924, + "step": 2392 + }, + { + "epoch": 0.768341627869642, + "grad_norm": 1.190861701965332, + "learning_rate": 0.00018842551603760724, + "loss": 0.8864, + "step": 2393 + }, + { + "epoch": 0.7686627066944935, + "grad_norm": 0.8999507427215576, + "learning_rate": 0.0001884092091330925, + "loss": 0.6656, + "step": 2394 + }, + { + "epoch": 0.768983785519345, + "grad_norm": 1.2089276313781738, + "learning_rate": 0.00018839289145618378, + "loss": 0.7695, + "step": 2395 + }, + { + "epoch": 0.7693048643441965, + "grad_norm": 0.7257319092750549, + "learning_rate": 0.00018837656300886937, + "loss": 0.5922, + "step": 2396 + }, + { + "epoch": 0.769625943169048, + "grad_norm": 0.9687317609786987, + "learning_rate": 0.00018836022379313883, + "loss": 0.6401, + "step": 2397 + }, + { + "epoch": 0.7699470219938995, + "grad_norm": 1.0825469493865967, + "learning_rate": 0.000188343873810983, + "loss": 0.8336, + "step": 2398 + }, + { + "epoch": 0.770268100818751, + "grad_norm": 1.286629319190979, + "learning_rate": 0.00018832751306439418, + "loss": 0.9844, + "step": 2399 + }, + { + "epoch": 0.7705891796436025, + "grad_norm": 0.8030791878700256, + "learning_rate": 0.0001883111415553658, + "loss": 0.5217, + "step": 2400 + }, + { + "epoch": 0.770910258468454, + "grad_norm": 1.3023558855056763, + "learning_rate": 0.00018829475928589271, + "loss": 1.1508, + "step": 2401 + }, + { + "epoch": 0.7712313372933055, + "grad_norm": 1.1620362997055054, + "learning_rate": 0.00018827836625797103, + "loss": 1.1917, + "step": 2402 + }, + { + "epoch": 0.7715524161181571, + "grad_norm": 1.134852647781372, + "learning_rate": 0.00018826196247359817, + "loss": 0.8599, + "step": 2403 + }, + { + "epoch": 0.7718734949430085, + "grad_norm": 0.962922990322113, + "learning_rate": 0.00018824554793477294, + "loss": 0.708, + "step": 2404 + }, + { + "epoch": 0.77219457376786, + "grad_norm": 1.1392818689346313, + "learning_rate": 0.00018822912264349534, + "loss": 0.6393, + "step": 2405 + }, + { + "epoch": 0.7725156525927115, + "grad_norm": 1.5677722692489624, + "learning_rate": 0.00018821268660176678, + "loss": 0.7293, + "step": 2406 + }, + { + "epoch": 0.772836731417563, + "grad_norm": 1.1908127069473267, + "learning_rate": 0.00018819623981158995, + "loss": 0.8353, + "step": 2407 + }, + { + "epoch": 0.7731578102424145, + "grad_norm": 0.9969918727874756, + "learning_rate": 0.00018817978227496883, + "loss": 0.9109, + "step": 2408 + }, + { + "epoch": 0.773478889067266, + "grad_norm": 0.8602403402328491, + "learning_rate": 0.0001881633139939087, + "loss": 0.9064, + "step": 2409 + }, + { + "epoch": 0.7737999678921175, + "grad_norm": 0.8332744836807251, + "learning_rate": 0.0001881468349704162, + "loss": 0.8529, + "step": 2410 + }, + { + "epoch": 0.774121046716969, + "grad_norm": 1.0467489957809448, + "learning_rate": 0.0001881303452064992, + "loss": 0.8339, + "step": 2411 + }, + { + "epoch": 0.7744421255418206, + "grad_norm": 1.618408441543579, + "learning_rate": 0.00018811384470416705, + "loss": 0.7408, + "step": 2412 + }, + { + "epoch": 0.774763204366672, + "grad_norm": 1.06851327419281, + "learning_rate": 0.00018809733346543013, + "loss": 0.857, + "step": 2413 + }, + { + "epoch": 0.7750842831915236, + "grad_norm": 1.3492529392242432, + "learning_rate": 0.00018808081149230036, + "loss": 0.8873, + "step": 2414 + }, + { + "epoch": 0.775405362016375, + "grad_norm": 1.1693357229232788, + "learning_rate": 0.00018806427878679093, + "loss": 1.1389, + "step": 2415 + }, + { + "epoch": 0.7757264408412265, + "grad_norm": 1.2862006425857544, + "learning_rate": 0.0001880477353509162, + "loss": 0.9218, + "step": 2416 + }, + { + "epoch": 0.776047519666078, + "grad_norm": 0.9942718744277954, + "learning_rate": 0.00018803118118669202, + "loss": 0.7227, + "step": 2417 + }, + { + "epoch": 0.7763685984909295, + "grad_norm": 1.2188372611999512, + "learning_rate": 0.00018801461629613546, + "loss": 0.9224, + "step": 2418 + }, + { + "epoch": 0.776689677315781, + "grad_norm": 1.206196904182434, + "learning_rate": 0.00018799804068126485, + "loss": 0.9985, + "step": 2419 + }, + { + "epoch": 0.7770107561406325, + "grad_norm": 1.327648639678955, + "learning_rate": 0.0001879814543440999, + "loss": 0.932, + "step": 2420 + }, + { + "epoch": 0.7773318349654841, + "grad_norm": 1.5388193130493164, + "learning_rate": 0.00018796485728666165, + "loss": 0.9052, + "step": 2421 + }, + { + "epoch": 0.7776529137903355, + "grad_norm": 1.7653731107711792, + "learning_rate": 0.00018794824951097236, + "loss": 1.032, + "step": 2422 + }, + { + "epoch": 0.7779739926151871, + "grad_norm": 1.1087538003921509, + "learning_rate": 0.00018793163101905563, + "loss": 1.283, + "step": 2423 + }, + { + "epoch": 0.7782950714400385, + "grad_norm": 1.3659240007400513, + "learning_rate": 0.0001879150018129364, + "loss": 1.0172, + "step": 2424 + }, + { + "epoch": 0.7786161502648901, + "grad_norm": 0.9709173440933228, + "learning_rate": 0.00018789836189464086, + "loss": 0.9285, + "step": 2425 + }, + { + "epoch": 0.7789372290897415, + "grad_norm": 1.2485631704330444, + "learning_rate": 0.00018788171126619653, + "loss": 0.9621, + "step": 2426 + }, + { + "epoch": 0.779258307914593, + "grad_norm": 1.27434504032135, + "learning_rate": 0.0001878650499296323, + "loss": 0.984, + "step": 2427 + }, + { + "epoch": 0.7795793867394445, + "grad_norm": 1.2106627225875854, + "learning_rate": 0.00018784837788697823, + "loss": 0.9109, + "step": 2428 + }, + { + "epoch": 0.779900465564296, + "grad_norm": 0.9945119023323059, + "learning_rate": 0.00018783169514026578, + "loss": 1.0478, + "step": 2429 + }, + { + "epoch": 0.7802215443891475, + "grad_norm": 2.2942955493927, + "learning_rate": 0.00018781500169152773, + "loss": 1.0132, + "step": 2430 + }, + { + "epoch": 0.780542623213999, + "grad_norm": 1.0083764791488647, + "learning_rate": 0.00018779829754279805, + "loss": 1.0231, + "step": 2431 + }, + { + "epoch": 0.7808637020388506, + "grad_norm": 1.2520384788513184, + "learning_rate": 0.00018778158269611218, + "loss": 0.8075, + "step": 2432 + }, + { + "epoch": 0.781184780863702, + "grad_norm": 1.1380735635757446, + "learning_rate": 0.00018776485715350671, + "loss": 0.8545, + "step": 2433 + }, + { + "epoch": 0.7815058596885536, + "grad_norm": 1.0106878280639648, + "learning_rate": 0.00018774812091701962, + "loss": 0.8459, + "step": 2434 + }, + { + "epoch": 0.781826938513405, + "grad_norm": 1.134502649307251, + "learning_rate": 0.00018773137398869015, + "loss": 0.9232, + "step": 2435 + }, + { + "epoch": 0.7821480173382566, + "grad_norm": 1.356572151184082, + "learning_rate": 0.00018771461637055888, + "loss": 0.9854, + "step": 2436 + }, + { + "epoch": 0.782469096163108, + "grad_norm": 1.156056523323059, + "learning_rate": 0.0001876978480646677, + "loss": 0.9877, + "step": 2437 + }, + { + "epoch": 0.7827901749879596, + "grad_norm": 1.1275618076324463, + "learning_rate": 0.00018768106907305973, + "loss": 0.7909, + "step": 2438 + }, + { + "epoch": 0.783111253812811, + "grad_norm": 0.8512035012245178, + "learning_rate": 0.00018766427939777945, + "loss": 0.9474, + "step": 2439 + }, + { + "epoch": 0.7834323326376625, + "grad_norm": 1.328540563583374, + "learning_rate": 0.00018764747904087263, + "loss": 0.9969, + "step": 2440 + }, + { + "epoch": 0.7837534114625141, + "grad_norm": 0.8241159319877625, + "learning_rate": 0.00018763066800438636, + "loss": 0.5979, + "step": 2441 + }, + { + "epoch": 0.7840744902873655, + "grad_norm": 1.58833646774292, + "learning_rate": 0.00018761384629036902, + "loss": 0.9713, + "step": 2442 + }, + { + "epoch": 0.7843955691122171, + "grad_norm": 1.2584527730941772, + "learning_rate": 0.00018759701390087027, + "loss": 1.0281, + "step": 2443 + }, + { + "epoch": 0.7847166479370685, + "grad_norm": 0.8919162154197693, + "learning_rate": 0.0001875801708379411, + "loss": 0.9197, + "step": 2444 + }, + { + "epoch": 0.7850377267619201, + "grad_norm": 2.073561906814575, + "learning_rate": 0.00018756331710363374, + "loss": 0.9238, + "step": 2445 + }, + { + "epoch": 0.7853588055867715, + "grad_norm": 0.9129263162612915, + "learning_rate": 0.0001875464527000018, + "loss": 0.8524, + "step": 2446 + }, + { + "epoch": 0.7856798844116231, + "grad_norm": 1.1756608486175537, + "learning_rate": 0.00018752957762910018, + "loss": 0.9372, + "step": 2447 + }, + { + "epoch": 0.7860009632364745, + "grad_norm": 1.1006115674972534, + "learning_rate": 0.000187512691892985, + "loss": 0.8847, + "step": 2448 + }, + { + "epoch": 0.786322042061326, + "grad_norm": 1.1314774751663208, + "learning_rate": 0.0001874957954937138, + "loss": 0.724, + "step": 2449 + }, + { + "epoch": 0.7866431208861776, + "grad_norm": 0.5091024041175842, + "learning_rate": 0.0001874788884333453, + "loss": 0.4491, + "step": 2450 + }, + { + "epoch": 0.786964199711029, + "grad_norm": 0.8892128467559814, + "learning_rate": 0.00018746197071393958, + "loss": 1.5905, + "step": 2451 + }, + { + "epoch": 0.7872852785358806, + "grad_norm": 0.9708239436149597, + "learning_rate": 0.00018744504233755805, + "loss": 1.2197, + "step": 2452 + }, + { + "epoch": 0.787606357360732, + "grad_norm": 1.2645493745803833, + "learning_rate": 0.00018742810330626337, + "loss": 0.8199, + "step": 2453 + }, + { + "epoch": 0.7879274361855836, + "grad_norm": 0.9297351837158203, + "learning_rate": 0.00018741115362211949, + "loss": 0.6392, + "step": 2454 + }, + { + "epoch": 0.788248515010435, + "grad_norm": 1.3193695545196533, + "learning_rate": 0.0001873941932871917, + "loss": 0.7132, + "step": 2455 + }, + { + "epoch": 0.7885695938352866, + "grad_norm": 1.7561746835708618, + "learning_rate": 0.00018737722230354655, + "loss": 0.607, + "step": 2456 + }, + { + "epoch": 0.788890672660138, + "grad_norm": 1.2228854894638062, + "learning_rate": 0.00018736024067325188, + "loss": 1.0434, + "step": 2457 + }, + { + "epoch": 0.7892117514849896, + "grad_norm": 1.381565809249878, + "learning_rate": 0.0001873432483983769, + "loss": 1.0155, + "step": 2458 + }, + { + "epoch": 0.7895328303098411, + "grad_norm": 1.2549446821212769, + "learning_rate": 0.00018732624548099204, + "loss": 0.6954, + "step": 2459 + }, + { + "epoch": 0.7898539091346926, + "grad_norm": 1.154805064201355, + "learning_rate": 0.00018730923192316902, + "loss": 0.6555, + "step": 2460 + }, + { + "epoch": 0.7901749879595441, + "grad_norm": 0.7833784818649292, + "learning_rate": 0.00018729220772698097, + "loss": 0.8146, + "step": 2461 + }, + { + "epoch": 0.7904960667843955, + "grad_norm": 0.9661710858345032, + "learning_rate": 0.0001872751728945022, + "loss": 0.9448, + "step": 2462 + }, + { + "epoch": 0.7908171456092471, + "grad_norm": 0.9815911054611206, + "learning_rate": 0.00018725812742780834, + "loss": 0.9413, + "step": 2463 + }, + { + "epoch": 0.7911382244340985, + "grad_norm": 0.823572039604187, + "learning_rate": 0.0001872410713289763, + "loss": 0.8122, + "step": 2464 + }, + { + "epoch": 0.7914593032589501, + "grad_norm": 0.8957955837249756, + "learning_rate": 0.0001872240046000844, + "loss": 0.8169, + "step": 2465 + }, + { + "epoch": 0.7917803820838015, + "grad_norm": 1.1782604455947876, + "learning_rate": 0.00018720692724321207, + "loss": 0.8323, + "step": 2466 + }, + { + "epoch": 0.7921014609086531, + "grad_norm": 0.887837827205658, + "learning_rate": 0.0001871898392604402, + "loss": 0.8662, + "step": 2467 + }, + { + "epoch": 0.7924225397335046, + "grad_norm": 0.903867781162262, + "learning_rate": 0.0001871727406538509, + "loss": 1.0044, + "step": 2468 + }, + { + "epoch": 0.7927436185583561, + "grad_norm": 0.8444812297821045, + "learning_rate": 0.00018715563142552758, + "loss": 0.9237, + "step": 2469 + }, + { + "epoch": 0.7930646973832076, + "grad_norm": 1.5705596208572388, + "learning_rate": 0.00018713851157755492, + "loss": 0.9272, + "step": 2470 + }, + { + "epoch": 0.7933857762080591, + "grad_norm": 0.9843676090240479, + "learning_rate": 0.00018712138111201895, + "loss": 0.9544, + "step": 2471 + }, + { + "epoch": 0.7937068550329106, + "grad_norm": 1.2598280906677246, + "learning_rate": 0.00018710424003100698, + "loss": 0.8223, + "step": 2472 + }, + { + "epoch": 0.794027933857762, + "grad_norm": 0.903620183467865, + "learning_rate": 0.00018708708833660754, + "loss": 0.6692, + "step": 2473 + }, + { + "epoch": 0.7943490126826136, + "grad_norm": 1.244788408279419, + "learning_rate": 0.00018706992603091058, + "loss": 1.0025, + "step": 2474 + }, + { + "epoch": 0.794670091507465, + "grad_norm": 1.1981257200241089, + "learning_rate": 0.00018705275311600722, + "loss": 0.8593, + "step": 2475 + }, + { + "epoch": 0.7949911703323166, + "grad_norm": 0.8418868184089661, + "learning_rate": 0.00018703556959398998, + "loss": 0.7955, + "step": 2476 + }, + { + "epoch": 0.7953122491571681, + "grad_norm": 0.8239766955375671, + "learning_rate": 0.0001870183754669526, + "loss": 0.796, + "step": 2477 + }, + { + "epoch": 0.7956333279820196, + "grad_norm": 1.1803929805755615, + "learning_rate": 0.0001870011707369901, + "loss": 1.0684, + "step": 2478 + }, + { + "epoch": 0.7959544068068711, + "grad_norm": 1.2590785026550293, + "learning_rate": 0.0001869839554061988, + "loss": 0.8635, + "step": 2479 + }, + { + "epoch": 0.7962754856317226, + "grad_norm": 1.4843522310256958, + "learning_rate": 0.00018696672947667646, + "loss": 0.7823, + "step": 2480 + }, + { + "epoch": 0.7965965644565741, + "grad_norm": 0.9807465076446533, + "learning_rate": 0.0001869494929505219, + "loss": 0.8197, + "step": 2481 + }, + { + "epoch": 0.7969176432814256, + "grad_norm": 1.2640116214752197, + "learning_rate": 0.0001869322458298354, + "loss": 0.9056, + "step": 2482 + }, + { + "epoch": 0.7972387221062771, + "grad_norm": 1.0154175758361816, + "learning_rate": 0.0001869149881167184, + "loss": 0.8573, + "step": 2483 + }, + { + "epoch": 0.7975598009311285, + "grad_norm": 1.578016996383667, + "learning_rate": 0.00018689771981327376, + "loss": 0.9812, + "step": 2484 + }, + { + "epoch": 0.7978808797559801, + "grad_norm": 0.9746622443199158, + "learning_rate": 0.00018688044092160551, + "loss": 0.8187, + "step": 2485 + }, + { + "epoch": 0.7982019585808315, + "grad_norm": 1.5574668645858765, + "learning_rate": 0.00018686315144381913, + "loss": 0.7508, + "step": 2486 + }, + { + "epoch": 0.7985230374056831, + "grad_norm": 0.9885238409042358, + "learning_rate": 0.00018684585138202122, + "loss": 0.8462, + "step": 2487 + }, + { + "epoch": 0.7988441162305346, + "grad_norm": 1.220053791999817, + "learning_rate": 0.00018682854073831973, + "loss": 0.9497, + "step": 2488 + }, + { + "epoch": 0.7991651950553861, + "grad_norm": 0.8417456746101379, + "learning_rate": 0.00018681121951482393, + "loss": 0.6397, + "step": 2489 + }, + { + "epoch": 0.7994862738802376, + "grad_norm": 1.1695717573165894, + "learning_rate": 0.00018679388771364436, + "loss": 0.6754, + "step": 2490 + }, + { + "epoch": 0.7998073527050891, + "grad_norm": 1.0518317222595215, + "learning_rate": 0.00018677654533689287, + "loss": 0.8917, + "step": 2491 + }, + { + "epoch": 0.8001284315299406, + "grad_norm": 1.123487114906311, + "learning_rate": 0.0001867591923866825, + "loss": 0.977, + "step": 2492 + }, + { + "epoch": 0.8004495103547921, + "grad_norm": 1.3612045049667358, + "learning_rate": 0.00018674182886512774, + "loss": 0.8039, + "step": 2493 + }, + { + "epoch": 0.8007705891796436, + "grad_norm": 1.34379243850708, + "learning_rate": 0.00018672445477434425, + "loss": 0.8616, + "step": 2494 + }, + { + "epoch": 0.801091668004495, + "grad_norm": 0.8489518761634827, + "learning_rate": 0.000186707070116449, + "loss": 0.7731, + "step": 2495 + }, + { + "epoch": 0.8014127468293466, + "grad_norm": 1.2689493894577026, + "learning_rate": 0.00018668967489356028, + "loss": 0.7308, + "step": 2496 + }, + { + "epoch": 0.8017338256541982, + "grad_norm": 1.7729278802871704, + "learning_rate": 0.00018667226910779765, + "loss": 0.6356, + "step": 2497 + }, + { + "epoch": 0.8020549044790496, + "grad_norm": 0.9951494932174683, + "learning_rate": 0.00018665485276128188, + "loss": 0.9106, + "step": 2498 + }, + { + "epoch": 0.8023759833039011, + "grad_norm": 1.5119075775146484, + "learning_rate": 0.00018663742585613518, + "loss": 0.7492, + "step": 2499 + }, + { + "epoch": 0.8026970621287526, + "grad_norm": 1.1886110305786133, + "learning_rate": 0.00018661998839448094, + "loss": 0.5986, + "step": 2500 + }, + { + "epoch": 0.8030181409536041, + "grad_norm": 1.4970862865447998, + "learning_rate": 0.00018660254037844388, + "loss": 1.4061, + "step": 2501 + }, + { + "epoch": 0.8033392197784556, + "grad_norm": 1.2722395658493042, + "learning_rate": 0.00018658508181014995, + "loss": 1.2115, + "step": 2502 + }, + { + "epoch": 0.8036602986033071, + "grad_norm": 1.3451300859451294, + "learning_rate": 0.00018656761269172643, + "loss": 0.9685, + "step": 2503 + }, + { + "epoch": 0.8039813774281586, + "grad_norm": 1.2562284469604492, + "learning_rate": 0.0001865501330253019, + "loss": 0.7432, + "step": 2504 + }, + { + "epoch": 0.8043024562530101, + "grad_norm": 1.0537221431732178, + "learning_rate": 0.00018653264281300622, + "loss": 0.7564, + "step": 2505 + }, + { + "epoch": 0.8046235350778617, + "grad_norm": 1.2182271480560303, + "learning_rate": 0.00018651514205697046, + "loss": 0.7227, + "step": 2506 + }, + { + "epoch": 0.8049446139027131, + "grad_norm": 0.9667792320251465, + "learning_rate": 0.00018649763075932708, + "loss": 0.6023, + "step": 2507 + }, + { + "epoch": 0.8052656927275647, + "grad_norm": 1.413209319114685, + "learning_rate": 0.00018648010892220978, + "loss": 0.9355, + "step": 2508 + }, + { + "epoch": 0.8055867715524161, + "grad_norm": 1.3829725980758667, + "learning_rate": 0.0001864625765477535, + "loss": 0.7942, + "step": 2509 + }, + { + "epoch": 0.8059078503772676, + "grad_norm": 0.8415265083312988, + "learning_rate": 0.00018644503363809457, + "loss": 0.7862, + "step": 2510 + }, + { + "epoch": 0.8062289292021191, + "grad_norm": 1.514174222946167, + "learning_rate": 0.0001864274801953705, + "loss": 1.0991, + "step": 2511 + }, + { + "epoch": 0.8065500080269706, + "grad_norm": 1.3786250352859497, + "learning_rate": 0.0001864099162217201, + "loss": 0.8292, + "step": 2512 + }, + { + "epoch": 0.8068710868518221, + "grad_norm": 1.0749139785766602, + "learning_rate": 0.00018639234171928353, + "loss": 0.9804, + "step": 2513 + }, + { + "epoch": 0.8071921656766736, + "grad_norm": 0.8056579828262329, + "learning_rate": 0.0001863747566902022, + "loss": 0.8092, + "step": 2514 + }, + { + "epoch": 0.8075132445015252, + "grad_norm": 1.0548845529556274, + "learning_rate": 0.00018635716113661873, + "loss": 0.7606, + "step": 2515 + }, + { + "epoch": 0.8078343233263766, + "grad_norm": 0.9919301271438599, + "learning_rate": 0.00018633955506067718, + "loss": 1.0425, + "step": 2516 + }, + { + "epoch": 0.8081554021512282, + "grad_norm": 0.918125569820404, + "learning_rate": 0.0001863219384645227, + "loss": 0.8707, + "step": 2517 + }, + { + "epoch": 0.8084764809760796, + "grad_norm": 1.2536126375198364, + "learning_rate": 0.0001863043113503019, + "loss": 1.1534, + "step": 2518 + }, + { + "epoch": 0.8087975598009312, + "grad_norm": 1.1653470993041992, + "learning_rate": 0.0001862866737201625, + "loss": 0.8979, + "step": 2519 + }, + { + "epoch": 0.8091186386257826, + "grad_norm": 1.016139030456543, + "learning_rate": 0.00018626902557625368, + "loss": 0.7874, + "step": 2520 + }, + { + "epoch": 0.8094397174506341, + "grad_norm": 1.2816565036773682, + "learning_rate": 0.00018625136692072575, + "loss": 0.8338, + "step": 2521 + }, + { + "epoch": 0.8097607962754856, + "grad_norm": 1.5095349550247192, + "learning_rate": 0.0001862336977557304, + "loss": 1.0349, + "step": 2522 + }, + { + "epoch": 0.8100818751003371, + "grad_norm": 1.753609299659729, + "learning_rate": 0.00018621601808342056, + "loss": 0.932, + "step": 2523 + }, + { + "epoch": 0.8104029539251887, + "grad_norm": 1.1441911458969116, + "learning_rate": 0.00018619832790595043, + "loss": 1.0346, + "step": 2524 + }, + { + "epoch": 0.8107240327500401, + "grad_norm": 1.1393861770629883, + "learning_rate": 0.0001861806272254755, + "loss": 0.8634, + "step": 2525 + }, + { + "epoch": 0.8110451115748917, + "grad_norm": 0.9023036956787109, + "learning_rate": 0.00018616291604415258, + "loss": 0.7442, + "step": 2526 + }, + { + "epoch": 0.8113661903997431, + "grad_norm": 0.9922143220901489, + "learning_rate": 0.0001861451943641397, + "loss": 0.893, + "step": 2527 + }, + { + "epoch": 0.8116872692245947, + "grad_norm": 1.2461748123168945, + "learning_rate": 0.00018612746218759618, + "loss": 1.0521, + "step": 2528 + }, + { + "epoch": 0.8120083480494461, + "grad_norm": 1.3164596557617188, + "learning_rate": 0.00018610971951668265, + "loss": 0.9787, + "step": 2529 + }, + { + "epoch": 0.8123294268742977, + "grad_norm": 1.3357511758804321, + "learning_rate": 0.000186091966353561, + "loss": 0.8539, + "step": 2530 + }, + { + "epoch": 0.8126505056991491, + "grad_norm": 0.9411560297012329, + "learning_rate": 0.0001860742027003944, + "loss": 1.0298, + "step": 2531 + }, + { + "epoch": 0.8129715845240006, + "grad_norm": 0.8316774368286133, + "learning_rate": 0.00018605642855934725, + "loss": 0.8219, + "step": 2532 + }, + { + "epoch": 0.8132926633488522, + "grad_norm": 1.3453702926635742, + "learning_rate": 0.00018603864393258534, + "loss": 0.9055, + "step": 2533 + }, + { + "epoch": 0.8136137421737036, + "grad_norm": 0.9413175582885742, + "learning_rate": 0.00018602084882227566, + "loss": 0.9423, + "step": 2534 + }, + { + "epoch": 0.8139348209985552, + "grad_norm": 0.9945281744003296, + "learning_rate": 0.00018600304323058647, + "loss": 0.9676, + "step": 2535 + }, + { + "epoch": 0.8142558998234066, + "grad_norm": 1.2609598636627197, + "learning_rate": 0.00018598522715968736, + "loss": 0.877, + "step": 2536 + }, + { + "epoch": 0.8145769786482582, + "grad_norm": 1.2044451236724854, + "learning_rate": 0.0001859674006117491, + "loss": 1.0152, + "step": 2537 + }, + { + "epoch": 0.8148980574731096, + "grad_norm": 1.1789966821670532, + "learning_rate": 0.00018594956358894388, + "loss": 0.7836, + "step": 2538 + }, + { + "epoch": 0.8152191362979612, + "grad_norm": 2.6686882972717285, + "learning_rate": 0.00018593171609344503, + "loss": 0.7848, + "step": 2539 + }, + { + "epoch": 0.8155402151228126, + "grad_norm": 0.900510311126709, + "learning_rate": 0.00018591385812742725, + "loss": 0.8703, + "step": 2540 + }, + { + "epoch": 0.8158612939476642, + "grad_norm": 0.9134002923965454, + "learning_rate": 0.00018589598969306645, + "loss": 0.7962, + "step": 2541 + }, + { + "epoch": 0.8161823727725157, + "grad_norm": 1.1984633207321167, + "learning_rate": 0.00018587811079253985, + "loss": 0.984, + "step": 2542 + }, + { + "epoch": 0.8165034515973671, + "grad_norm": 1.6189078092575073, + "learning_rate": 0.00018586022142802597, + "loss": 1.013, + "step": 2543 + }, + { + "epoch": 0.8168245304222187, + "grad_norm": 1.4700372219085693, + "learning_rate": 0.00018584232160170452, + "loss": 1.1043, + "step": 2544 + }, + { + "epoch": 0.8171456092470701, + "grad_norm": 1.035675287246704, + "learning_rate": 0.0001858244113157566, + "loss": 0.8351, + "step": 2545 + }, + { + "epoch": 0.8174666880719217, + "grad_norm": 1.505395531654358, + "learning_rate": 0.00018580649057236447, + "loss": 0.8241, + "step": 2546 + }, + { + "epoch": 0.8177877668967731, + "grad_norm": 1.6375924348831177, + "learning_rate": 0.00018578855937371173, + "loss": 0.6124, + "step": 2547 + }, + { + "epoch": 0.8181088457216247, + "grad_norm": 0.5977188944816589, + "learning_rate": 0.0001857706177219833, + "loss": 0.5429, + "step": 2548 + }, + { + "epoch": 0.8184299245464761, + "grad_norm": 0.6796979308128357, + "learning_rate": 0.00018575266561936523, + "loss": 0.5959, + "step": 2549 + }, + { + "epoch": 0.8187510033713277, + "grad_norm": 0.6622598171234131, + "learning_rate": 0.00018573470306804498, + "loss": 0.5182, + "step": 2550 + }, + { + "epoch": 0.8190720821961791, + "grad_norm": 0.9923622012138367, + "learning_rate": 0.00018571673007021123, + "loss": 1.2836, + "step": 2551 + }, + { + "epoch": 0.8193931610210307, + "grad_norm": 1.1485919952392578, + "learning_rate": 0.00018569874662805393, + "loss": 1.3207, + "step": 2552 + }, + { + "epoch": 0.8197142398458822, + "grad_norm": 0.9416031837463379, + "learning_rate": 0.0001856807527437643, + "loss": 0.9675, + "step": 2553 + }, + { + "epoch": 0.8200353186707336, + "grad_norm": 1.0910120010375977, + "learning_rate": 0.00018566274841953483, + "loss": 0.8907, + "step": 2554 + }, + { + "epoch": 0.8203563974955852, + "grad_norm": 1.061464786529541, + "learning_rate": 0.00018564473365755935, + "loss": 0.6902, + "step": 2555 + }, + { + "epoch": 0.8206774763204366, + "grad_norm": 1.0407187938690186, + "learning_rate": 0.00018562670846003284, + "loss": 0.6008, + "step": 2556 + }, + { + "epoch": 0.8209985551452882, + "grad_norm": 1.0993876457214355, + "learning_rate": 0.0001856086728291516, + "loss": 0.7588, + "step": 2557 + }, + { + "epoch": 0.8213196339701396, + "grad_norm": 1.2485312223434448, + "learning_rate": 0.00018559062676711332, + "loss": 0.9553, + "step": 2558 + }, + { + "epoch": 0.8216407127949912, + "grad_norm": 1.184083342552185, + "learning_rate": 0.00018557257027611675, + "loss": 1.0036, + "step": 2559 + }, + { + "epoch": 0.8219617916198426, + "grad_norm": 1.219896674156189, + "learning_rate": 0.00018555450335836206, + "loss": 0.9131, + "step": 2560 + }, + { + "epoch": 0.8222828704446942, + "grad_norm": 0.8921108841896057, + "learning_rate": 0.00018553642601605068, + "loss": 0.7521, + "step": 2561 + }, + { + "epoch": 0.8226039492695457, + "grad_norm": 1.3684065341949463, + "learning_rate": 0.0001855183382513852, + "loss": 0.8931, + "step": 2562 + }, + { + "epoch": 0.8229250280943972, + "grad_norm": 0.8778723478317261, + "learning_rate": 0.00018550024006656966, + "loss": 0.8721, + "step": 2563 + }, + { + "epoch": 0.8232461069192487, + "grad_norm": 1.4915300607681274, + "learning_rate": 0.00018548213146380918, + "loss": 0.9267, + "step": 2564 + }, + { + "epoch": 0.8235671857441001, + "grad_norm": 1.0653166770935059, + "learning_rate": 0.0001854640124453103, + "loss": 0.9316, + "step": 2565 + }, + { + "epoch": 0.8238882645689517, + "grad_norm": 1.8792741298675537, + "learning_rate": 0.00018544588301328075, + "loss": 0.9906, + "step": 2566 + }, + { + "epoch": 0.8242093433938031, + "grad_norm": 1.2481502294540405, + "learning_rate": 0.0001854277431699295, + "loss": 0.9631, + "step": 2567 + }, + { + "epoch": 0.8245304222186547, + "grad_norm": 1.4939336776733398, + "learning_rate": 0.00018540959291746693, + "loss": 0.9405, + "step": 2568 + }, + { + "epoch": 0.8248515010435061, + "grad_norm": 1.0083293914794922, + "learning_rate": 0.0001853914322581045, + "loss": 0.9924, + "step": 2569 + }, + { + "epoch": 0.8251725798683577, + "grad_norm": 1.3089494705200195, + "learning_rate": 0.00018537326119405506, + "loss": 0.899, + "step": 2570 + }, + { + "epoch": 0.8254936586932092, + "grad_norm": 1.0954056978225708, + "learning_rate": 0.00018535507972753274, + "loss": 0.977, + "step": 2571 + }, + { + "epoch": 0.8258147375180607, + "grad_norm": 0.9227792620658875, + "learning_rate": 0.00018533688786075288, + "loss": 0.7727, + "step": 2572 + }, + { + "epoch": 0.8261358163429122, + "grad_norm": 1.0734128952026367, + "learning_rate": 0.00018531868559593204, + "loss": 1.0625, + "step": 2573 + }, + { + "epoch": 0.8264568951677637, + "grad_norm": 0.9797103404998779, + "learning_rate": 0.00018530047293528819, + "loss": 1.0345, + "step": 2574 + }, + { + "epoch": 0.8267779739926152, + "grad_norm": 1.2002710103988647, + "learning_rate": 0.00018528224988104044, + "loss": 0.9635, + "step": 2575 + }, + { + "epoch": 0.8270990528174667, + "grad_norm": 1.1027469635009766, + "learning_rate": 0.00018526401643540922, + "loss": 0.8992, + "step": 2576 + }, + { + "epoch": 0.8274201316423182, + "grad_norm": 1.8506336212158203, + "learning_rate": 0.00018524577260061627, + "loss": 0.9184, + "step": 2577 + }, + { + "epoch": 0.8277412104671696, + "grad_norm": 1.1233233213424683, + "learning_rate": 0.0001852275183788845, + "loss": 0.9805, + "step": 2578 + }, + { + "epoch": 0.8280622892920212, + "grad_norm": 1.0749657154083252, + "learning_rate": 0.0001852092537724381, + "loss": 0.999, + "step": 2579 + }, + { + "epoch": 0.8283833681168727, + "grad_norm": 1.4672601222991943, + "learning_rate": 0.00018519097878350263, + "loss": 1.0324, + "step": 2580 + }, + { + "epoch": 0.8287044469417242, + "grad_norm": 1.0896575450897217, + "learning_rate": 0.00018517269341430476, + "loss": 0.9614, + "step": 2581 + }, + { + "epoch": 0.8290255257665757, + "grad_norm": 1.345746397972107, + "learning_rate": 0.00018515439766707262, + "loss": 0.9667, + "step": 2582 + }, + { + "epoch": 0.8293466045914272, + "grad_norm": 0.8869307041168213, + "learning_rate": 0.00018513609154403534, + "loss": 0.8017, + "step": 2583 + }, + { + "epoch": 0.8296676834162787, + "grad_norm": 1.2558448314666748, + "learning_rate": 0.00018511777504742362, + "loss": 0.9269, + "step": 2584 + }, + { + "epoch": 0.8299887622411302, + "grad_norm": 1.5162900686264038, + "learning_rate": 0.00018509944817946922, + "loss": 0.9928, + "step": 2585 + }, + { + "epoch": 0.8303098410659817, + "grad_norm": 1.0953887701034546, + "learning_rate": 0.00018508111094240514, + "loss": 0.7442, + "step": 2586 + }, + { + "epoch": 0.8306309198908332, + "grad_norm": 0.9997647404670715, + "learning_rate": 0.00018506276333846579, + "loss": 0.8279, + "step": 2587 + }, + { + "epoch": 0.8309519987156847, + "grad_norm": 0.8126912117004395, + "learning_rate": 0.00018504440536988673, + "loss": 0.6527, + "step": 2588 + }, + { + "epoch": 0.8312730775405363, + "grad_norm": 0.7459484338760376, + "learning_rate": 0.00018502603703890488, + "loss": 0.7202, + "step": 2589 + }, + { + "epoch": 0.8315941563653877, + "grad_norm": 1.0116833448410034, + "learning_rate": 0.00018500765834775828, + "loss": 0.8612, + "step": 2590 + }, + { + "epoch": 0.8319152351902392, + "grad_norm": 1.7899558544158936, + "learning_rate": 0.00018498926929868642, + "loss": 1.0347, + "step": 2591 + }, + { + "epoch": 0.8322363140150907, + "grad_norm": 1.1006872653961182, + "learning_rate": 0.00018497086989392988, + "loss": 0.9109, + "step": 2592 + }, + { + "epoch": 0.8325573928399422, + "grad_norm": 2.0811855792999268, + "learning_rate": 0.00018495246013573054, + "loss": 0.9743, + "step": 2593 + }, + { + "epoch": 0.8328784716647937, + "grad_norm": 1.6364414691925049, + "learning_rate": 0.00018493404002633166, + "loss": 1.1582, + "step": 2594 + }, + { + "epoch": 0.8331995504896452, + "grad_norm": 1.5922627449035645, + "learning_rate": 0.00018491560956797765, + "loss": 0.7103, + "step": 2595 + }, + { + "epoch": 0.8335206293144967, + "grad_norm": 1.3164536952972412, + "learning_rate": 0.00018489716876291415, + "loss": 1.0022, + "step": 2596 + }, + { + "epoch": 0.8338417081393482, + "grad_norm": 0.8407567739486694, + "learning_rate": 0.0001848787176133882, + "loss": 0.7196, + "step": 2597 + }, + { + "epoch": 0.8341627869641998, + "grad_norm": 1.0273271799087524, + "learning_rate": 0.00018486025612164794, + "loss": 0.7724, + "step": 2598 + }, + { + "epoch": 0.8344838657890512, + "grad_norm": 0.7496247291564941, + "learning_rate": 0.0001848417842899429, + "loss": 0.5834, + "step": 2599 + }, + { + "epoch": 0.8348049446139028, + "grad_norm": 0.6890442371368408, + "learning_rate": 0.00018482330212052378, + "loss": 0.6603, + "step": 2600 + }, + { + "epoch": 0.8351260234387542, + "grad_norm": 1.2235509157180786, + "learning_rate": 0.0001848048096156426, + "loss": 1.3789, + "step": 2601 + }, + { + "epoch": 0.8354471022636057, + "grad_norm": 0.9681376218795776, + "learning_rate": 0.00018478630677755262, + "loss": 0.9582, + "step": 2602 + }, + { + "epoch": 0.8357681810884572, + "grad_norm": 1.0608433485031128, + "learning_rate": 0.00018476779360850832, + "loss": 0.7719, + "step": 2603 + }, + { + "epoch": 0.8360892599133087, + "grad_norm": 1.0130113363265991, + "learning_rate": 0.00018474927011076552, + "loss": 0.822, + "step": 2604 + }, + { + "epoch": 0.8364103387381602, + "grad_norm": 1.0975375175476074, + "learning_rate": 0.0001847307362865812, + "loss": 0.7262, + "step": 2605 + }, + { + "epoch": 0.8367314175630117, + "grad_norm": 0.8877381086349487, + "learning_rate": 0.00018471219213821375, + "loss": 0.5131, + "step": 2606 + }, + { + "epoch": 0.8370524963878632, + "grad_norm": 1.0735324621200562, + "learning_rate": 0.00018469363766792255, + "loss": 0.6324, + "step": 2607 + }, + { + "epoch": 0.8373735752127147, + "grad_norm": 0.9613997936248779, + "learning_rate": 0.00018467507287796856, + "loss": 0.8337, + "step": 2608 + }, + { + "epoch": 0.8376946540375663, + "grad_norm": 1.0283273458480835, + "learning_rate": 0.0001846564977706138, + "loss": 0.8794, + "step": 2609 + }, + { + "epoch": 0.8380157328624177, + "grad_norm": 1.1222115755081177, + "learning_rate": 0.00018463791234812153, + "loss": 0.9956, + "step": 2610 + }, + { + "epoch": 0.8383368116872693, + "grad_norm": 0.8465763330459595, + "learning_rate": 0.00018461931661275643, + "loss": 0.8214, + "step": 2611 + }, + { + "epoch": 0.8386578905121207, + "grad_norm": 1.5119593143463135, + "learning_rate": 0.00018460071056678422, + "loss": 1.0962, + "step": 2612 + }, + { + "epoch": 0.8389789693369722, + "grad_norm": 0.7871680855751038, + "learning_rate": 0.00018458209421247208, + "loss": 0.6603, + "step": 2613 + }, + { + "epoch": 0.8393000481618237, + "grad_norm": 0.7733088731765747, + "learning_rate": 0.00018456346755208833, + "loss": 0.8264, + "step": 2614 + }, + { + "epoch": 0.8396211269866752, + "grad_norm": 0.8866434693336487, + "learning_rate": 0.00018454483058790255, + "loss": 0.7796, + "step": 2615 + }, + { + "epoch": 0.8399422058115267, + "grad_norm": 0.9938840270042419, + "learning_rate": 0.00018452618332218563, + "loss": 1.0233, + "step": 2616 + }, + { + "epoch": 0.8402632846363782, + "grad_norm": 1.1391087770462036, + "learning_rate": 0.00018450752575720967, + "loss": 0.8702, + "step": 2617 + }, + { + "epoch": 0.8405843634612298, + "grad_norm": 1.0326181650161743, + "learning_rate": 0.00018448885789524802, + "loss": 1.1119, + "step": 2618 + }, + { + "epoch": 0.8409054422860812, + "grad_norm": 1.0216528177261353, + "learning_rate": 0.00018447017973857532, + "loss": 0.9276, + "step": 2619 + }, + { + "epoch": 0.8412265211109328, + "grad_norm": 1.6243035793304443, + "learning_rate": 0.00018445149128946744, + "loss": 0.7381, + "step": 2620 + }, + { + "epoch": 0.8415475999357842, + "grad_norm": 1.0764888525009155, + "learning_rate": 0.00018443279255020152, + "loss": 1.0441, + "step": 2621 + }, + { + "epoch": 0.8418686787606358, + "grad_norm": 0.9923635721206665, + "learning_rate": 0.00018441408352305594, + "loss": 0.9418, + "step": 2622 + }, + { + "epoch": 0.8421897575854872, + "grad_norm": 1.145235538482666, + "learning_rate": 0.00018439536421031033, + "loss": 0.8303, + "step": 2623 + }, + { + "epoch": 0.8425108364103387, + "grad_norm": 1.0477502346038818, + "learning_rate": 0.0001843766346142456, + "loss": 0.9138, + "step": 2624 + }, + { + "epoch": 0.8428319152351902, + "grad_norm": 0.9633687734603882, + "learning_rate": 0.0001843578947371439, + "loss": 0.7834, + "step": 2625 + }, + { + "epoch": 0.8431529940600417, + "grad_norm": 0.8642808198928833, + "learning_rate": 0.0001843391445812886, + "loss": 0.9069, + "step": 2626 + }, + { + "epoch": 0.8434740728848933, + "grad_norm": 1.472249984741211, + "learning_rate": 0.00018432038414896434, + "loss": 0.8867, + "step": 2627 + }, + { + "epoch": 0.8437951517097447, + "grad_norm": 1.1954059600830078, + "learning_rate": 0.00018430161344245707, + "loss": 0.9881, + "step": 2628 + }, + { + "epoch": 0.8441162305345963, + "grad_norm": 0.8830155730247498, + "learning_rate": 0.0001842828324640539, + "loss": 0.6729, + "step": 2629 + }, + { + "epoch": 0.8444373093594477, + "grad_norm": 1.3963950872421265, + "learning_rate": 0.00018426404121604323, + "loss": 1.0219, + "step": 2630 + }, + { + "epoch": 0.8447583881842993, + "grad_norm": 1.185807704925537, + "learning_rate": 0.00018424523970071477, + "loss": 0.9033, + "step": 2631 + }, + { + "epoch": 0.8450794670091507, + "grad_norm": 0.9597305059432983, + "learning_rate": 0.0001842264279203594, + "loss": 1.0832, + "step": 2632 + }, + { + "epoch": 0.8454005458340023, + "grad_norm": 0.8251401782035828, + "learning_rate": 0.00018420760587726923, + "loss": 0.7083, + "step": 2633 + }, + { + "epoch": 0.8457216246588537, + "grad_norm": 1.246910572052002, + "learning_rate": 0.00018418877357373776, + "loss": 1.1274, + "step": 2634 + }, + { + "epoch": 0.8460427034837052, + "grad_norm": 1.0847307443618774, + "learning_rate": 0.00018416993101205958, + "loss": 0.8948, + "step": 2635 + }, + { + "epoch": 0.8463637823085568, + "grad_norm": 1.0887922048568726, + "learning_rate": 0.00018415107819453062, + "loss": 1.111, + "step": 2636 + }, + { + "epoch": 0.8466848611334082, + "grad_norm": 0.9815394878387451, + "learning_rate": 0.00018413221512344805, + "loss": 0.8554, + "step": 2637 + }, + { + "epoch": 0.8470059399582598, + "grad_norm": 1.085922122001648, + "learning_rate": 0.00018411334180111027, + "loss": 0.7271, + "step": 2638 + }, + { + "epoch": 0.8473270187831112, + "grad_norm": 1.687097430229187, + "learning_rate": 0.00018409445822981693, + "loss": 0.9256, + "step": 2639 + }, + { + "epoch": 0.8476480976079628, + "grad_norm": 1.0354297161102295, + "learning_rate": 0.00018407556441186893, + "loss": 0.7517, + "step": 2640 + }, + { + "epoch": 0.8479691764328142, + "grad_norm": 1.799309492111206, + "learning_rate": 0.00018405666034956844, + "loss": 0.888, + "step": 2641 + }, + { + "epoch": 0.8482902552576658, + "grad_norm": 1.172786831855774, + "learning_rate": 0.00018403774604521886, + "loss": 0.9754, + "step": 2642 + }, + { + "epoch": 0.8486113340825172, + "grad_norm": 1.1656599044799805, + "learning_rate": 0.00018401882150112484, + "loss": 0.8011, + "step": 2643 + }, + { + "epoch": 0.8489324129073688, + "grad_norm": 1.0657223463058472, + "learning_rate": 0.00018399988671959227, + "loss": 0.8496, + "step": 2644 + }, + { + "epoch": 0.8492534917322203, + "grad_norm": 1.51372230052948, + "learning_rate": 0.0001839809417029283, + "loss": 0.9055, + "step": 2645 + }, + { + "epoch": 0.8495745705570718, + "grad_norm": 1.423654556274414, + "learning_rate": 0.00018396198645344135, + "loss": 0.8467, + "step": 2646 + }, + { + "epoch": 0.8498956493819233, + "grad_norm": 2.3389534950256348, + "learning_rate": 0.000183943020973441, + "loss": 0.8266, + "step": 2647 + }, + { + "epoch": 0.8502167282067747, + "grad_norm": 0.9923876523971558, + "learning_rate": 0.00018392404526523817, + "loss": 0.7428, + "step": 2648 + }, + { + "epoch": 0.8505378070316263, + "grad_norm": 0.8069606423377991, + "learning_rate": 0.000183905059331145, + "loss": 0.5436, + "step": 2649 + }, + { + "epoch": 0.8508588858564777, + "grad_norm": 0.787869930267334, + "learning_rate": 0.0001838860631734749, + "loss": 0.4813, + "step": 2650 + }, + { + "epoch": 0.8511799646813293, + "grad_norm": 1.2456567287445068, + "learning_rate": 0.00018386705679454242, + "loss": 1.2521, + "step": 2651 + }, + { + "epoch": 0.8515010435061807, + "grad_norm": 1.0350762605667114, + "learning_rate": 0.00018384804019666345, + "loss": 1.4132, + "step": 2652 + }, + { + "epoch": 0.8518221223310323, + "grad_norm": 1.5614581108093262, + "learning_rate": 0.00018382901338215516, + "loss": 0.9286, + "step": 2653 + }, + { + "epoch": 0.8521432011558838, + "grad_norm": 1.5894172191619873, + "learning_rate": 0.00018380997635333585, + "loss": 0.7567, + "step": 2654 + }, + { + "epoch": 0.8524642799807353, + "grad_norm": 1.0441161394119263, + "learning_rate": 0.00018379092911252514, + "loss": 0.5728, + "step": 2655 + }, + { + "epoch": 0.8527853588055868, + "grad_norm": 1.1320891380310059, + "learning_rate": 0.0001837718716620439, + "loss": 0.5784, + "step": 2656 + }, + { + "epoch": 0.8531064376304383, + "grad_norm": 1.0418906211853027, + "learning_rate": 0.0001837528040042142, + "loss": 0.5328, + "step": 2657 + }, + { + "epoch": 0.8534275164552898, + "grad_norm": 1.1594635248184204, + "learning_rate": 0.00018373372614135936, + "loss": 0.7899, + "step": 2658 + }, + { + "epoch": 0.8537485952801412, + "grad_norm": 1.4470210075378418, + "learning_rate": 0.000183714638075804, + "loss": 1.0683, + "step": 2659 + }, + { + "epoch": 0.8540696741049928, + "grad_norm": 0.8838327527046204, + "learning_rate": 0.0001836955398098739, + "loss": 0.8985, + "step": 2660 + }, + { + "epoch": 0.8543907529298442, + "grad_norm": 0.9166834354400635, + "learning_rate": 0.00018367643134589617, + "loss": 0.9936, + "step": 2661 + }, + { + "epoch": 0.8547118317546958, + "grad_norm": 0.998890221118927, + "learning_rate": 0.0001836573126861991, + "loss": 0.9257, + "step": 2662 + }, + { + "epoch": 0.8550329105795473, + "grad_norm": 1.1718299388885498, + "learning_rate": 0.00018363818383311225, + "loss": 1.0218, + "step": 2663 + }, + { + "epoch": 0.8553539894043988, + "grad_norm": 1.2179023027420044, + "learning_rate": 0.0001836190447889664, + "loss": 1.0098, + "step": 2664 + }, + { + "epoch": 0.8556750682292503, + "grad_norm": 1.2525956630706787, + "learning_rate": 0.00018359989555609353, + "loss": 0.8362, + "step": 2665 + }, + { + "epoch": 0.8559961470541018, + "grad_norm": 1.1447235345840454, + "learning_rate": 0.00018358073613682706, + "loss": 0.825, + "step": 2666 + }, + { + "epoch": 0.8563172258789533, + "grad_norm": 1.0088034868240356, + "learning_rate": 0.00018356156653350137, + "loss": 0.8964, + "step": 2667 + }, + { + "epoch": 0.8566383047038048, + "grad_norm": 1.095577597618103, + "learning_rate": 0.00018354238674845225, + "loss": 0.8193, + "step": 2668 + }, + { + "epoch": 0.8569593835286563, + "grad_norm": 1.3043547868728638, + "learning_rate": 0.00018352319678401676, + "loss": 1.0768, + "step": 2669 + }, + { + "epoch": 0.8572804623535077, + "grad_norm": 1.7319622039794922, + "learning_rate": 0.00018350399664253305, + "loss": 0.9552, + "step": 2670 + }, + { + "epoch": 0.8576015411783593, + "grad_norm": 1.5099067687988281, + "learning_rate": 0.00018348478632634066, + "loss": 1.0515, + "step": 2671 + }, + { + "epoch": 0.8579226200032107, + "grad_norm": 1.069471001625061, + "learning_rate": 0.0001834655658377803, + "loss": 0.8308, + "step": 2672 + }, + { + "epoch": 0.8582436988280623, + "grad_norm": 0.9602334499359131, + "learning_rate": 0.00018344633517919392, + "loss": 0.8164, + "step": 2673 + }, + { + "epoch": 0.8585647776529138, + "grad_norm": 1.416114091873169, + "learning_rate": 0.00018342709435292473, + "loss": 0.9465, + "step": 2674 + }, + { + "epoch": 0.8588858564777653, + "grad_norm": 1.6077988147735596, + "learning_rate": 0.00018340784336131713, + "loss": 0.907, + "step": 2675 + }, + { + "epoch": 0.8592069353026168, + "grad_norm": 1.6769777536392212, + "learning_rate": 0.00018338858220671682, + "loss": 0.8495, + "step": 2676 + }, + { + "epoch": 0.8595280141274683, + "grad_norm": 0.7927365899085999, + "learning_rate": 0.00018336931089147073, + "loss": 0.7901, + "step": 2677 + }, + { + "epoch": 0.8598490929523198, + "grad_norm": 0.780259907245636, + "learning_rate": 0.00018335002941792698, + "loss": 0.714, + "step": 2678 + }, + { + "epoch": 0.8601701717771713, + "grad_norm": 1.1689207553863525, + "learning_rate": 0.000183330737788435, + "loss": 0.9876, + "step": 2679 + }, + { + "epoch": 0.8604912506020228, + "grad_norm": 1.2812306880950928, + "learning_rate": 0.00018331143600534535, + "loss": 1.1554, + "step": 2680 + }, + { + "epoch": 0.8608123294268742, + "grad_norm": 1.0278218984603882, + "learning_rate": 0.00018329212407100994, + "loss": 0.75, + "step": 2681 + }, + { + "epoch": 0.8611334082517258, + "grad_norm": 1.0668963193893433, + "learning_rate": 0.0001832728019877819, + "loss": 0.8591, + "step": 2682 + }, + { + "epoch": 0.8614544870765773, + "grad_norm": 1.1060330867767334, + "learning_rate": 0.0001832534697580155, + "loss": 1.0402, + "step": 2683 + }, + { + "epoch": 0.8617755659014288, + "grad_norm": 1.13728928565979, + "learning_rate": 0.00018323412738406635, + "loss": 0.8727, + "step": 2684 + }, + { + "epoch": 0.8620966447262803, + "grad_norm": 1.4041433334350586, + "learning_rate": 0.00018321477486829126, + "loss": 0.949, + "step": 2685 + }, + { + "epoch": 0.8624177235511318, + "grad_norm": 1.0087988376617432, + "learning_rate": 0.00018319541221304827, + "loss": 0.8199, + "step": 2686 + }, + { + "epoch": 0.8627388023759833, + "grad_norm": 0.8733333945274353, + "learning_rate": 0.00018317603942069664, + "loss": 0.8297, + "step": 2687 + }, + { + "epoch": 0.8630598812008348, + "grad_norm": 1.4244256019592285, + "learning_rate": 0.00018315665649359692, + "loss": 0.9084, + "step": 2688 + }, + { + "epoch": 0.8633809600256863, + "grad_norm": 0.8898962736129761, + "learning_rate": 0.00018313726343411086, + "loss": 0.7508, + "step": 2689 + }, + { + "epoch": 0.8637020388505378, + "grad_norm": 1.4125561714172363, + "learning_rate": 0.0001831178602446014, + "loss": 0.8825, + "step": 2690 + }, + { + "epoch": 0.8640231176753893, + "grad_norm": 0.8810192346572876, + "learning_rate": 0.00018309844692743283, + "loss": 0.8294, + "step": 2691 + }, + { + "epoch": 0.8643441965002409, + "grad_norm": 2.355106830596924, + "learning_rate": 0.00018307902348497056, + "loss": 0.6482, + "step": 2692 + }, + { + "epoch": 0.8646652753250923, + "grad_norm": 0.9504086375236511, + "learning_rate": 0.00018305958991958127, + "loss": 0.7463, + "step": 2693 + }, + { + "epoch": 0.8649863541499438, + "grad_norm": 1.0893597602844238, + "learning_rate": 0.0001830401462336329, + "loss": 0.8072, + "step": 2694 + }, + { + "epoch": 0.8653074329747953, + "grad_norm": 0.7756816148757935, + "learning_rate": 0.0001830206924294946, + "loss": 0.6733, + "step": 2695 + }, + { + "epoch": 0.8656285117996468, + "grad_norm": 0.8419047594070435, + "learning_rate": 0.00018300122850953675, + "loss": 0.6931, + "step": 2696 + }, + { + "epoch": 0.8659495906244983, + "grad_norm": 0.8006436228752136, + "learning_rate": 0.00018298175447613096, + "loss": 0.6323, + "step": 2697 + }, + { + "epoch": 0.8662706694493498, + "grad_norm": 0.9535905122756958, + "learning_rate": 0.00018296227033165013, + "loss": 0.6195, + "step": 2698 + }, + { + "epoch": 0.8665917482742013, + "grad_norm": 0.7773939371109009, + "learning_rate": 0.00018294277607846832, + "loss": 0.5081, + "step": 2699 + }, + { + "epoch": 0.8669128270990528, + "grad_norm": 2.4419236183166504, + "learning_rate": 0.0001829232717189608, + "loss": 0.7581, + "step": 2700 + }, + { + "epoch": 0.8672339059239044, + "grad_norm": 0.919539213180542, + "learning_rate": 0.00018290375725550417, + "loss": 1.1346, + "step": 2701 + }, + { + "epoch": 0.8675549847487558, + "grad_norm": 0.9994780421257019, + "learning_rate": 0.0001828842326904762, + "loss": 0.9724, + "step": 2702 + }, + { + "epoch": 0.8678760635736074, + "grad_norm": 1.1944332122802734, + "learning_rate": 0.00018286469802625589, + "loss": 0.8469, + "step": 2703 + }, + { + "epoch": 0.8681971423984588, + "grad_norm": 1.1087571382522583, + "learning_rate": 0.00018284515326522346, + "loss": 0.6316, + "step": 2704 + }, + { + "epoch": 0.8685182212233103, + "grad_norm": 1.079188346862793, + "learning_rate": 0.00018282559840976042, + "loss": 0.5803, + "step": 2705 + }, + { + "epoch": 0.8688393000481618, + "grad_norm": 0.9261389374732971, + "learning_rate": 0.00018280603346224945, + "loss": 0.5796, + "step": 2706 + }, + { + "epoch": 0.8691603788730133, + "grad_norm": 1.1797443628311157, + "learning_rate": 0.00018278645842507448, + "loss": 0.7453, + "step": 2707 + }, + { + "epoch": 0.8694814576978648, + "grad_norm": 1.0713974237442017, + "learning_rate": 0.00018276687330062065, + "loss": 0.994, + "step": 2708 + }, + { + "epoch": 0.8698025365227163, + "grad_norm": 1.1797808408737183, + "learning_rate": 0.00018274727809127438, + "loss": 0.7688, + "step": 2709 + }, + { + "epoch": 0.8701236153475679, + "grad_norm": 0.9432153105735779, + "learning_rate": 0.00018272767279942328, + "loss": 1.0629, + "step": 2710 + }, + { + "epoch": 0.8704446941724193, + "grad_norm": 0.9762241840362549, + "learning_rate": 0.00018270805742745617, + "loss": 1.0634, + "step": 2711 + }, + { + "epoch": 0.8707657729972709, + "grad_norm": 0.8841230273246765, + "learning_rate": 0.00018268843197776318, + "loss": 0.9472, + "step": 2712 + }, + { + "epoch": 0.8710868518221223, + "grad_norm": 1.085910439491272, + "learning_rate": 0.00018266879645273556, + "loss": 0.9591, + "step": 2713 + }, + { + "epoch": 0.8714079306469739, + "grad_norm": 0.8603227138519287, + "learning_rate": 0.00018264915085476583, + "loss": 0.8901, + "step": 2714 + }, + { + "epoch": 0.8717290094718253, + "grad_norm": 1.1412354707717896, + "learning_rate": 0.0001826294951862478, + "loss": 0.8354, + "step": 2715 + }, + { + "epoch": 0.8720500882966769, + "grad_norm": 0.9545773267745972, + "learning_rate": 0.00018260982944957638, + "loss": 1.0581, + "step": 2716 + }, + { + "epoch": 0.8723711671215283, + "grad_norm": 1.4084380865097046, + "learning_rate": 0.00018259015364714787, + "loss": 0.9433, + "step": 2717 + }, + { + "epoch": 0.8726922459463798, + "grad_norm": 1.183406949043274, + "learning_rate": 0.00018257046778135964, + "loss": 0.9878, + "step": 2718 + }, + { + "epoch": 0.8730133247712314, + "grad_norm": 0.791094958782196, + "learning_rate": 0.00018255077185461038, + "loss": 0.8771, + "step": 2719 + }, + { + "epoch": 0.8733344035960828, + "grad_norm": 1.0063689947128296, + "learning_rate": 0.00018253106586929997, + "loss": 0.8799, + "step": 2720 + }, + { + "epoch": 0.8736554824209344, + "grad_norm": 1.1546752452850342, + "learning_rate": 0.00018251134982782952, + "loss": 1.0092, + "step": 2721 + }, + { + "epoch": 0.8739765612457858, + "grad_norm": 1.525952696800232, + "learning_rate": 0.00018249162373260141, + "loss": 0.9339, + "step": 2722 + }, + { + "epoch": 0.8742976400706374, + "grad_norm": 1.4072719812393188, + "learning_rate": 0.0001824718875860191, + "loss": 0.8635, + "step": 2723 + }, + { + "epoch": 0.8746187188954888, + "grad_norm": 1.070801854133606, + "learning_rate": 0.00018245214139048753, + "loss": 1.107, + "step": 2724 + }, + { + "epoch": 0.8749397977203404, + "grad_norm": 0.9687029123306274, + "learning_rate": 0.0001824323851484126, + "loss": 0.6512, + "step": 2725 + }, + { + "epoch": 0.8752608765451918, + "grad_norm": 1.0900896787643433, + "learning_rate": 0.00018241261886220154, + "loss": 1.0686, + "step": 2726 + }, + { + "epoch": 0.8755819553700434, + "grad_norm": 1.1022065877914429, + "learning_rate": 0.00018239284253426295, + "loss": 0.9941, + "step": 2727 + }, + { + "epoch": 0.8759030341948948, + "grad_norm": 0.7542319297790527, + "learning_rate": 0.00018237305616700637, + "loss": 0.783, + "step": 2728 + }, + { + "epoch": 0.8762241130197463, + "grad_norm": 1.090649127960205, + "learning_rate": 0.00018235325976284275, + "loss": 0.8426, + "step": 2729 + }, + { + "epoch": 0.8765451918445979, + "grad_norm": 0.8522968888282776, + "learning_rate": 0.00018233345332418423, + "loss": 0.689, + "step": 2730 + }, + { + "epoch": 0.8768662706694493, + "grad_norm": 0.9584747552871704, + "learning_rate": 0.0001823136368534442, + "loss": 0.6896, + "step": 2731 + }, + { + "epoch": 0.8771873494943009, + "grad_norm": 1.115478754043579, + "learning_rate": 0.00018229381035303718, + "loss": 0.72, + "step": 2732 + }, + { + "epoch": 0.8775084283191523, + "grad_norm": 0.9085679650306702, + "learning_rate": 0.000182273973825379, + "loss": 0.6926, + "step": 2733 + }, + { + "epoch": 0.8778295071440039, + "grad_norm": 1.5002387762069702, + "learning_rate": 0.00018225412727288667, + "loss": 1.0068, + "step": 2734 + }, + { + "epoch": 0.8781505859688553, + "grad_norm": 1.0805754661560059, + "learning_rate": 0.00018223427069797844, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.8784716647937069, + "grad_norm": 1.1780469417572021, + "learning_rate": 0.00018221440410307374, + "loss": 0.798, + "step": 2736 + }, + { + "epoch": 0.8787927436185583, + "grad_norm": 1.034372091293335, + "learning_rate": 0.0001821945274905933, + "loss": 1.0309, + "step": 2737 + }, + { + "epoch": 0.8791138224434099, + "grad_norm": 1.680379867553711, + "learning_rate": 0.00018217464086295904, + "loss": 1.0186, + "step": 2738 + }, + { + "epoch": 0.8794349012682614, + "grad_norm": 1.7660083770751953, + "learning_rate": 0.00018215474422259402, + "loss": 0.8119, + "step": 2739 + }, + { + "epoch": 0.8797559800931128, + "grad_norm": 1.747609257698059, + "learning_rate": 0.00018213483757192263, + "loss": 0.5625, + "step": 2740 + }, + { + "epoch": 0.8800770589179644, + "grad_norm": 1.156343936920166, + "learning_rate": 0.00018211492091337042, + "loss": 1.0188, + "step": 2741 + }, + { + "epoch": 0.8803981377428158, + "grad_norm": 1.104516863822937, + "learning_rate": 0.00018209499424936415, + "loss": 0.6375, + "step": 2742 + }, + { + "epoch": 0.8807192165676674, + "grad_norm": 3.744687080383301, + "learning_rate": 0.0001820750575823319, + "loss": 0.9079, + "step": 2743 + }, + { + "epoch": 0.8810402953925188, + "grad_norm": 1.1239784955978394, + "learning_rate": 0.00018205511091470283, + "loss": 0.7289, + "step": 2744 + }, + { + "epoch": 0.8813613742173704, + "grad_norm": 0.7483261823654175, + "learning_rate": 0.0001820351542489074, + "loss": 0.5682, + "step": 2745 + }, + { + "epoch": 0.8816824530422218, + "grad_norm": 1.107564926147461, + "learning_rate": 0.00018201518758737724, + "loss": 0.6602, + "step": 2746 + }, + { + "epoch": 0.8820035318670734, + "grad_norm": 4.356315612792969, + "learning_rate": 0.00018199521093254523, + "loss": 0.7124, + "step": 2747 + }, + { + "epoch": 0.8823246106919249, + "grad_norm": 1.0629299879074097, + "learning_rate": 0.00018197522428684552, + "loss": 0.4317, + "step": 2748 + }, + { + "epoch": 0.8826456895167764, + "grad_norm": 0.7678312063217163, + "learning_rate": 0.0001819552276527134, + "loss": 0.447, + "step": 2749 + }, + { + "epoch": 0.8829667683416279, + "grad_norm": 1.091256856918335, + "learning_rate": 0.00018193522103258537, + "loss": 0.5988, + "step": 2750 + }, + { + "epoch": 0.8832878471664793, + "grad_norm": 1.013898253440857, + "learning_rate": 0.0001819152044288992, + "loss": 1.3746, + "step": 2751 + }, + { + "epoch": 0.8836089259913309, + "grad_norm": 0.9919261336326599, + "learning_rate": 0.00018189517784409381, + "loss": 1.3712, + "step": 2752 + }, + { + "epoch": 0.8839300048161823, + "grad_norm": 1.0880647897720337, + "learning_rate": 0.00018187514128060946, + "loss": 0.9107, + "step": 2753 + }, + { + "epoch": 0.8842510836410339, + "grad_norm": 1.6532173156738281, + "learning_rate": 0.0001818550947408875, + "loss": 0.8713, + "step": 2754 + }, + { + "epoch": 0.8845721624658853, + "grad_norm": 0.9895550608634949, + "learning_rate": 0.0001818350382273705, + "loss": 0.6474, + "step": 2755 + }, + { + "epoch": 0.8848932412907369, + "grad_norm": 1.1515357494354248, + "learning_rate": 0.00018181497174250236, + "loss": 0.5816, + "step": 2756 + }, + { + "epoch": 0.8852143201155884, + "grad_norm": 1.0654630661010742, + "learning_rate": 0.00018179489528872807, + "loss": 0.5709, + "step": 2757 + }, + { + "epoch": 0.8855353989404399, + "grad_norm": 1.0934091806411743, + "learning_rate": 0.00018177480886849388, + "loss": 0.9673, + "step": 2758 + }, + { + "epoch": 0.8858564777652914, + "grad_norm": 2.52614426612854, + "learning_rate": 0.0001817547124842473, + "loss": 0.9605, + "step": 2759 + }, + { + "epoch": 0.8861775565901429, + "grad_norm": 0.8149087429046631, + "learning_rate": 0.00018173460613843701, + "loss": 0.981, + "step": 2760 + }, + { + "epoch": 0.8864986354149944, + "grad_norm": 0.758175253868103, + "learning_rate": 0.00018171448983351284, + "loss": 0.7961, + "step": 2761 + }, + { + "epoch": 0.8868197142398458, + "grad_norm": 0.8692430257797241, + "learning_rate": 0.00018169436357192602, + "loss": 0.782, + "step": 2762 + }, + { + "epoch": 0.8871407930646974, + "grad_norm": 1.1529359817504883, + "learning_rate": 0.00018167422735612877, + "loss": 0.9836, + "step": 2763 + }, + { + "epoch": 0.8874618718895488, + "grad_norm": 0.9792761206626892, + "learning_rate": 0.00018165408118857464, + "loss": 0.9161, + "step": 2764 + }, + { + "epoch": 0.8877829507144004, + "grad_norm": 1.1091268062591553, + "learning_rate": 0.00018163392507171842, + "loss": 0.9312, + "step": 2765 + }, + { + "epoch": 0.8881040295392519, + "grad_norm": 0.8531142473220825, + "learning_rate": 0.00018161375900801604, + "loss": 0.6937, + "step": 2766 + }, + { + "epoch": 0.8884251083641034, + "grad_norm": 0.9302113652229309, + "learning_rate": 0.00018159358299992467, + "loss": 0.9284, + "step": 2767 + }, + { + "epoch": 0.8887461871889549, + "grad_norm": 1.1668531894683838, + "learning_rate": 0.00018157339704990275, + "loss": 1.0318, + "step": 2768 + }, + { + "epoch": 0.8890672660138064, + "grad_norm": 0.8978874683380127, + "learning_rate": 0.00018155320116040982, + "loss": 0.9906, + "step": 2769 + }, + { + "epoch": 0.8893883448386579, + "grad_norm": 1.0586960315704346, + "learning_rate": 0.00018153299533390672, + "loss": 0.8478, + "step": 2770 + }, + { + "epoch": 0.8897094236635094, + "grad_norm": 1.5482786893844604, + "learning_rate": 0.00018151277957285543, + "loss": 0.8452, + "step": 2771 + }, + { + "epoch": 0.8900305024883609, + "grad_norm": 1.1627875566482544, + "learning_rate": 0.00018149255387971922, + "loss": 1.1749, + "step": 2772 + }, + { + "epoch": 0.8903515813132123, + "grad_norm": 1.5141947269439697, + "learning_rate": 0.00018147231825696252, + "loss": 0.9226, + "step": 2773 + }, + { + "epoch": 0.8906726601380639, + "grad_norm": 0.9088945984840393, + "learning_rate": 0.00018145207270705096, + "loss": 0.8038, + "step": 2774 + }, + { + "epoch": 0.8909937389629154, + "grad_norm": 1.2037333250045776, + "learning_rate": 0.0001814318172324514, + "loss": 0.926, + "step": 2775 + }, + { + "epoch": 0.8913148177877669, + "grad_norm": 1.1055127382278442, + "learning_rate": 0.00018141155183563193, + "loss": 0.8784, + "step": 2776 + }, + { + "epoch": 0.8916358966126184, + "grad_norm": 0.8244720101356506, + "learning_rate": 0.00018139127651906184, + "loss": 0.8521, + "step": 2777 + }, + { + "epoch": 0.8919569754374699, + "grad_norm": 1.0560851097106934, + "learning_rate": 0.00018137099128521156, + "loss": 0.7876, + "step": 2778 + }, + { + "epoch": 0.8922780542623214, + "grad_norm": 0.7370025515556335, + "learning_rate": 0.0001813506961365528, + "loss": 0.7918, + "step": 2779 + }, + { + "epoch": 0.8925991330871729, + "grad_norm": 0.9496878385543823, + "learning_rate": 0.00018133039107555852, + "loss": 0.9127, + "step": 2780 + }, + { + "epoch": 0.8929202119120244, + "grad_norm": 0.897685706615448, + "learning_rate": 0.00018131007610470276, + "loss": 0.8017, + "step": 2781 + }, + { + "epoch": 0.8932412907368759, + "grad_norm": 1.8280360698699951, + "learning_rate": 0.0001812897512264609, + "loss": 1.2183, + "step": 2782 + }, + { + "epoch": 0.8935623695617274, + "grad_norm": 1.164359450340271, + "learning_rate": 0.0001812694164433094, + "loss": 0.8879, + "step": 2783 + }, + { + "epoch": 0.893883448386579, + "grad_norm": 0.8923983573913574, + "learning_rate": 0.00018124907175772604, + "loss": 0.8414, + "step": 2784 + }, + { + "epoch": 0.8942045272114304, + "grad_norm": 1.2509326934814453, + "learning_rate": 0.0001812287171721897, + "loss": 1.0194, + "step": 2785 + }, + { + "epoch": 0.894525606036282, + "grad_norm": 0.9996488094329834, + "learning_rate": 0.00018120835268918063, + "loss": 0.9708, + "step": 2786 + }, + { + "epoch": 0.8948466848611334, + "grad_norm": 0.8612101078033447, + "learning_rate": 0.0001811879783111801, + "loss": 0.7101, + "step": 2787 + }, + { + "epoch": 0.8951677636859849, + "grad_norm": 1.1495945453643799, + "learning_rate": 0.00018116759404067064, + "loss": 0.7831, + "step": 2788 + }, + { + "epoch": 0.8954888425108364, + "grad_norm": 0.9605495929718018, + "learning_rate": 0.00018114719988013612, + "loss": 0.6721, + "step": 2789 + }, + { + "epoch": 0.8958099213356879, + "grad_norm": 1.1470704078674316, + "learning_rate": 0.00018112679583206137, + "loss": 0.6784, + "step": 2790 + }, + { + "epoch": 0.8961310001605394, + "grad_norm": 1.1517149209976196, + "learning_rate": 0.00018110638189893267, + "loss": 0.9704, + "step": 2791 + }, + { + "epoch": 0.8964520789853909, + "grad_norm": 1.7360931634902954, + "learning_rate": 0.00018108595808323736, + "loss": 0.7464, + "step": 2792 + }, + { + "epoch": 0.8967731578102424, + "grad_norm": 0.8036109805107117, + "learning_rate": 0.000181065524387464, + "loss": 0.7334, + "step": 2793 + }, + { + "epoch": 0.8970942366350939, + "grad_norm": 0.5651756525039673, + "learning_rate": 0.0001810450808141024, + "loss": 0.5383, + "step": 2794 + }, + { + "epoch": 0.8974153154599455, + "grad_norm": 1.0314606428146362, + "learning_rate": 0.00018102462736564355, + "loss": 0.6901, + "step": 2795 + }, + { + "epoch": 0.8977363942847969, + "grad_norm": 1.6273174285888672, + "learning_rate": 0.00018100416404457961, + "loss": 0.7406, + "step": 2796 + }, + { + "epoch": 0.8980574731096485, + "grad_norm": 1.643372893333435, + "learning_rate": 0.00018098369085340398, + "loss": 0.6796, + "step": 2797 + }, + { + "epoch": 0.8983785519344999, + "grad_norm": 1.2636449337005615, + "learning_rate": 0.00018096320779461132, + "loss": 0.6756, + "step": 2798 + }, + { + "epoch": 0.8986996307593514, + "grad_norm": 1.1232649087905884, + "learning_rate": 0.00018094271487069735, + "loss": 0.5207, + "step": 2799 + }, + { + "epoch": 0.8990207095842029, + "grad_norm": 0.5236806869506836, + "learning_rate": 0.00018092221208415907, + "loss": 0.4062, + "step": 2800 + }, + { + "epoch": 0.8993417884090544, + "grad_norm": 0.7692060470581055, + "learning_rate": 0.00018090169943749476, + "loss": 0.9762, + "step": 2801 + }, + { + "epoch": 0.8996628672339059, + "grad_norm": 0.8355540633201599, + "learning_rate": 0.00018088117693320374, + "loss": 1.2168, + "step": 2802 + }, + { + "epoch": 0.8999839460587574, + "grad_norm": 0.8926533460617065, + "learning_rate": 0.00018086064457378665, + "loss": 0.7676, + "step": 2803 + }, + { + "epoch": 0.900305024883609, + "grad_norm": 1.108173131942749, + "learning_rate": 0.00018084010236174534, + "loss": 0.552, + "step": 2804 + }, + { + "epoch": 0.9006261037084604, + "grad_norm": 1.1268749237060547, + "learning_rate": 0.00018081955029958274, + "loss": 0.5069, + "step": 2805 + }, + { + "epoch": 0.900947182533312, + "grad_norm": 1.0203336477279663, + "learning_rate": 0.00018079898838980305, + "loss": 0.6792, + "step": 2806 + }, + { + "epoch": 0.9012682613581634, + "grad_norm": 1.2625064849853516, + "learning_rate": 0.00018077841663491175, + "loss": 0.6916, + "step": 2807 + }, + { + "epoch": 0.901589340183015, + "grad_norm": 1.1094204187393188, + "learning_rate": 0.0001807578350374154, + "loss": 1.146, + "step": 2808 + }, + { + "epoch": 0.9019104190078664, + "grad_norm": 1.1322005987167358, + "learning_rate": 0.00018073724359982186, + "loss": 0.8353, + "step": 2809 + }, + { + "epoch": 0.9022314978327179, + "grad_norm": 0.9779659509658813, + "learning_rate": 0.00018071664232464002, + "loss": 0.7259, + "step": 2810 + }, + { + "epoch": 0.9025525766575694, + "grad_norm": 1.0345367193222046, + "learning_rate": 0.00018069603121438022, + "loss": 0.7205, + "step": 2811 + }, + { + "epoch": 0.9028736554824209, + "grad_norm": 1.5466442108154297, + "learning_rate": 0.00018067541027155375, + "loss": 1.3155, + "step": 2812 + }, + { + "epoch": 0.9031947343072725, + "grad_norm": 1.0183089971542358, + "learning_rate": 0.00018065477949867327, + "loss": 0.7655, + "step": 2813 + }, + { + "epoch": 0.9035158131321239, + "grad_norm": 0.7953174114227295, + "learning_rate": 0.00018063413889825254, + "loss": 0.8727, + "step": 2814 + }, + { + "epoch": 0.9038368919569755, + "grad_norm": 1.6211440563201904, + "learning_rate": 0.0001806134884728066, + "loss": 0.9523, + "step": 2815 + }, + { + "epoch": 0.9041579707818269, + "grad_norm": 1.3157118558883667, + "learning_rate": 0.00018059282822485158, + "loss": 1.0292, + "step": 2816 + }, + { + "epoch": 0.9044790496066785, + "grad_norm": 1.2418993711471558, + "learning_rate": 0.00018057215815690494, + "loss": 0.8633, + "step": 2817 + }, + { + "epoch": 0.9048001284315299, + "grad_norm": 1.4473196268081665, + "learning_rate": 0.00018055147827148523, + "loss": 0.8806, + "step": 2818 + }, + { + "epoch": 0.9051212072563815, + "grad_norm": 1.058814525604248, + "learning_rate": 0.0001805307885711122, + "loss": 0.8762, + "step": 2819 + }, + { + "epoch": 0.9054422860812329, + "grad_norm": 0.8622485995292664, + "learning_rate": 0.0001805100890583069, + "loss": 0.8043, + "step": 2820 + }, + { + "epoch": 0.9057633649060844, + "grad_norm": 1.5182201862335205, + "learning_rate": 0.0001804893797355914, + "loss": 1.0228, + "step": 2821 + }, + { + "epoch": 0.906084443730936, + "grad_norm": 0.8368403315544128, + "learning_rate": 0.00018046866060548918, + "loss": 0.7839, + "step": 2822 + }, + { + "epoch": 0.9064055225557874, + "grad_norm": 1.0724780559539795, + "learning_rate": 0.00018044793167052477, + "loss": 0.9187, + "step": 2823 + }, + { + "epoch": 0.906726601380639, + "grad_norm": 1.226404070854187, + "learning_rate": 0.00018042719293322388, + "loss": 0.9023, + "step": 2824 + }, + { + "epoch": 0.9070476802054904, + "grad_norm": 0.865028977394104, + "learning_rate": 0.00018040644439611348, + "loss": 0.6866, + "step": 2825 + }, + { + "epoch": 0.907368759030342, + "grad_norm": 1.3817094564437866, + "learning_rate": 0.00018038568606172173, + "loss": 1.2747, + "step": 2826 + }, + { + "epoch": 0.9076898378551934, + "grad_norm": 1.4625390768051147, + "learning_rate": 0.00018036491793257798, + "loss": 0.9156, + "step": 2827 + }, + { + "epoch": 0.908010916680045, + "grad_norm": 1.0158188343048096, + "learning_rate": 0.00018034414001121278, + "loss": 0.9246, + "step": 2828 + }, + { + "epoch": 0.9083319955048964, + "grad_norm": 0.98722243309021, + "learning_rate": 0.0001803233523001578, + "loss": 1.1086, + "step": 2829 + }, + { + "epoch": 0.908653074329748, + "grad_norm": 0.8799543976783752, + "learning_rate": 0.000180302554801946, + "loss": 0.724, + "step": 2830 + }, + { + "epoch": 0.9089741531545995, + "grad_norm": 1.1787347793579102, + "learning_rate": 0.00018028174751911146, + "loss": 1.0376, + "step": 2831 + }, + { + "epoch": 0.909295231979451, + "grad_norm": 1.1940038204193115, + "learning_rate": 0.00018026093045418954, + "loss": 0.7911, + "step": 2832 + }, + { + "epoch": 0.9096163108043025, + "grad_norm": 0.9036272168159485, + "learning_rate": 0.0001802401036097167, + "loss": 0.7765, + "step": 2833 + }, + { + "epoch": 0.9099373896291539, + "grad_norm": 1.148139238357544, + "learning_rate": 0.00018021926698823059, + "loss": 0.8591, + "step": 2834 + }, + { + "epoch": 0.9102584684540055, + "grad_norm": 0.8454437851905823, + "learning_rate": 0.00018019842059227012, + "loss": 0.7655, + "step": 2835 + }, + { + "epoch": 0.9105795472788569, + "grad_norm": 1.5326919555664062, + "learning_rate": 0.0001801775644243754, + "loss": 0.9971, + "step": 2836 + }, + { + "epoch": 0.9109006261037085, + "grad_norm": 1.6905158758163452, + "learning_rate": 0.00018015669848708767, + "loss": 1.1873, + "step": 2837 + }, + { + "epoch": 0.9112217049285599, + "grad_norm": 0.8212069869041443, + "learning_rate": 0.00018013582278294935, + "loss": 0.7388, + "step": 2838 + }, + { + "epoch": 0.9115427837534115, + "grad_norm": 1.243600606918335, + "learning_rate": 0.00018011493731450413, + "loss": 0.8089, + "step": 2839 + }, + { + "epoch": 0.911863862578263, + "grad_norm": 1.1280215978622437, + "learning_rate": 0.0001800940420842968, + "loss": 0.8168, + "step": 2840 + }, + { + "epoch": 0.9121849414031145, + "grad_norm": 1.2804031372070312, + "learning_rate": 0.00018007313709487334, + "loss": 1.0121, + "step": 2841 + }, + { + "epoch": 0.912506020227966, + "grad_norm": 1.406071662902832, + "learning_rate": 0.0001800522223487811, + "loss": 0.8801, + "step": 2842 + }, + { + "epoch": 0.9128270990528174, + "grad_norm": 1.1531569957733154, + "learning_rate": 0.0001800312978485683, + "loss": 0.9636, + "step": 2843 + }, + { + "epoch": 0.913148177877669, + "grad_norm": 2.095679521560669, + "learning_rate": 0.00018001036359678469, + "loss": 0.9504, + "step": 2844 + }, + { + "epoch": 0.9134692567025204, + "grad_norm": 0.9581215381622314, + "learning_rate": 0.00017998941959598095, + "loss": 0.798, + "step": 2845 + }, + { + "epoch": 0.913790335527372, + "grad_norm": 0.78875333070755, + "learning_rate": 0.00017996846584870908, + "loss": 0.7823, + "step": 2846 + }, + { + "epoch": 0.9141114143522234, + "grad_norm": 1.2259639501571655, + "learning_rate": 0.0001799475023575222, + "loss": 0.8185, + "step": 2847 + }, + { + "epoch": 0.914432493177075, + "grad_norm": 1.0644735097885132, + "learning_rate": 0.00017992652912497464, + "loss": 0.6927, + "step": 2848 + }, + { + "epoch": 0.9147535720019264, + "grad_norm": 1.1323128938674927, + "learning_rate": 0.00017990554615362198, + "loss": 0.7886, + "step": 2849 + }, + { + "epoch": 0.915074650826778, + "grad_norm": 0.6815614700317383, + "learning_rate": 0.00017988455344602092, + "loss": 0.5591, + "step": 2850 + }, + { + "epoch": 0.9153957296516295, + "grad_norm": 0.9420286417007446, + "learning_rate": 0.00017986355100472928, + "loss": 1.0964, + "step": 2851 + }, + { + "epoch": 0.915716808476481, + "grad_norm": 0.9961267113685608, + "learning_rate": 0.00017984253883230627, + "loss": 1.2273, + "step": 2852 + }, + { + "epoch": 0.9160378873013325, + "grad_norm": 0.790002167224884, + "learning_rate": 0.00017982151693131203, + "loss": 0.8297, + "step": 2853 + }, + { + "epoch": 0.916358966126184, + "grad_norm": 1.0446445941925049, + "learning_rate": 0.0001798004853043081, + "loss": 0.7856, + "step": 2854 + }, + { + "epoch": 0.9166800449510355, + "grad_norm": 0.9418595433235168, + "learning_rate": 0.0001797794439538571, + "loss": 0.5708, + "step": 2855 + }, + { + "epoch": 0.9170011237758869, + "grad_norm": 1.0176951885223389, + "learning_rate": 0.00017975839288252287, + "loss": 0.6077, + "step": 2856 + }, + { + "epoch": 0.9173222026007385, + "grad_norm": 1.1099709272384644, + "learning_rate": 0.00017973733209287036, + "loss": 0.7867, + "step": 2857 + }, + { + "epoch": 0.9176432814255899, + "grad_norm": 1.4851493835449219, + "learning_rate": 0.00017971626158746584, + "loss": 1.0428, + "step": 2858 + }, + { + "epoch": 0.9179643602504415, + "grad_norm": 1.387786865234375, + "learning_rate": 0.00017969518136887663, + "loss": 0.9532, + "step": 2859 + }, + { + "epoch": 0.918285439075293, + "grad_norm": 0.9932065010070801, + "learning_rate": 0.00017967409143967132, + "loss": 0.8475, + "step": 2860 + }, + { + "epoch": 0.9186065179001445, + "grad_norm": 0.8978595733642578, + "learning_rate": 0.00017965299180241963, + "loss": 1.0245, + "step": 2861 + }, + { + "epoch": 0.918927596724996, + "grad_norm": 1.0536555051803589, + "learning_rate": 0.00017963188245969253, + "loss": 0.814, + "step": 2862 + }, + { + "epoch": 0.9192486755498475, + "grad_norm": 1.004948616027832, + "learning_rate": 0.00017961076341406208, + "loss": 0.7993, + "step": 2863 + }, + { + "epoch": 0.919569754374699, + "grad_norm": 0.9587133526802063, + "learning_rate": 0.0001795896346681016, + "loss": 0.7497, + "step": 2864 + }, + { + "epoch": 0.9198908331995505, + "grad_norm": 0.9380829930305481, + "learning_rate": 0.00017956849622438554, + "loss": 0.7871, + "step": 2865 + }, + { + "epoch": 0.920211912024402, + "grad_norm": 0.9536609053611755, + "learning_rate": 0.00017954734808548958, + "loss": 0.8406, + "step": 2866 + }, + { + "epoch": 0.9205329908492534, + "grad_norm": 0.9843786954879761, + "learning_rate": 0.00017952619025399057, + "loss": 0.9366, + "step": 2867 + }, + { + "epoch": 0.920854069674105, + "grad_norm": 0.8176027536392212, + "learning_rate": 0.00017950502273246649, + "loss": 0.8689, + "step": 2868 + }, + { + "epoch": 0.9211751484989565, + "grad_norm": 0.9969504475593567, + "learning_rate": 0.00017948384552349657, + "loss": 1.0414, + "step": 2869 + }, + { + "epoch": 0.921496227323808, + "grad_norm": 1.0049349069595337, + "learning_rate": 0.00017946265862966114, + "loss": 0.8943, + "step": 2870 + }, + { + "epoch": 0.9218173061486595, + "grad_norm": 1.8080761432647705, + "learning_rate": 0.00017944146205354182, + "loss": 1.0632, + "step": 2871 + }, + { + "epoch": 0.922138384973511, + "grad_norm": 0.8137750625610352, + "learning_rate": 0.00017942025579772132, + "loss": 0.7279, + "step": 2872 + }, + { + "epoch": 0.9224594637983625, + "grad_norm": 0.9603304862976074, + "learning_rate": 0.00017939903986478355, + "loss": 0.9531, + "step": 2873 + }, + { + "epoch": 0.922780542623214, + "grad_norm": 1.556550145149231, + "learning_rate": 0.0001793778142573136, + "loss": 0.8103, + "step": 2874 + }, + { + "epoch": 0.9231016214480655, + "grad_norm": 1.2223728895187378, + "learning_rate": 0.0001793565789778978, + "loss": 0.9159, + "step": 2875 + }, + { + "epoch": 0.923422700272917, + "grad_norm": 1.2416236400604248, + "learning_rate": 0.00017933533402912354, + "loss": 0.8169, + "step": 2876 + }, + { + "epoch": 0.9237437790977685, + "grad_norm": 1.7050341367721558, + "learning_rate": 0.00017931407941357947, + "loss": 0.9111, + "step": 2877 + }, + { + "epoch": 0.92406485792262, + "grad_norm": 1.7200475931167603, + "learning_rate": 0.0001792928151338554, + "loss": 1.0214, + "step": 2878 + }, + { + "epoch": 0.9243859367474715, + "grad_norm": 1.3585445880889893, + "learning_rate": 0.00017927154119254236, + "loss": 1.0152, + "step": 2879 + }, + { + "epoch": 0.924707015572323, + "grad_norm": 1.661501407623291, + "learning_rate": 0.00017925025759223245, + "loss": 0.7003, + "step": 2880 + }, + { + "epoch": 0.9250280943971745, + "grad_norm": 0.8686938285827637, + "learning_rate": 0.00017922896433551907, + "loss": 0.8456, + "step": 2881 + }, + { + "epoch": 0.925349173222026, + "grad_norm": 1.1402549743652344, + "learning_rate": 0.00017920766142499672, + "loss": 0.91, + "step": 2882 + }, + { + "epoch": 0.9256702520468775, + "grad_norm": 1.2883620262145996, + "learning_rate": 0.00017918634886326108, + "loss": 0.834, + "step": 2883 + }, + { + "epoch": 0.925991330871729, + "grad_norm": 0.8248382210731506, + "learning_rate": 0.00017916502665290903, + "loss": 0.7228, + "step": 2884 + }, + { + "epoch": 0.9263124096965805, + "grad_norm": 1.0151740312576294, + "learning_rate": 0.0001791436947965386, + "loss": 0.8951, + "step": 2885 + }, + { + "epoch": 0.926633488521432, + "grad_norm": 1.5327341556549072, + "learning_rate": 0.00017912235329674902, + "loss": 0.912, + "step": 2886 + }, + { + "epoch": 0.9269545673462836, + "grad_norm": 1.1290961503982544, + "learning_rate": 0.0001791010021561407, + "loss": 0.8542, + "step": 2887 + }, + { + "epoch": 0.927275646171135, + "grad_norm": 1.0894137620925903, + "learning_rate": 0.0001790796413773152, + "loss": 0.9589, + "step": 2888 + }, + { + "epoch": 0.9275967249959866, + "grad_norm": 1.0757745504379272, + "learning_rate": 0.0001790582709628753, + "loss": 0.9288, + "step": 2889 + }, + { + "epoch": 0.927917803820838, + "grad_norm": 0.8483561277389526, + "learning_rate": 0.0001790368909154249, + "loss": 0.852, + "step": 2890 + }, + { + "epoch": 0.9282388826456895, + "grad_norm": 1.5198390483856201, + "learning_rate": 0.00017901550123756906, + "loss": 0.7621, + "step": 2891 + }, + { + "epoch": 0.928559961470541, + "grad_norm": 1.2075666189193726, + "learning_rate": 0.00017899410193191406, + "loss": 0.9783, + "step": 2892 + }, + { + "epoch": 0.9288810402953925, + "grad_norm": 1.1517733335494995, + "learning_rate": 0.00017897269300106737, + "loss": 0.9489, + "step": 2893 + }, + { + "epoch": 0.929202119120244, + "grad_norm": 1.0166940689086914, + "learning_rate": 0.0001789512744476376, + "loss": 0.8871, + "step": 2894 + }, + { + "epoch": 0.9295231979450955, + "grad_norm": 0.7956641912460327, + "learning_rate": 0.0001789298462742345, + "loss": 0.6786, + "step": 2895 + }, + { + "epoch": 0.9298442767699471, + "grad_norm": 1.2172540426254272, + "learning_rate": 0.00017890840848346908, + "loss": 0.7532, + "step": 2896 + }, + { + "epoch": 0.9301653555947985, + "grad_norm": 1.0997369289398193, + "learning_rate": 0.00017888696107795342, + "loss": 0.8759, + "step": 2897 + }, + { + "epoch": 0.9304864344196501, + "grad_norm": 0.9977442026138306, + "learning_rate": 0.00017886550406030085, + "loss": 0.7112, + "step": 2898 + }, + { + "epoch": 0.9308075132445015, + "grad_norm": 1.1181156635284424, + "learning_rate": 0.00017884403743312582, + "loss": 0.6298, + "step": 2899 + }, + { + "epoch": 0.9311285920693531, + "grad_norm": 0.6041698455810547, + "learning_rate": 0.00017882256119904403, + "loss": 0.4929, + "step": 2900 + }, + { + "epoch": 0.9314496708942045, + "grad_norm": 0.8137570023536682, + "learning_rate": 0.00017880107536067218, + "loss": 0.9051, + "step": 2901 + }, + { + "epoch": 0.931770749719056, + "grad_norm": 0.838915228843689, + "learning_rate": 0.0001787795799206284, + "loss": 1.2534, + "step": 2902 + }, + { + "epoch": 0.9320918285439075, + "grad_norm": 0.7378472685813904, + "learning_rate": 0.00017875807488153175, + "loss": 0.6947, + "step": 2903 + }, + { + "epoch": 0.932412907368759, + "grad_norm": 0.8197500705718994, + "learning_rate": 0.00017873656024600254, + "loss": 0.7199, + "step": 2904 + }, + { + "epoch": 0.9327339861936105, + "grad_norm": 1.34345281124115, + "learning_rate": 0.00017871503601666233, + "loss": 0.737, + "step": 2905 + }, + { + "epoch": 0.933055065018462, + "grad_norm": 1.0809118747711182, + "learning_rate": 0.00017869350219613375, + "loss": 0.5367, + "step": 2906 + }, + { + "epoch": 0.9333761438433136, + "grad_norm": 0.9395135045051575, + "learning_rate": 0.0001786719587870406, + "loss": 0.4426, + "step": 2907 + }, + { + "epoch": 0.933697222668165, + "grad_norm": 0.8438148498535156, + "learning_rate": 0.00017865040579200794, + "loss": 0.5694, + "step": 2908 + }, + { + "epoch": 0.9340183014930166, + "grad_norm": 0.836493194103241, + "learning_rate": 0.00017862884321366188, + "loss": 0.8198, + "step": 2909 + }, + { + "epoch": 0.934339380317868, + "grad_norm": 1.9980072975158691, + "learning_rate": 0.0001786072710546298, + "loss": 0.9351, + "step": 2910 + }, + { + "epoch": 0.9346604591427196, + "grad_norm": 1.1404621601104736, + "learning_rate": 0.0001785856893175402, + "loss": 0.93, + "step": 2911 + }, + { + "epoch": 0.934981537967571, + "grad_norm": 1.2216063737869263, + "learning_rate": 0.00017856409800502272, + "loss": 0.811, + "step": 2912 + }, + { + "epoch": 0.9353026167924225, + "grad_norm": 0.7324502468109131, + "learning_rate": 0.00017854249711970818, + "loss": 0.6464, + "step": 2913 + }, + { + "epoch": 0.935623695617274, + "grad_norm": 0.8018614053726196, + "learning_rate": 0.00017852088666422863, + "loss": 0.8989, + "step": 2914 + }, + { + "epoch": 0.9359447744421255, + "grad_norm": 0.8852442502975464, + "learning_rate": 0.00017849926664121726, + "loss": 0.7535, + "step": 2915 + }, + { + "epoch": 0.9362658532669771, + "grad_norm": 1.2402286529541016, + "learning_rate": 0.0001784776370533083, + "loss": 0.9203, + "step": 2916 + }, + { + "epoch": 0.9365869320918285, + "grad_norm": 0.9982452988624573, + "learning_rate": 0.00017845599790313735, + "loss": 0.8699, + "step": 2917 + }, + { + "epoch": 0.9369080109166801, + "grad_norm": 1.3398115634918213, + "learning_rate": 0.000178434349193341, + "loss": 0.9541, + "step": 2918 + }, + { + "epoch": 0.9372290897415315, + "grad_norm": 1.2586543560028076, + "learning_rate": 0.00017841269092655715, + "loss": 0.8738, + "step": 2919 + }, + { + "epoch": 0.9375501685663831, + "grad_norm": 0.7111466526985168, + "learning_rate": 0.00017839102310542477, + "loss": 0.7364, + "step": 2920 + }, + { + "epoch": 0.9378712473912345, + "grad_norm": 1.377344012260437, + "learning_rate": 0.000178369345732584, + "loss": 0.9916, + "step": 2921 + }, + { + "epoch": 0.9381923262160861, + "grad_norm": 0.881392240524292, + "learning_rate": 0.00017834765881067616, + "loss": 0.8958, + "step": 2922 + }, + { + "epoch": 0.9385134050409375, + "grad_norm": 0.8309992551803589, + "learning_rate": 0.00017832596234234376, + "loss": 0.6358, + "step": 2923 + }, + { + "epoch": 0.938834483865789, + "grad_norm": 0.9630143046379089, + "learning_rate": 0.00017830425633023043, + "loss": 0.7877, + "step": 2924 + }, + { + "epoch": 0.9391555626906406, + "grad_norm": 0.92414790391922, + "learning_rate": 0.000178282540776981, + "loss": 0.9282, + "step": 2925 + }, + { + "epoch": 0.939476641515492, + "grad_norm": 1.1467437744140625, + "learning_rate": 0.0001782608156852414, + "loss": 0.8531, + "step": 2926 + }, + { + "epoch": 0.9397977203403436, + "grad_norm": 1.1309285163879395, + "learning_rate": 0.0001782390810576588, + "loss": 0.969, + "step": 2927 + }, + { + "epoch": 0.940118799165195, + "grad_norm": 1.0550163984298706, + "learning_rate": 0.00017821733689688153, + "loss": 0.8446, + "step": 2928 + }, + { + "epoch": 0.9404398779900466, + "grad_norm": 0.9224303364753723, + "learning_rate": 0.000178195583205559, + "loss": 0.9563, + "step": 2929 + }, + { + "epoch": 0.940760956814898, + "grad_norm": 1.0767320394515991, + "learning_rate": 0.00017817381998634185, + "loss": 0.8241, + "step": 2930 + }, + { + "epoch": 0.9410820356397496, + "grad_norm": 0.7739982604980469, + "learning_rate": 0.00017815204724188187, + "loss": 0.5874, + "step": 2931 + }, + { + "epoch": 0.941403114464601, + "grad_norm": 1.8265200853347778, + "learning_rate": 0.000178130264974832, + "loss": 1.0106, + "step": 2932 + }, + { + "epoch": 0.9417241932894526, + "grad_norm": 0.7935619950294495, + "learning_rate": 0.0001781084731878463, + "loss": 0.7087, + "step": 2933 + }, + { + "epoch": 0.9420452721143041, + "grad_norm": 1.314613938331604, + "learning_rate": 0.00017808667188358012, + "loss": 1.1337, + "step": 2934 + }, + { + "epoch": 0.9423663509391556, + "grad_norm": 1.5377808809280396, + "learning_rate": 0.00017806486106468981, + "loss": 1.074, + "step": 2935 + }, + { + "epoch": 0.9426874297640071, + "grad_norm": 0.8148975968360901, + "learning_rate": 0.000178043040733833, + "loss": 0.6838, + "step": 2936 + }, + { + "epoch": 0.9430085085888585, + "grad_norm": 1.7382673025131226, + "learning_rate": 0.00017802121089366836, + "loss": 0.731, + "step": 2937 + }, + { + "epoch": 0.9433295874137101, + "grad_norm": 1.164584994316101, + "learning_rate": 0.00017799937154685586, + "loss": 1.0691, + "step": 2938 + }, + { + "epoch": 0.9436506662385615, + "grad_norm": 1.2241766452789307, + "learning_rate": 0.00017797752269605653, + "loss": 0.9289, + "step": 2939 + }, + { + "epoch": 0.9439717450634131, + "grad_norm": 1.3293641805648804, + "learning_rate": 0.00017795566434393258, + "loss": 0.7479, + "step": 2940 + }, + { + "epoch": 0.9442928238882645, + "grad_norm": 1.1661382913589478, + "learning_rate": 0.00017793379649314744, + "loss": 0.9162, + "step": 2941 + }, + { + "epoch": 0.9446139027131161, + "grad_norm": 1.2080588340759277, + "learning_rate": 0.00017791191914636554, + "loss": 0.7161, + "step": 2942 + }, + { + "epoch": 0.9449349815379676, + "grad_norm": 1.0982539653778076, + "learning_rate": 0.00017789003230625266, + "loss": 0.8409, + "step": 2943 + }, + { + "epoch": 0.9452560603628191, + "grad_norm": 1.0143113136291504, + "learning_rate": 0.00017786813597547562, + "loss": 0.7521, + "step": 2944 + }, + { + "epoch": 0.9455771391876706, + "grad_norm": 1.105790615081787, + "learning_rate": 0.00017784623015670238, + "loss": 0.9074, + "step": 2945 + }, + { + "epoch": 0.945898218012522, + "grad_norm": 0.9993615746498108, + "learning_rate": 0.00017782431485260212, + "loss": 0.8263, + "step": 2946 + }, + { + "epoch": 0.9462192968373736, + "grad_norm": 0.969996452331543, + "learning_rate": 0.00017780239006584515, + "loss": 0.6984, + "step": 2947 + }, + { + "epoch": 0.946540375662225, + "grad_norm": 0.934131383895874, + "learning_rate": 0.00017778045579910302, + "loss": 0.7838, + "step": 2948 + }, + { + "epoch": 0.9468614544870766, + "grad_norm": 0.7031441330909729, + "learning_rate": 0.0001777585120550482, + "loss": 0.4672, + "step": 2949 + }, + { + "epoch": 0.947182533311928, + "grad_norm": 1.3736737966537476, + "learning_rate": 0.0001777365588363546, + "loss": 0.6343, + "step": 2950 + }, + { + "epoch": 0.9475036121367796, + "grad_norm": 1.2793693542480469, + "learning_rate": 0.0001777145961456971, + "loss": 1.0671, + "step": 2951 + }, + { + "epoch": 0.9478246909616311, + "grad_norm": 1.0697038173675537, + "learning_rate": 0.0001776926239857518, + "loss": 0.9461, + "step": 2952 + }, + { + "epoch": 0.9481457697864826, + "grad_norm": 1.163883090019226, + "learning_rate": 0.00017767064235919592, + "loss": 0.8222, + "step": 2953 + }, + { + "epoch": 0.9484668486113341, + "grad_norm": 1.1357632875442505, + "learning_rate": 0.00017764865126870786, + "loss": 0.6185, + "step": 2954 + }, + { + "epoch": 0.9487879274361856, + "grad_norm": 1.044090986251831, + "learning_rate": 0.0001776266507169672, + "loss": 0.5034, + "step": 2955 + }, + { + "epoch": 0.9491090062610371, + "grad_norm": 1.0913854837417603, + "learning_rate": 0.0001776046407066546, + "loss": 0.4259, + "step": 2956 + }, + { + "epoch": 0.9494300850858886, + "grad_norm": 0.9846095442771912, + "learning_rate": 0.00017758262124045195, + "loss": 0.464, + "step": 2957 + }, + { + "epoch": 0.9497511639107401, + "grad_norm": 1.2047371864318848, + "learning_rate": 0.0001775605923210422, + "loss": 0.9376, + "step": 2958 + }, + { + "epoch": 0.9500722427355915, + "grad_norm": 2.1599411964416504, + "learning_rate": 0.0001775385539511096, + "loss": 1.0841, + "step": 2959 + }, + { + "epoch": 0.9503933215604431, + "grad_norm": 1.1163110733032227, + "learning_rate": 0.00017751650613333935, + "loss": 0.8837, + "step": 2960 + }, + { + "epoch": 0.9507144003852946, + "grad_norm": 1.0459928512573242, + "learning_rate": 0.00017749444887041799, + "loss": 0.9614, + "step": 2961 + }, + { + "epoch": 0.9510354792101461, + "grad_norm": 0.897926390171051, + "learning_rate": 0.00017747238216503307, + "loss": 0.8908, + "step": 2962 + }, + { + "epoch": 0.9513565580349976, + "grad_norm": 0.9355558753013611, + "learning_rate": 0.00017745030601987337, + "loss": 0.8321, + "step": 2963 + }, + { + "epoch": 0.9516776368598491, + "grad_norm": 0.7442376613616943, + "learning_rate": 0.00017742822043762888, + "loss": 0.627, + "step": 2964 + }, + { + "epoch": 0.9519987156847006, + "grad_norm": 0.7983213067054749, + "learning_rate": 0.00017740612542099053, + "loss": 0.8255, + "step": 2965 + }, + { + "epoch": 0.9523197945095521, + "grad_norm": 1.141676425933838, + "learning_rate": 0.00017738402097265064, + "loss": 0.9653, + "step": 2966 + }, + { + "epoch": 0.9526408733344036, + "grad_norm": 0.9923720955848694, + "learning_rate": 0.0001773619070953025, + "loss": 0.7411, + "step": 2967 + }, + { + "epoch": 0.952961952159255, + "grad_norm": 1.1766963005065918, + "learning_rate": 0.00017733978379164066, + "loss": 1.008, + "step": 2968 + }, + { + "epoch": 0.9532830309841066, + "grad_norm": 1.0631762742996216, + "learning_rate": 0.00017731765106436073, + "loss": 0.8621, + "step": 2969 + }, + { + "epoch": 0.953604109808958, + "grad_norm": 0.9990087747573853, + "learning_rate": 0.00017729550891615957, + "loss": 0.8143, + "step": 2970 + }, + { + "epoch": 0.9539251886338096, + "grad_norm": 0.9863772988319397, + "learning_rate": 0.00017727335734973512, + "loss": 0.8346, + "step": 2971 + }, + { + "epoch": 0.9542462674586611, + "grad_norm": 1.3948047161102295, + "learning_rate": 0.00017725119636778644, + "loss": 0.7758, + "step": 2972 + }, + { + "epoch": 0.9545673462835126, + "grad_norm": 1.3944754600524902, + "learning_rate": 0.00017722902597301383, + "loss": 0.9348, + "step": 2973 + }, + { + "epoch": 0.9548884251083641, + "grad_norm": 1.196573257446289, + "learning_rate": 0.00017720684616811866, + "loss": 0.9216, + "step": 2974 + }, + { + "epoch": 0.9552095039332156, + "grad_norm": 1.4773893356323242, + "learning_rate": 0.0001771846569558035, + "loss": 1.0486, + "step": 2975 + }, + { + "epoch": 0.9555305827580671, + "grad_norm": 1.0879054069519043, + "learning_rate": 0.00017716245833877201, + "loss": 0.7509, + "step": 2976 + }, + { + "epoch": 0.9558516615829186, + "grad_norm": 0.9991624355316162, + "learning_rate": 0.00017714025031972903, + "loss": 0.7418, + "step": 2977 + }, + { + "epoch": 0.9561727404077701, + "grad_norm": 1.0229377746582031, + "learning_rate": 0.00017711803290138052, + "loss": 0.8292, + "step": 2978 + }, + { + "epoch": 0.9564938192326216, + "grad_norm": 0.9216927289962769, + "learning_rate": 0.00017709580608643363, + "loss": 0.7256, + "step": 2979 + }, + { + "epoch": 0.9568148980574731, + "grad_norm": 1.1415516138076782, + "learning_rate": 0.0001770735698775966, + "loss": 0.9657, + "step": 2980 + }, + { + "epoch": 0.9571359768823247, + "grad_norm": 1.4062246084213257, + "learning_rate": 0.00017705132427757895, + "loss": 1.0777, + "step": 2981 + }, + { + "epoch": 0.9574570557071761, + "grad_norm": 1.2299398183822632, + "learning_rate": 0.00017702906928909108, + "loss": 1.1786, + "step": 2982 + }, + { + "epoch": 0.9577781345320276, + "grad_norm": 1.3134931325912476, + "learning_rate": 0.0001770068049148448, + "loss": 1.0092, + "step": 2983 + }, + { + "epoch": 0.9580992133568791, + "grad_norm": 1.0825444459915161, + "learning_rate": 0.00017698453115755293, + "loss": 0.6504, + "step": 2984 + }, + { + "epoch": 0.9584202921817306, + "grad_norm": 1.2085427045822144, + "learning_rate": 0.00017696224801992945, + "loss": 0.9662, + "step": 2985 + }, + { + "epoch": 0.9587413710065821, + "grad_norm": 1.1950358152389526, + "learning_rate": 0.0001769399555046895, + "loss": 0.8202, + "step": 2986 + }, + { + "epoch": 0.9590624498314336, + "grad_norm": 1.1132547855377197, + "learning_rate": 0.00017691765361454938, + "loss": 0.9621, + "step": 2987 + }, + { + "epoch": 0.9593835286562851, + "grad_norm": 0.9960354566574097, + "learning_rate": 0.00017689534235222648, + "loss": 0.9149, + "step": 2988 + }, + { + "epoch": 0.9597046074811366, + "grad_norm": 1.1196013689041138, + "learning_rate": 0.00017687302172043933, + "loss": 0.8249, + "step": 2989 + }, + { + "epoch": 0.9600256863059882, + "grad_norm": 1.1874531507492065, + "learning_rate": 0.00017685069172190766, + "loss": 0.8245, + "step": 2990 + }, + { + "epoch": 0.9603467651308396, + "grad_norm": 1.0298069715499878, + "learning_rate": 0.00017682835235935236, + "loss": 0.7936, + "step": 2991 + }, + { + "epoch": 0.9606678439556912, + "grad_norm": 1.3679097890853882, + "learning_rate": 0.00017680600363549533, + "loss": 0.8939, + "step": 2992 + }, + { + "epoch": 0.9609889227805426, + "grad_norm": 0.9797439575195312, + "learning_rate": 0.00017678364555305978, + "loss": 0.6787, + "step": 2993 + }, + { + "epoch": 0.9613100016053941, + "grad_norm": 1.0256136655807495, + "learning_rate": 0.00017676127811476987, + "loss": 0.6678, + "step": 2994 + }, + { + "epoch": 0.9616310804302456, + "grad_norm": 1.0464802980422974, + "learning_rate": 0.0001767389013233511, + "loss": 0.6863, + "step": 2995 + }, + { + "epoch": 0.9619521592550971, + "grad_norm": 1.0149427652359009, + "learning_rate": 0.00017671651518153, + "loss": 0.6783, + "step": 2996 + }, + { + "epoch": 0.9622732380799486, + "grad_norm": 0.8050268292427063, + "learning_rate": 0.00017669411969203417, + "loss": 0.7203, + "step": 2997 + }, + { + "epoch": 0.9625943169048001, + "grad_norm": 0.896780252456665, + "learning_rate": 0.00017667171485759252, + "loss": 0.575, + "step": 2998 + }, + { + "epoch": 0.9629153957296517, + "grad_norm": 0.813828706741333, + "learning_rate": 0.00017664930068093498, + "loss": 0.7544, + "step": 2999 + }, + { + "epoch": 0.9632364745545031, + "grad_norm": 0.922319769859314, + "learning_rate": 0.00017662687716479266, + "loss": 0.5257, + "step": 3000 + }, + { + "epoch": 0.9635575533793547, + "grad_norm": 1.0877915620803833, + "learning_rate": 0.0001766044443118978, + "loss": 1.0061, + "step": 3001 + }, + { + "epoch": 0.9638786322042061, + "grad_norm": 1.045781135559082, + "learning_rate": 0.00017658200212498378, + "loss": 0.9707, + "step": 3002 + }, + { + "epoch": 0.9641997110290577, + "grad_norm": 1.023611307144165, + "learning_rate": 0.00017655955060678506, + "loss": 0.7057, + "step": 3003 + }, + { + "epoch": 0.9645207898539091, + "grad_norm": 1.043655514717102, + "learning_rate": 0.00017653708976003737, + "loss": 0.5738, + "step": 3004 + }, + { + "epoch": 0.9648418686787607, + "grad_norm": 0.8784222602844238, + "learning_rate": 0.00017651461958747745, + "loss": 0.5103, + "step": 3005 + }, + { + "epoch": 0.9651629475036121, + "grad_norm": 1.0596736669540405, + "learning_rate": 0.0001764921400918432, + "loss": 0.7845, + "step": 3006 + }, + { + "epoch": 0.9654840263284636, + "grad_norm": 1.0180422067642212, + "learning_rate": 0.0001764696512758737, + "loss": 0.8355, + "step": 3007 + }, + { + "epoch": 0.9658051051533152, + "grad_norm": 0.8986467719078064, + "learning_rate": 0.00017644715314230918, + "loss": 0.8457, + "step": 3008 + }, + { + "epoch": 0.9661261839781666, + "grad_norm": 0.9878740906715393, + "learning_rate": 0.0001764246456938909, + "loss": 0.7791, + "step": 3009 + }, + { + "epoch": 0.9664472628030182, + "grad_norm": 0.8736189603805542, + "learning_rate": 0.00017640212893336142, + "loss": 0.8261, + "step": 3010 + }, + { + "epoch": 0.9667683416278696, + "grad_norm": 0.9508904218673706, + "learning_rate": 0.00017637960286346425, + "loss": 0.8671, + "step": 3011 + }, + { + "epoch": 0.9670894204527212, + "grad_norm": 1.029610514640808, + "learning_rate": 0.00017635706748694413, + "loss": 0.8247, + "step": 3012 + }, + { + "epoch": 0.9674104992775726, + "grad_norm": 1.0193253755569458, + "learning_rate": 0.000176334522806547, + "loss": 0.8908, + "step": 3013 + }, + { + "epoch": 0.9677315781024242, + "grad_norm": 1.048719882965088, + "learning_rate": 0.00017631196882501973, + "loss": 0.985, + "step": 3014 + }, + { + "epoch": 0.9680526569272756, + "grad_norm": 0.8610148429870605, + "learning_rate": 0.00017628940554511061, + "loss": 0.7015, + "step": 3015 + }, + { + "epoch": 0.9683737357521272, + "grad_norm": 0.8026429414749146, + "learning_rate": 0.00017626683296956882, + "loss": 0.7906, + "step": 3016 + }, + { + "epoch": 0.9686948145769787, + "grad_norm": 1.1094107627868652, + "learning_rate": 0.0001762442511011448, + "loss": 1.057, + "step": 3017 + }, + { + "epoch": 0.9690158934018301, + "grad_norm": 1.203913688659668, + "learning_rate": 0.00017622165994259, + "loss": 0.7759, + "step": 3018 + }, + { + "epoch": 0.9693369722266817, + "grad_norm": 0.8827437162399292, + "learning_rate": 0.0001761990594966572, + "loss": 0.948, + "step": 3019 + }, + { + "epoch": 0.9696580510515331, + "grad_norm": 1.1221593618392944, + "learning_rate": 0.0001761764497661001, + "loss": 0.9794, + "step": 3020 + }, + { + "epoch": 0.9699791298763847, + "grad_norm": 0.9867079854011536, + "learning_rate": 0.0001761538307536737, + "loss": 1.082, + "step": 3021 + }, + { + "epoch": 0.9703002087012361, + "grad_norm": 5.697073936462402, + "learning_rate": 0.000176131202462134, + "loss": 1.1723, + "step": 3022 + }, + { + "epoch": 0.9706212875260877, + "grad_norm": 0.8379837870597839, + "learning_rate": 0.0001761085648942382, + "loss": 0.6992, + "step": 3023 + }, + { + "epoch": 0.9709423663509391, + "grad_norm": 1.10336172580719, + "learning_rate": 0.00017608591805274464, + "loss": 0.9453, + "step": 3024 + }, + { + "epoch": 0.9712634451757907, + "grad_norm": 1.2008827924728394, + "learning_rate": 0.00017606326194041273, + "loss": 0.8227, + "step": 3025 + }, + { + "epoch": 0.9715845240006421, + "grad_norm": 0.9085438251495361, + "learning_rate": 0.0001760405965600031, + "loss": 0.8305, + "step": 3026 + }, + { + "epoch": 0.9719056028254937, + "grad_norm": 1.2480196952819824, + "learning_rate": 0.00017601792191427741, + "loss": 1.0148, + "step": 3027 + }, + { + "epoch": 0.9722266816503452, + "grad_norm": 0.7692457437515259, + "learning_rate": 0.0001759952380059986, + "loss": 0.7948, + "step": 3028 + }, + { + "epoch": 0.9725477604751966, + "grad_norm": 1.0976073741912842, + "learning_rate": 0.00017597254483793048, + "loss": 0.9424, + "step": 3029 + }, + { + "epoch": 0.9728688393000482, + "grad_norm": 0.9526044130325317, + "learning_rate": 0.00017594984241283825, + "loss": 0.8496, + "step": 3030 + }, + { + "epoch": 0.9731899181248996, + "grad_norm": 0.7856842279434204, + "learning_rate": 0.00017592713073348807, + "loss": 0.6682, + "step": 3031 + }, + { + "epoch": 0.9735109969497512, + "grad_norm": 1.1540440320968628, + "learning_rate": 0.00017590440980264738, + "loss": 0.8165, + "step": 3032 + }, + { + "epoch": 0.9738320757746026, + "grad_norm": 1.6081081628799438, + "learning_rate": 0.00017588167962308458, + "loss": 1.0609, + "step": 3033 + }, + { + "epoch": 0.9741531545994542, + "grad_norm": 0.7435811758041382, + "learning_rate": 0.00017585894019756925, + "loss": 0.6044, + "step": 3034 + }, + { + "epoch": 0.9744742334243056, + "grad_norm": 0.9327528476715088, + "learning_rate": 0.0001758361915288722, + "loss": 0.8865, + "step": 3035 + }, + { + "epoch": 0.9747953122491572, + "grad_norm": 0.8505873680114746, + "learning_rate": 0.00017581343361976524, + "loss": 0.7442, + "step": 3036 + }, + { + "epoch": 0.9751163910740087, + "grad_norm": 0.8487008213996887, + "learning_rate": 0.00017579066647302133, + "loss": 0.7235, + "step": 3037 + }, + { + "epoch": 0.9754374698988602, + "grad_norm": 1.0709935426712036, + "learning_rate": 0.00017576789009141465, + "loss": 0.9574, + "step": 3038 + }, + { + "epoch": 0.9757585487237117, + "grad_norm": 0.9425415992736816, + "learning_rate": 0.00017574510447772039, + "loss": 0.7343, + "step": 3039 + }, + { + "epoch": 0.9760796275485631, + "grad_norm": 1.0272222757339478, + "learning_rate": 0.00017572230963471488, + "loss": 0.9407, + "step": 3040 + }, + { + "epoch": 0.9764007063734147, + "grad_norm": 1.2393730878829956, + "learning_rate": 0.00017569950556517566, + "loss": 0.93, + "step": 3041 + }, + { + "epoch": 0.9767217851982661, + "grad_norm": 1.0345895290374756, + "learning_rate": 0.00017567669227188128, + "loss": 0.6844, + "step": 3042 + }, + { + "epoch": 0.9770428640231177, + "grad_norm": 1.0678569078445435, + "learning_rate": 0.0001756538697576115, + "loss": 0.5873, + "step": 3043 + }, + { + "epoch": 0.9773639428479691, + "grad_norm": 1.1300995349884033, + "learning_rate": 0.0001756310380251472, + "loss": 0.8044, + "step": 3044 + }, + { + "epoch": 0.9776850216728207, + "grad_norm": 1.4100874662399292, + "learning_rate": 0.00017560819707727033, + "loss": 0.9445, + "step": 3045 + }, + { + "epoch": 0.9780061004976722, + "grad_norm": 0.9837530255317688, + "learning_rate": 0.00017558534691676397, + "loss": 0.6677, + "step": 3046 + }, + { + "epoch": 0.9783271793225237, + "grad_norm": 1.2359012365341187, + "learning_rate": 0.00017556248754641235, + "loss": 0.8114, + "step": 3047 + }, + { + "epoch": 0.9786482581473752, + "grad_norm": 0.6732703447341919, + "learning_rate": 0.00017553961896900087, + "loss": 0.5295, + "step": 3048 + }, + { + "epoch": 0.9789693369722267, + "grad_norm": 1.0407007932662964, + "learning_rate": 0.00017551674118731591, + "loss": 0.714, + "step": 3049 + }, + { + "epoch": 0.9792904157970782, + "grad_norm": 0.6249756217002869, + "learning_rate": 0.00017549385420414514, + "loss": 0.4261, + "step": 3050 + }, + { + "epoch": 0.9796114946219296, + "grad_norm": 0.8719810247421265, + "learning_rate": 0.00017547095802227723, + "loss": 0.9618, + "step": 3051 + }, + { + "epoch": 0.9799325734467812, + "grad_norm": 0.9490910768508911, + "learning_rate": 0.00017544805264450196, + "loss": 0.925, + "step": 3052 + }, + { + "epoch": 0.9802536522716326, + "grad_norm": 1.0451079607009888, + "learning_rate": 0.00017542513807361037, + "loss": 0.8166, + "step": 3053 + }, + { + "epoch": 0.9805747310964842, + "grad_norm": 1.0744620561599731, + "learning_rate": 0.00017540221431239453, + "loss": 0.5847, + "step": 3054 + }, + { + "epoch": 0.9808958099213357, + "grad_norm": 1.3825839757919312, + "learning_rate": 0.00017537928136364755, + "loss": 0.4744, + "step": 3055 + }, + { + "epoch": 0.9812168887461872, + "grad_norm": 1.0428171157836914, + "learning_rate": 0.0001753563392301638, + "loss": 0.7581, + "step": 3056 + }, + { + "epoch": 0.9815379675710387, + "grad_norm": 1.382125973701477, + "learning_rate": 0.0001753333879147387, + "loss": 0.9721, + "step": 3057 + }, + { + "epoch": 0.9818590463958902, + "grad_norm": 0.9676405787467957, + "learning_rate": 0.00017531042742016876, + "loss": 0.9361, + "step": 3058 + }, + { + "epoch": 0.9821801252207417, + "grad_norm": 0.9411641955375671, + "learning_rate": 0.00017528745774925172, + "loss": 0.9277, + "step": 3059 + }, + { + "epoch": 0.9825012040455932, + "grad_norm": 1.0995005369186401, + "learning_rate": 0.00017526447890478633, + "loss": 0.9208, + "step": 3060 + }, + { + "epoch": 0.9828222828704447, + "grad_norm": 0.9930598735809326, + "learning_rate": 0.00017524149088957245, + "loss": 0.798, + "step": 3061 + }, + { + "epoch": 0.9831433616952961, + "grad_norm": 0.6785845756530762, + "learning_rate": 0.00017521849370641114, + "loss": 0.8495, + "step": 3062 + }, + { + "epoch": 0.9834644405201477, + "grad_norm": 0.7817926406860352, + "learning_rate": 0.00017519548735810456, + "loss": 0.787, + "step": 3063 + }, + { + "epoch": 0.9837855193449992, + "grad_norm": 0.8893129825592041, + "learning_rate": 0.00017517247184745593, + "loss": 0.9347, + "step": 3064 + }, + { + "epoch": 0.9841065981698507, + "grad_norm": 0.8218941688537598, + "learning_rate": 0.00017514944717726962, + "loss": 0.7465, + "step": 3065 + }, + { + "epoch": 0.9844276769947022, + "grad_norm": 1.1597498655319214, + "learning_rate": 0.00017512641335035113, + "loss": 0.9377, + "step": 3066 + }, + { + "epoch": 0.9847487558195537, + "grad_norm": 1.1089355945587158, + "learning_rate": 0.00017510337036950703, + "loss": 1.0179, + "step": 3067 + }, + { + "epoch": 0.9850698346444052, + "grad_norm": 0.9803544878959656, + "learning_rate": 0.0001750803182375451, + "loss": 1.0501, + "step": 3068 + }, + { + "epoch": 0.9853909134692567, + "grad_norm": 1.3686444759368896, + "learning_rate": 0.00017505725695727412, + "loss": 1.0379, + "step": 3069 + }, + { + "epoch": 0.9857119922941082, + "grad_norm": 1.3570899963378906, + "learning_rate": 0.00017503418653150405, + "loss": 0.8903, + "step": 3070 + }, + { + "epoch": 0.9860330711189597, + "grad_norm": 1.031009554862976, + "learning_rate": 0.00017501110696304596, + "loss": 0.8283, + "step": 3071 + }, + { + "epoch": 0.9863541499438112, + "grad_norm": 1.3445180654525757, + "learning_rate": 0.00017498801825471203, + "loss": 0.9019, + "step": 3072 + }, + { + "epoch": 0.9866752287686628, + "grad_norm": 0.8621981143951416, + "learning_rate": 0.00017496492040931552, + "loss": 0.7137, + "step": 3073 + }, + { + "epoch": 0.9869963075935142, + "grad_norm": 1.229825735092163, + "learning_rate": 0.00017494181342967083, + "loss": 1.0208, + "step": 3074 + }, + { + "epoch": 0.9873173864183657, + "grad_norm": 0.9832011461257935, + "learning_rate": 0.00017491869731859353, + "loss": 0.7463, + "step": 3075 + }, + { + "epoch": 0.9876384652432172, + "grad_norm": 0.7911801934242249, + "learning_rate": 0.00017489557207890023, + "loss": 0.8144, + "step": 3076 + }, + { + "epoch": 0.9879595440680687, + "grad_norm": 0.849236011505127, + "learning_rate": 0.0001748724377134086, + "loss": 0.7658, + "step": 3077 + }, + { + "epoch": 0.9882806228929202, + "grad_norm": 0.7961871027946472, + "learning_rate": 0.0001748492942249376, + "loss": 0.8704, + "step": 3078 + }, + { + "epoch": 0.9886017017177717, + "grad_norm": 1.293888807296753, + "learning_rate": 0.00017482614161630714, + "loss": 0.9067, + "step": 3079 + }, + { + "epoch": 0.9889227805426232, + "grad_norm": 1.098433256149292, + "learning_rate": 0.00017480297989033825, + "loss": 0.9446, + "step": 3080 + }, + { + "epoch": 0.9892438593674747, + "grad_norm": 1.4394235610961914, + "learning_rate": 0.0001747798090498532, + "loss": 0.7206, + "step": 3081 + }, + { + "epoch": 0.9895649381923263, + "grad_norm": 1.047020435333252, + "learning_rate": 0.00017475662909767522, + "loss": 0.9727, + "step": 3082 + }, + { + "epoch": 0.9898860170171777, + "grad_norm": 0.9796097278594971, + "learning_rate": 0.00017473344003662877, + "loss": 0.8787, + "step": 3083 + }, + { + "epoch": 0.9902070958420293, + "grad_norm": 0.9924256801605225, + "learning_rate": 0.00017471024186953936, + "loss": 0.7212, + "step": 3084 + }, + { + "epoch": 0.9905281746668807, + "grad_norm": 1.5609381198883057, + "learning_rate": 0.00017468703459923356, + "loss": 0.9614, + "step": 3085 + }, + { + "epoch": 0.9908492534917323, + "grad_norm": 0.9042816162109375, + "learning_rate": 0.00017466381822853915, + "loss": 0.8655, + "step": 3086 + }, + { + "epoch": 0.9911703323165837, + "grad_norm": 1.2502557039260864, + "learning_rate": 0.00017464059276028497, + "loss": 0.8242, + "step": 3087 + }, + { + "epoch": 0.9914914111414352, + "grad_norm": 1.919909119606018, + "learning_rate": 0.00017461735819730096, + "loss": 0.8001, + "step": 3088 + }, + { + "epoch": 0.9918124899662867, + "grad_norm": 1.0362643003463745, + "learning_rate": 0.00017459411454241822, + "loss": 0.6697, + "step": 3089 + }, + { + "epoch": 0.9921335687911382, + "grad_norm": 1.6059205532073975, + "learning_rate": 0.00017457086179846888, + "loss": 0.898, + "step": 3090 + }, + { + "epoch": 0.9924546476159897, + "grad_norm": 1.2551214694976807, + "learning_rate": 0.00017454759996828623, + "loss": 0.8499, + "step": 3091 + }, + { + "epoch": 0.9927757264408412, + "grad_norm": 0.9736458659172058, + "learning_rate": 0.00017452432905470464, + "loss": 0.8884, + "step": 3092 + }, + { + "epoch": 0.9930968052656928, + "grad_norm": 1.4679756164550781, + "learning_rate": 0.00017450104906055963, + "loss": 0.9162, + "step": 3093 + }, + { + "epoch": 0.9934178840905442, + "grad_norm": 0.9897777438163757, + "learning_rate": 0.00017447775998868776, + "loss": 0.7051, + "step": 3094 + }, + { + "epoch": 0.9937389629153958, + "grad_norm": 0.9767149686813354, + "learning_rate": 0.00017445446184192676, + "loss": 0.7716, + "step": 3095 + }, + { + "epoch": 0.9940600417402472, + "grad_norm": 1.2053884267807007, + "learning_rate": 0.0001744311546231154, + "loss": 0.8858, + "step": 3096 + }, + { + "epoch": 0.9943811205650988, + "grad_norm": 1.160662293434143, + "learning_rate": 0.00017440783833509366, + "loss": 0.5346, + "step": 3097 + }, + { + "epoch": 0.9947021993899502, + "grad_norm": 0.992720365524292, + "learning_rate": 0.00017438451298070252, + "loss": 0.6864, + "step": 3098 + }, + { + "epoch": 0.9950232782148017, + "grad_norm": 0.705855667591095, + "learning_rate": 0.0001743611785627841, + "loss": 0.6184, + "step": 3099 + }, + { + "epoch": 0.9953443570396532, + "grad_norm": 1.4650824069976807, + "learning_rate": 0.00017433783508418162, + "loss": 0.4631, + "step": 3100 + }, + { + "epoch": 0.9956654358645047, + "grad_norm": 1.113871693611145, + "learning_rate": 0.00017431448254773944, + "loss": 1.1905, + "step": 3101 + }, + { + "epoch": 0.9959865146893563, + "grad_norm": 1.3432302474975586, + "learning_rate": 0.00017429112095630295, + "loss": 0.6531, + "step": 3102 + }, + { + "epoch": 0.9963075935142077, + "grad_norm": 1.2041120529174805, + "learning_rate": 0.00017426775031271875, + "loss": 0.8234, + "step": 3103 + }, + { + "epoch": 0.9966286723390593, + "grad_norm": 0.980356752872467, + "learning_rate": 0.00017424437061983446, + "loss": 1.0048, + "step": 3104 + }, + { + "epoch": 0.9969497511639107, + "grad_norm": 0.9739211201667786, + "learning_rate": 0.00017422098188049883, + "loss": 0.648, + "step": 3105 + }, + { + "epoch": 0.9972708299887623, + "grad_norm": 1.0046653747558594, + "learning_rate": 0.00017419758409756164, + "loss": 0.8891, + "step": 3106 + }, + { + "epoch": 0.9975919088136137, + "grad_norm": 0.8817589282989502, + "learning_rate": 0.00017417417727387394, + "loss": 0.8698, + "step": 3107 + }, + { + "epoch": 0.9979129876384653, + "grad_norm": 0.8024824857711792, + "learning_rate": 0.0001741507614122877, + "loss": 0.7637, + "step": 3108 + }, + { + "epoch": 0.9982340664633167, + "grad_norm": 1.0565115213394165, + "learning_rate": 0.0001741273365156561, + "loss": 0.9549, + "step": 3109 + }, + { + "epoch": 0.9985551452881682, + "grad_norm": 1.4340211153030396, + "learning_rate": 0.00017410390258683345, + "loss": 0.9085, + "step": 3110 + }, + { + "epoch": 0.9988762241130198, + "grad_norm": 0.8469492197036743, + "learning_rate": 0.000174080459628675, + "loss": 0.7797, + "step": 3111 + }, + { + "epoch": 0.9991973029378712, + "grad_norm": 2.2130372524261475, + "learning_rate": 0.00017405700764403726, + "loss": 0.7562, + "step": 3112 + }, + { + "epoch": 0.9995183817627228, + "grad_norm": 0.984209418296814, + "learning_rate": 0.00017403354663577783, + "loss": 0.9092, + "step": 3113 + }, + { + "epoch": 0.9998394605875742, + "grad_norm": 1.7540405988693237, + "learning_rate": 0.00017401007660675525, + "loss": 0.6898, + "step": 3114 + }, + { + "epoch": 1.0001605394124258, + "grad_norm": 0.4434870779514313, + "learning_rate": 0.00017398659755982936, + "loss": 0.5678, + "step": 3115 + }, + { + "epoch": 1.0004816182372773, + "grad_norm": 0.6553979516029358, + "learning_rate": 0.000173963109497861, + "loss": 1.0352, + "step": 3116 + }, + { + "epoch": 1.0008026970621287, + "grad_norm": 0.8394825458526611, + "learning_rate": 0.00017393961242371205, + "loss": 0.4912, + "step": 3117 + }, + { + "epoch": 1.0011237758869802, + "grad_norm": 1.0343304872512817, + "learning_rate": 0.00017391610634024564, + "loss": 0.6326, + "step": 3118 + }, + { + "epoch": 1.0014448547118318, + "grad_norm": 0.8101858496665955, + "learning_rate": 0.0001738925912503259, + "loss": 0.5709, + "step": 3119 + }, + { + "epoch": 1.0017659335366833, + "grad_norm": 0.8755748271942139, + "learning_rate": 0.000173869067156818, + "loss": 0.377, + "step": 3120 + }, + { + "epoch": 1.0020870123615349, + "grad_norm": 0.9467462301254272, + "learning_rate": 0.00017384553406258842, + "loss": 0.6883, + "step": 3121 + }, + { + "epoch": 1.0024080911863862, + "grad_norm": 0.8870365619659424, + "learning_rate": 0.0001738219919705044, + "loss": 0.7174, + "step": 3122 + }, + { + "epoch": 1.0027291700112377, + "grad_norm": 1.0948724746704102, + "learning_rate": 0.00017379844088343468, + "loss": 0.9455, + "step": 3123 + }, + { + "epoch": 1.0030502488360893, + "grad_norm": 0.8313069939613342, + "learning_rate": 0.00017377488080424876, + "loss": 0.6838, + "step": 3124 + }, + { + "epoch": 1.0033713276609408, + "grad_norm": 0.9020499587059021, + "learning_rate": 0.0001737513117358174, + "loss": 0.7241, + "step": 3125 + }, + { + "epoch": 1.0036924064857922, + "grad_norm": 1.03434419631958, + "learning_rate": 0.0001737277336810124, + "loss": 0.5053, + "step": 3126 + }, + { + "epoch": 1.0040134853106437, + "grad_norm": 0.6919084191322327, + "learning_rate": 0.00017370414664270674, + "loss": 0.5537, + "step": 3127 + }, + { + "epoch": 1.0043345641354953, + "grad_norm": 0.7697117924690247, + "learning_rate": 0.00017368055062377434, + "loss": 0.5852, + "step": 3128 + }, + { + "epoch": 1.0046556429603468, + "grad_norm": 0.8752188086509705, + "learning_rate": 0.00017365694562709034, + "loss": 0.6626, + "step": 3129 + }, + { + "epoch": 1.0049767217851984, + "grad_norm": 1.3550692796707153, + "learning_rate": 0.00017363333165553092, + "loss": 0.5762, + "step": 3130 + }, + { + "epoch": 1.0052978006100497, + "grad_norm": 1.0785512924194336, + "learning_rate": 0.00017360970871197346, + "loss": 0.6023, + "step": 3131 + }, + { + "epoch": 1.0056188794349012, + "grad_norm": 0.6877568960189819, + "learning_rate": 0.0001735860767992962, + "loss": 0.5769, + "step": 3132 + }, + { + "epoch": 1.0059399582597528, + "grad_norm": 1.3127740621566772, + "learning_rate": 0.00017356243592037875, + "loss": 0.5348, + "step": 3133 + }, + { + "epoch": 1.0062610370846043, + "grad_norm": 1.041380763053894, + "learning_rate": 0.0001735387860781016, + "loss": 0.8355, + "step": 3134 + }, + { + "epoch": 1.0065821159094557, + "grad_norm": 1.9624297618865967, + "learning_rate": 0.00017351512727534644, + "loss": 0.7688, + "step": 3135 + }, + { + "epoch": 1.0069031947343072, + "grad_norm": 1.034769892692566, + "learning_rate": 0.000173491459514996, + "loss": 0.6289, + "step": 3136 + }, + { + "epoch": 1.0072242735591588, + "grad_norm": 0.9164260625839233, + "learning_rate": 0.00017346778279993415, + "loss": 0.6022, + "step": 3137 + }, + { + "epoch": 1.0075453523840103, + "grad_norm": 0.9046801328659058, + "learning_rate": 0.0001734440971330458, + "loss": 0.7448, + "step": 3138 + }, + { + "epoch": 1.0078664312088619, + "grad_norm": 1.0306909084320068, + "learning_rate": 0.00017342040251721702, + "loss": 0.738, + "step": 3139 + }, + { + "epoch": 1.0081875100337132, + "grad_norm": 1.137430191040039, + "learning_rate": 0.0001733966989553349, + "loss": 0.7579, + "step": 3140 + }, + { + "epoch": 1.0085085888585648, + "grad_norm": 0.7871647477149963, + "learning_rate": 0.00017337298645028764, + "loss": 0.5093, + "step": 3141 + }, + { + "epoch": 1.0088296676834163, + "grad_norm": 0.9174445867538452, + "learning_rate": 0.00017334926500496456, + "loss": 0.6771, + "step": 3142 + }, + { + "epoch": 1.0091507465082679, + "grad_norm": 0.8707671761512756, + "learning_rate": 0.00017332553462225602, + "loss": 0.698, + "step": 3143 + }, + { + "epoch": 1.0094718253331192, + "grad_norm": 1.3214155435562134, + "learning_rate": 0.00017330179530505348, + "loss": 0.7583, + "step": 3144 + }, + { + "epoch": 1.0097929041579707, + "grad_norm": 0.8207698464393616, + "learning_rate": 0.00017327804705624957, + "loss": 0.6534, + "step": 3145 + }, + { + "epoch": 1.0101139829828223, + "grad_norm": 0.9219654202461243, + "learning_rate": 0.0001732542898787379, + "loss": 0.5564, + "step": 3146 + }, + { + "epoch": 1.0104350618076738, + "grad_norm": 0.9626970887184143, + "learning_rate": 0.00017323052377541317, + "loss": 0.758, + "step": 3147 + }, + { + "epoch": 1.0107561406325254, + "grad_norm": 0.8818027377128601, + "learning_rate": 0.0001732067487491713, + "loss": 0.7285, + "step": 3148 + }, + { + "epoch": 1.0110772194573767, + "grad_norm": 1.1400665044784546, + "learning_rate": 0.0001731829648029091, + "loss": 0.7988, + "step": 3149 + }, + { + "epoch": 1.0113982982822283, + "grad_norm": 1.296278476715088, + "learning_rate": 0.0001731591719395247, + "loss": 0.5509, + "step": 3150 + }, + { + "epoch": 1.0117193771070798, + "grad_norm": 0.886254608631134, + "learning_rate": 0.00017313537016191706, + "loss": 0.5616, + "step": 3151 + }, + { + "epoch": 1.0120404559319314, + "grad_norm": 0.9640228748321533, + "learning_rate": 0.00017311155947298643, + "loss": 0.6448, + "step": 3152 + }, + { + "epoch": 1.0123615347567827, + "grad_norm": 1.8710490465164185, + "learning_rate": 0.00017308773987563406, + "loss": 0.5566, + "step": 3153 + }, + { + "epoch": 1.0126826135816343, + "grad_norm": 1.0365729331970215, + "learning_rate": 0.00017306391137276224, + "loss": 0.7364, + "step": 3154 + }, + { + "epoch": 1.0130036924064858, + "grad_norm": 1.0165393352508545, + "learning_rate": 0.00017304007396727448, + "loss": 0.6858, + "step": 3155 + }, + { + "epoch": 1.0133247712313374, + "grad_norm": 1.3366596698760986, + "learning_rate": 0.00017301622766207527, + "loss": 0.6922, + "step": 3156 + }, + { + "epoch": 1.013645850056189, + "grad_norm": 1.253536581993103, + "learning_rate": 0.00017299237246007015, + "loss": 0.7062, + "step": 3157 + }, + { + "epoch": 1.0139669288810402, + "grad_norm": 0.9793636202812195, + "learning_rate": 0.00017296850836416588, + "loss": 0.6105, + "step": 3158 + }, + { + "epoch": 1.0142880077058918, + "grad_norm": 1.0851402282714844, + "learning_rate": 0.00017294463537727024, + "loss": 0.6383, + "step": 3159 + }, + { + "epoch": 1.0146090865307433, + "grad_norm": 0.8400055766105652, + "learning_rate": 0.000172920753502292, + "loss": 0.6843, + "step": 3160 + }, + { + "epoch": 1.0149301653555949, + "grad_norm": 1.1264251470565796, + "learning_rate": 0.00017289686274214118, + "loss": 0.5577, + "step": 3161 + }, + { + "epoch": 1.0152512441804462, + "grad_norm": 1.5338242053985596, + "learning_rate": 0.0001728729630997287, + "loss": 0.4795, + "step": 3162 + }, + { + "epoch": 1.0155723230052978, + "grad_norm": 0.7459766864776611, + "learning_rate": 0.00017284905457796675, + "loss": 0.4932, + "step": 3163 + }, + { + "epoch": 1.0158934018301493, + "grad_norm": 0.6073582172393799, + "learning_rate": 0.00017282513717976848, + "loss": 0.3484, + "step": 3164 + }, + { + "epoch": 1.0162144806550009, + "grad_norm": 0.8098936080932617, + "learning_rate": 0.00017280121090804812, + "loss": 0.4744, + "step": 3165 + }, + { + "epoch": 1.0165355594798524, + "grad_norm": 1.1062911748886108, + "learning_rate": 0.00017277727576572107, + "loss": 1.023, + "step": 3166 + }, + { + "epoch": 1.0168566383047037, + "grad_norm": 1.3004356622695923, + "learning_rate": 0.00017275333175570368, + "loss": 0.7562, + "step": 3167 + }, + { + "epoch": 1.0171777171295553, + "grad_norm": 1.5146620273590088, + "learning_rate": 0.00017272937888091353, + "loss": 0.6177, + "step": 3168 + }, + { + "epoch": 1.0174987959544068, + "grad_norm": 1.1251643896102905, + "learning_rate": 0.0001727054171442692, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.0178198747792584, + "grad_norm": 0.8808386921882629, + "learning_rate": 0.0001726814465486903, + "loss": 0.2652, + "step": 3170 + }, + { + "epoch": 1.0181409536041097, + "grad_norm": 1.0438605546951294, + "learning_rate": 0.0001726574670970976, + "loss": 0.5482, + "step": 3171 + }, + { + "epoch": 1.0184620324289613, + "grad_norm": 3.0426223278045654, + "learning_rate": 0.00017263347879241291, + "loss": 0.7843, + "step": 3172 + }, + { + "epoch": 1.0187831112538128, + "grad_norm": 0.8068621754646301, + "learning_rate": 0.00017260948163755918, + "loss": 0.7201, + "step": 3173 + }, + { + "epoch": 1.0191041900786644, + "grad_norm": 0.9241558313369751, + "learning_rate": 0.00017258547563546038, + "loss": 0.719, + "step": 3174 + }, + { + "epoch": 1.019425268903516, + "grad_norm": 0.9264689087867737, + "learning_rate": 0.00017256146078904153, + "loss": 0.8149, + "step": 3175 + }, + { + "epoch": 1.0197463477283673, + "grad_norm": 0.7384179830551147, + "learning_rate": 0.00017253743710122875, + "loss": 0.5689, + "step": 3176 + }, + { + "epoch": 1.0200674265532188, + "grad_norm": 1.2575445175170898, + "learning_rate": 0.00017251340457494934, + "loss": 0.6949, + "step": 3177 + }, + { + "epoch": 1.0203885053780704, + "grad_norm": 0.8369306921958923, + "learning_rate": 0.0001724893632131315, + "loss": 0.6599, + "step": 3178 + }, + { + "epoch": 1.020709584202922, + "grad_norm": 1.2006936073303223, + "learning_rate": 0.0001724653130187047, + "loss": 0.6771, + "step": 3179 + }, + { + "epoch": 1.0210306630277732, + "grad_norm": 0.7915051579475403, + "learning_rate": 0.00017244125399459926, + "loss": 0.7149, + "step": 3180 + }, + { + "epoch": 1.0213517418526248, + "grad_norm": 1.273621916770935, + "learning_rate": 0.00017241718614374678, + "loss": 0.7439, + "step": 3181 + }, + { + "epoch": 1.0216728206774763, + "grad_norm": 0.8545572757720947, + "learning_rate": 0.0001723931094690798, + "loss": 0.72, + "step": 3182 + }, + { + "epoch": 1.0219938995023279, + "grad_norm": 0.8133922815322876, + "learning_rate": 0.00017236902397353205, + "loss": 0.5812, + "step": 3183 + }, + { + "epoch": 1.0223149783271792, + "grad_norm": 1.2312626838684082, + "learning_rate": 0.00017234492966003824, + "loss": 0.6452, + "step": 3184 + }, + { + "epoch": 1.0226360571520308, + "grad_norm": 1.0506107807159424, + "learning_rate": 0.00017232082653153422, + "loss": 0.697, + "step": 3185 + }, + { + "epoch": 1.0229571359768823, + "grad_norm": 0.9486799240112305, + "learning_rate": 0.00017229671459095683, + "loss": 0.8227, + "step": 3186 + }, + { + "epoch": 1.0232782148017339, + "grad_norm": 0.8300420641899109, + "learning_rate": 0.0001722725938412441, + "loss": 0.7616, + "step": 3187 + }, + { + "epoch": 1.0235992936265854, + "grad_norm": 1.1895195245742798, + "learning_rate": 0.00017224846428533499, + "loss": 0.8887, + "step": 3188 + }, + { + "epoch": 1.0239203724514367, + "grad_norm": 0.9756418466567993, + "learning_rate": 0.0001722243259261697, + "loss": 0.5706, + "step": 3189 + }, + { + "epoch": 1.0242414512762883, + "grad_norm": 0.9104772210121155, + "learning_rate": 0.00017220017876668934, + "loss": 0.6564, + "step": 3190 + }, + { + "epoch": 1.0245625301011398, + "grad_norm": 0.8529773950576782, + "learning_rate": 0.00017217602280983623, + "loss": 0.664, + "step": 3191 + }, + { + "epoch": 1.0248836089259914, + "grad_norm": 1.0890889167785645, + "learning_rate": 0.00017215185805855368, + "loss": 0.5597, + "step": 3192 + }, + { + "epoch": 1.0252046877508427, + "grad_norm": 0.9177720546722412, + "learning_rate": 0.00017212768451578607, + "loss": 0.8042, + "step": 3193 + }, + { + "epoch": 1.0255257665756943, + "grad_norm": 0.8690738677978516, + "learning_rate": 0.00017210350218447887, + "loss": 0.6965, + "step": 3194 + }, + { + "epoch": 1.0258468454005458, + "grad_norm": 0.9098244309425354, + "learning_rate": 0.00017207931106757868, + "loss": 0.5547, + "step": 3195 + }, + { + "epoch": 1.0261679242253974, + "grad_norm": 0.8864629864692688, + "learning_rate": 0.00017205511116803306, + "loss": 0.6666, + "step": 3196 + }, + { + "epoch": 1.026489003050249, + "grad_norm": 0.9058293700218201, + "learning_rate": 0.0001720309024887907, + "loss": 0.6675, + "step": 3197 + }, + { + "epoch": 1.0268100818751003, + "grad_norm": 1.372377634048462, + "learning_rate": 0.00017200668503280136, + "loss": 0.7582, + "step": 3198 + }, + { + "epoch": 1.0271311606999518, + "grad_norm": 0.7774912118911743, + "learning_rate": 0.0001719824588030159, + "loss": 0.6099, + "step": 3199 + }, + { + "epoch": 1.0274522395248034, + "grad_norm": 1.0164647102355957, + "learning_rate": 0.00017195822380238615, + "loss": 0.7993, + "step": 3200 + }, + { + "epoch": 1.027773318349655, + "grad_norm": 0.9077091813087463, + "learning_rate": 0.0001719339800338651, + "loss": 0.7304, + "step": 3201 + }, + { + "epoch": 1.0280943971745062, + "grad_norm": 1.8159345388412476, + "learning_rate": 0.00017190972750040682, + "loss": 0.6752, + "step": 3202 + }, + { + "epoch": 1.0284154759993578, + "grad_norm": 1.1675589084625244, + "learning_rate": 0.00017188546620496635, + "loss": 0.8066, + "step": 3203 + }, + { + "epoch": 1.0287365548242093, + "grad_norm": 1.103464126586914, + "learning_rate": 0.00017186119615049988, + "loss": 0.6484, + "step": 3204 + }, + { + "epoch": 1.0290576336490609, + "grad_norm": 1.758217692375183, + "learning_rate": 0.00017183691733996462, + "loss": 0.6094, + "step": 3205 + }, + { + "epoch": 1.0293787124739124, + "grad_norm": 1.3336718082427979, + "learning_rate": 0.00017181262977631888, + "loss": 0.6994, + "step": 3206 + }, + { + "epoch": 1.0296997912987638, + "grad_norm": 1.3006914854049683, + "learning_rate": 0.00017178833346252206, + "loss": 0.5225, + "step": 3207 + }, + { + "epoch": 1.0300208701236153, + "grad_norm": 1.329729437828064, + "learning_rate": 0.00017176402840153455, + "loss": 0.6402, + "step": 3208 + }, + { + "epoch": 1.0303419489484669, + "grad_norm": 1.2530442476272583, + "learning_rate": 0.00017173971459631787, + "loss": 0.6543, + "step": 3209 + }, + { + "epoch": 1.0306630277733184, + "grad_norm": 1.1140762567520142, + "learning_rate": 0.0001717153920498346, + "loss": 0.5683, + "step": 3210 + }, + { + "epoch": 1.0309841065981697, + "grad_norm": 1.3410987854003906, + "learning_rate": 0.0001716910607650483, + "loss": 0.5983, + "step": 3211 + }, + { + "epoch": 1.0313051854230213, + "grad_norm": 1.056822419166565, + "learning_rate": 0.0001716667207449237, + "loss": 0.5871, + "step": 3212 + }, + { + "epoch": 1.0316262642478728, + "grad_norm": 1.11257004737854, + "learning_rate": 0.0001716423719924266, + "loss": 0.4527, + "step": 3213 + }, + { + "epoch": 1.0319473430727244, + "grad_norm": 1.4045213460922241, + "learning_rate": 0.00017161801451052376, + "loss": 0.7051, + "step": 3214 + }, + { + "epoch": 1.032268421897576, + "grad_norm": 0.6990142464637756, + "learning_rate": 0.00017159364830218312, + "loss": 0.69, + "step": 3215 + }, + { + "epoch": 1.0325895007224273, + "grad_norm": 0.6937003135681152, + "learning_rate": 0.0001715692733703736, + "loss": 0.9796, + "step": 3216 + }, + { + "epoch": 1.0329105795472788, + "grad_norm": 0.7836512923240662, + "learning_rate": 0.00017154488971806518, + "loss": 0.5325, + "step": 3217 + }, + { + "epoch": 1.0332316583721304, + "grad_norm": 0.7977432012557983, + "learning_rate": 0.00017152049734822902, + "loss": 0.5356, + "step": 3218 + }, + { + "epoch": 1.033552737196982, + "grad_norm": 0.8344630599021912, + "learning_rate": 0.00017149609626383717, + "loss": 0.529, + "step": 3219 + }, + { + "epoch": 1.0338738160218333, + "grad_norm": 0.9146289229393005, + "learning_rate": 0.00017147168646786286, + "loss": 0.4378, + "step": 3220 + }, + { + "epoch": 1.0341948948466848, + "grad_norm": 0.8820130825042725, + "learning_rate": 0.00017144726796328034, + "loss": 0.4912, + "step": 3221 + }, + { + "epoch": 1.0345159736715364, + "grad_norm": 0.8433278203010559, + "learning_rate": 0.00017142284075306497, + "loss": 0.5559, + "step": 3222 + }, + { + "epoch": 1.034837052496388, + "grad_norm": 1.3478479385375977, + "learning_rate": 0.0001713984048401931, + "loss": 0.8736, + "step": 3223 + }, + { + "epoch": 1.0351581313212395, + "grad_norm": 1.062957763671875, + "learning_rate": 0.00017137396022764214, + "loss": 0.7858, + "step": 3224 + }, + { + "epoch": 1.0354792101460908, + "grad_norm": 1.3038783073425293, + "learning_rate": 0.00017134950691839065, + "loss": 0.8089, + "step": 3225 + }, + { + "epoch": 1.0358002889709423, + "grad_norm": 1.0557838678359985, + "learning_rate": 0.00017132504491541818, + "loss": 0.5426, + "step": 3226 + }, + { + "epoch": 1.036121367795794, + "grad_norm": 0.7496936321258545, + "learning_rate": 0.0001713005742217053, + "loss": 0.5222, + "step": 3227 + }, + { + "epoch": 1.0364424466206454, + "grad_norm": 0.9153050184249878, + "learning_rate": 0.00017127609484023377, + "loss": 0.6843, + "step": 3228 + }, + { + "epoch": 1.0367635254454968, + "grad_norm": 1.0914545059204102, + "learning_rate": 0.00017125160677398626, + "loss": 0.5231, + "step": 3229 + }, + { + "epoch": 1.0370846042703483, + "grad_norm": 0.7990017533302307, + "learning_rate": 0.0001712271100259466, + "loss": 0.6105, + "step": 3230 + }, + { + "epoch": 1.0374056830951999, + "grad_norm": 0.8752117156982422, + "learning_rate": 0.00017120260459909967, + "loss": 0.6104, + "step": 3231 + }, + { + "epoch": 1.0377267619200514, + "grad_norm": 1.1408075094223022, + "learning_rate": 0.0001711780904964313, + "loss": 0.8112, + "step": 3232 + }, + { + "epoch": 1.038047840744903, + "grad_norm": 0.7575783133506775, + "learning_rate": 0.00017115356772092857, + "loss": 0.4848, + "step": 3233 + }, + { + "epoch": 1.0383689195697543, + "grad_norm": 1.2850323915481567, + "learning_rate": 0.0001711290362755794, + "loss": 0.7803, + "step": 3234 + }, + { + "epoch": 1.0386899983946059, + "grad_norm": 0.9859423637390137, + "learning_rate": 0.00017110449616337289, + "loss": 0.6785, + "step": 3235 + }, + { + "epoch": 1.0390110772194574, + "grad_norm": 1.224280834197998, + "learning_rate": 0.00017107994738729926, + "loss": 0.806, + "step": 3236 + }, + { + "epoch": 1.039332156044309, + "grad_norm": 1.1364513635635376, + "learning_rate": 0.00017105538995034963, + "loss": 0.8497, + "step": 3237 + }, + { + "epoch": 1.0396532348691603, + "grad_norm": 0.712236225605011, + "learning_rate": 0.00017103082385551627, + "loss": 0.4612, + "step": 3238 + }, + { + "epoch": 1.0399743136940118, + "grad_norm": 0.8277900218963623, + "learning_rate": 0.0001710062491057925, + "loss": 0.6745, + "step": 3239 + }, + { + "epoch": 1.0402953925188634, + "grad_norm": 1.1629518270492554, + "learning_rate": 0.00017098166570417262, + "loss": 0.7966, + "step": 3240 + }, + { + "epoch": 1.040616471343715, + "grad_norm": 1.143298864364624, + "learning_rate": 0.0001709570736536521, + "loss": 0.5971, + "step": 3241 + }, + { + "epoch": 1.0409375501685665, + "grad_norm": 1.7243444919586182, + "learning_rate": 0.0001709324729572274, + "loss": 0.521, + "step": 3242 + }, + { + "epoch": 1.0412586289934178, + "grad_norm": 0.7155652046203613, + "learning_rate": 0.000170907863617896, + "loss": 0.5353, + "step": 3243 + }, + { + "epoch": 1.0415797078182694, + "grad_norm": 0.8861052393913269, + "learning_rate": 0.00017088324563865656, + "loss": 0.7525, + "step": 3244 + }, + { + "epoch": 1.041900786643121, + "grad_norm": 0.786766529083252, + "learning_rate": 0.00017085861902250865, + "loss": 0.6192, + "step": 3245 + }, + { + "epoch": 1.0422218654679725, + "grad_norm": 1.0128397941589355, + "learning_rate": 0.0001708339837724529, + "loss": 0.6272, + "step": 3246 + }, + { + "epoch": 1.0425429442928238, + "grad_norm": 0.9892904162406921, + "learning_rate": 0.0001708093398914911, + "loss": 0.5604, + "step": 3247 + }, + { + "epoch": 1.0428640231176753, + "grad_norm": 1.106776237487793, + "learning_rate": 0.00017078468738262602, + "loss": 0.7397, + "step": 3248 + }, + { + "epoch": 1.043185101942527, + "grad_norm": 1.0783487558364868, + "learning_rate": 0.00017076002624886154, + "loss": 0.6305, + "step": 3249 + }, + { + "epoch": 1.0435061807673784, + "grad_norm": 1.2297683954238892, + "learning_rate": 0.00017073535649320248, + "loss": 0.6252, + "step": 3250 + }, + { + "epoch": 1.04382725959223, + "grad_norm": 0.9655210375785828, + "learning_rate": 0.00017071067811865476, + "loss": 0.6704, + "step": 3251 + }, + { + "epoch": 1.0441483384170813, + "grad_norm": 1.2158546447753906, + "learning_rate": 0.00017068599112822543, + "loss": 0.6869, + "step": 3252 + }, + { + "epoch": 1.0444694172419329, + "grad_norm": 1.0447654724121094, + "learning_rate": 0.0001706612955249225, + "loss": 0.4427, + "step": 3253 + }, + { + "epoch": 1.0447904960667844, + "grad_norm": 0.8590002059936523, + "learning_rate": 0.000170636591311755, + "loss": 0.5222, + "step": 3254 + }, + { + "epoch": 1.045111574891636, + "grad_norm": 1.114013433456421, + "learning_rate": 0.00017061187849173317, + "loss": 0.5776, + "step": 3255 + }, + { + "epoch": 1.0454326537164873, + "grad_norm": 0.8874313235282898, + "learning_rate": 0.0001705871570678681, + "loss": 0.6153, + "step": 3256 + }, + { + "epoch": 1.0457537325413389, + "grad_norm": 1.2078914642333984, + "learning_rate": 0.0001705624270431721, + "loss": 0.7155, + "step": 3257 + }, + { + "epoch": 1.0460748113661904, + "grad_norm": 0.9867888689041138, + "learning_rate": 0.00017053768842065833, + "loss": 0.5113, + "step": 3258 + }, + { + "epoch": 1.046395890191042, + "grad_norm": 0.6994266510009766, + "learning_rate": 0.00017051294120334125, + "loss": 0.5073, + "step": 3259 + }, + { + "epoch": 1.0467169690158935, + "grad_norm": 1.1606996059417725, + "learning_rate": 0.00017048818539423615, + "loss": 0.5518, + "step": 3260 + }, + { + "epoch": 1.0470380478407448, + "grad_norm": 0.9645611643791199, + "learning_rate": 0.00017046342099635948, + "loss": 0.5303, + "step": 3261 + }, + { + "epoch": 1.0473591266655964, + "grad_norm": 0.6704840660095215, + "learning_rate": 0.00017043864801272868, + "loss": 0.4778, + "step": 3262 + }, + { + "epoch": 1.047680205490448, + "grad_norm": 0.7522701025009155, + "learning_rate": 0.0001704138664463623, + "loss": 0.5107, + "step": 3263 + }, + { + "epoch": 1.0480012843152995, + "grad_norm": 1.1835343837738037, + "learning_rate": 0.00017038907630027988, + "loss": 0.516, + "step": 3264 + }, + { + "epoch": 1.0483223631401508, + "grad_norm": 0.9107168316841125, + "learning_rate": 0.00017036427757750205, + "loss": 0.7626, + "step": 3265 + }, + { + "epoch": 1.0486434419650024, + "grad_norm": 0.9731441736221313, + "learning_rate": 0.00017033947028105039, + "loss": 0.848, + "step": 3266 + }, + { + "epoch": 1.048964520789854, + "grad_norm": 1.073034644126892, + "learning_rate": 0.00017031465441394767, + "loss": 0.875, + "step": 3267 + }, + { + "epoch": 1.0492855996147055, + "grad_norm": 1.114713430404663, + "learning_rate": 0.00017028982997921758, + "loss": 0.5253, + "step": 3268 + }, + { + "epoch": 1.049606678439557, + "grad_norm": 1.0419694185256958, + "learning_rate": 0.00017026499697988493, + "loss": 0.6738, + "step": 3269 + }, + { + "epoch": 1.0499277572644083, + "grad_norm": 1.0044914484024048, + "learning_rate": 0.0001702401554189755, + "loss": 0.4802, + "step": 3270 + }, + { + "epoch": 1.05024883608926, + "grad_norm": 0.8162968158721924, + "learning_rate": 0.00017021530529951625, + "loss": 0.3764, + "step": 3271 + }, + { + "epoch": 1.0505699149141114, + "grad_norm": 0.8983839154243469, + "learning_rate": 0.000170190446624535, + "loss": 0.5214, + "step": 3272 + }, + { + "epoch": 1.050890993738963, + "grad_norm": 1.0330827236175537, + "learning_rate": 0.00017016557939706075, + "loss": 0.7508, + "step": 3273 + }, + { + "epoch": 1.0512120725638143, + "grad_norm": 0.8711283802986145, + "learning_rate": 0.00017014070362012348, + "loss": 0.6736, + "step": 3274 + }, + { + "epoch": 1.0515331513886659, + "grad_norm": 0.8156806230545044, + "learning_rate": 0.00017011581929675425, + "loss": 0.633, + "step": 3275 + }, + { + "epoch": 1.0518542302135174, + "grad_norm": 0.9561333060264587, + "learning_rate": 0.0001700909264299851, + "loss": 0.7371, + "step": 3276 + }, + { + "epoch": 1.052175309038369, + "grad_norm": 0.7813916802406311, + "learning_rate": 0.00017006602502284918, + "loss": 0.5109, + "step": 3277 + }, + { + "epoch": 1.0524963878632203, + "grad_norm": 0.8601284027099609, + "learning_rate": 0.00017004111507838064, + "loss": 0.5188, + "step": 3278 + }, + { + "epoch": 1.0528174666880719, + "grad_norm": 1.1225234270095825, + "learning_rate": 0.00017001619659961467, + "loss": 0.6945, + "step": 3279 + }, + { + "epoch": 1.0531385455129234, + "grad_norm": 1.0475960969924927, + "learning_rate": 0.00016999126958958756, + "loss": 0.6678, + "step": 3280 + }, + { + "epoch": 1.053459624337775, + "grad_norm": 1.015073537826538, + "learning_rate": 0.00016996633405133655, + "loss": 0.8029, + "step": 3281 + }, + { + "epoch": 1.0537807031626265, + "grad_norm": 1.1307599544525146, + "learning_rate": 0.00016994138998789997, + "loss": 0.6215, + "step": 3282 + }, + { + "epoch": 1.0541017819874778, + "grad_norm": 1.1362276077270508, + "learning_rate": 0.00016991643740231716, + "loss": 0.7888, + "step": 3283 + }, + { + "epoch": 1.0544228608123294, + "grad_norm": 1.2660646438598633, + "learning_rate": 0.0001698914762976285, + "loss": 0.7849, + "step": 3284 + }, + { + "epoch": 1.054743939637181, + "grad_norm": 0.9536210298538208, + "learning_rate": 0.00016986650667687552, + "loss": 0.6164, + "step": 3285 + }, + { + "epoch": 1.0550650184620325, + "grad_norm": 0.9557814598083496, + "learning_rate": 0.0001698415285431006, + "loss": 0.6736, + "step": 3286 + }, + { + "epoch": 1.0553860972868838, + "grad_norm": 1.0140166282653809, + "learning_rate": 0.00016981654189934727, + "loss": 0.6405, + "step": 3287 + }, + { + "epoch": 1.0557071761117354, + "grad_norm": 0.9973203539848328, + "learning_rate": 0.0001697915467486601, + "loss": 0.7855, + "step": 3288 + }, + { + "epoch": 1.056028254936587, + "grad_norm": 1.143325924873352, + "learning_rate": 0.00016976654309408464, + "loss": 0.8278, + "step": 3289 + }, + { + "epoch": 1.0563493337614385, + "grad_norm": 1.1182677745819092, + "learning_rate": 0.00016974153093866757, + "loss": 0.6487, + "step": 3290 + }, + { + "epoch": 1.05667041258629, + "grad_norm": 0.8752659559249878, + "learning_rate": 0.00016971651028545648, + "loss": 0.7054, + "step": 3291 + }, + { + "epoch": 1.0569914914111413, + "grad_norm": 1.3598425388336182, + "learning_rate": 0.00016969148113750007, + "loss": 0.8206, + "step": 3292 + }, + { + "epoch": 1.057312570235993, + "grad_norm": 0.8707671165466309, + "learning_rate": 0.00016966644349784808, + "loss": 0.6105, + "step": 3293 + }, + { + "epoch": 1.0576336490608444, + "grad_norm": 0.9644749760627747, + "learning_rate": 0.0001696413973695513, + "loss": 0.5752, + "step": 3294 + }, + { + "epoch": 1.057954727885696, + "grad_norm": 0.894107460975647, + "learning_rate": 0.00016961634275566146, + "loss": 0.6371, + "step": 3295 + }, + { + "epoch": 1.0582758067105473, + "grad_norm": 1.123781442642212, + "learning_rate": 0.00016959127965923142, + "loss": 0.6209, + "step": 3296 + }, + { + "epoch": 1.0585968855353989, + "grad_norm": 1.2167694568634033, + "learning_rate": 0.0001695662080833151, + "loss": 0.576, + "step": 3297 + }, + { + "epoch": 1.0589179643602504, + "grad_norm": 1.0875262022018433, + "learning_rate": 0.00016954112803096728, + "loss": 0.6831, + "step": 3298 + }, + { + "epoch": 1.059239043185102, + "grad_norm": 0.9846530556678772, + "learning_rate": 0.000169516039505244, + "loss": 0.6525, + "step": 3299 + }, + { + "epoch": 1.0595601220099535, + "grad_norm": 0.9327017664909363, + "learning_rate": 0.00016949094250920217, + "loss": 0.5837, + "step": 3300 + }, + { + "epoch": 1.0598812008348049, + "grad_norm": 0.8693040013313293, + "learning_rate": 0.00016946583704589973, + "loss": 0.7224, + "step": 3301 + }, + { + "epoch": 1.0602022796596564, + "grad_norm": 1.8250339031219482, + "learning_rate": 0.00016944072311839581, + "loss": 0.6624, + "step": 3302 + }, + { + "epoch": 1.060523358484508, + "grad_norm": 1.6133804321289062, + "learning_rate": 0.0001694156007297504, + "loss": 0.6358, + "step": 3303 + }, + { + "epoch": 1.0608444373093595, + "grad_norm": 1.2496116161346436, + "learning_rate": 0.00016939046988302458, + "loss": 0.788, + "step": 3304 + }, + { + "epoch": 1.0611655161342108, + "grad_norm": 0.8245418071746826, + "learning_rate": 0.0001693653305812805, + "loss": 0.6231, + "step": 3305 + }, + { + "epoch": 1.0614865949590624, + "grad_norm": 0.9475066661834717, + "learning_rate": 0.0001693401828275813, + "loss": 0.6326, + "step": 3306 + }, + { + "epoch": 1.061807673783914, + "grad_norm": 0.8447438478469849, + "learning_rate": 0.00016931502662499118, + "loss": 0.4383, + "step": 3307 + }, + { + "epoch": 1.0621287526087655, + "grad_norm": 0.9479712247848511, + "learning_rate": 0.00016928986197657525, + "loss": 0.4397, + "step": 3308 + }, + { + "epoch": 1.062449831433617, + "grad_norm": 1.2100830078125, + "learning_rate": 0.0001692646888853999, + "loss": 0.6239, + "step": 3309 + }, + { + "epoch": 1.0627709102584684, + "grad_norm": 1.2680039405822754, + "learning_rate": 0.00016923950735453226, + "loss": 0.5405, + "step": 3310 + }, + { + "epoch": 1.06309198908332, + "grad_norm": 0.9997744560241699, + "learning_rate": 0.0001692143173870407, + "loss": 0.5493, + "step": 3311 + }, + { + "epoch": 1.0634130679081715, + "grad_norm": 0.9026280641555786, + "learning_rate": 0.0001691891189859945, + "loss": 0.5545, + "step": 3312 + }, + { + "epoch": 1.063734146733023, + "grad_norm": 1.112392544746399, + "learning_rate": 0.000169163912154464, + "loss": 0.6452, + "step": 3313 + }, + { + "epoch": 1.0640552255578744, + "grad_norm": 0.7569609880447388, + "learning_rate": 0.00016913869689552064, + "loss": 0.4284, + "step": 3314 + }, + { + "epoch": 1.064376304382726, + "grad_norm": 0.8292409181594849, + "learning_rate": 0.0001691134732122368, + "loss": 0.5371, + "step": 3315 + }, + { + "epoch": 1.0646973832075775, + "grad_norm": 0.8496055006980896, + "learning_rate": 0.00016908824110768584, + "loss": 1.1306, + "step": 3316 + }, + { + "epoch": 1.065018462032429, + "grad_norm": 0.9334166049957275, + "learning_rate": 0.00016906300058494228, + "loss": 0.619, + "step": 3317 + }, + { + "epoch": 1.0653395408572806, + "grad_norm": 0.9362530708312988, + "learning_rate": 0.00016903775164708163, + "loss": 0.6312, + "step": 3318 + }, + { + "epoch": 1.0656606196821319, + "grad_norm": 0.7941862344741821, + "learning_rate": 0.00016901249429718032, + "loss": 0.5325, + "step": 3319 + }, + { + "epoch": 1.0659816985069834, + "grad_norm": 0.8351782560348511, + "learning_rate": 0.00016898722853831593, + "loss": 0.4099, + "step": 3320 + }, + { + "epoch": 1.066302777331835, + "grad_norm": 2.7632505893707275, + "learning_rate": 0.000168961954373567, + "loss": 0.6414, + "step": 3321 + }, + { + "epoch": 1.0666238561566865, + "grad_norm": 1.1203904151916504, + "learning_rate": 0.00016893667180601312, + "loss": 0.7519, + "step": 3322 + }, + { + "epoch": 1.0669449349815379, + "grad_norm": 1.0219404697418213, + "learning_rate": 0.00016891138083873487, + "loss": 0.6174, + "step": 3323 + }, + { + "epoch": 1.0672660138063894, + "grad_norm": 0.9879265427589417, + "learning_rate": 0.00016888608147481388, + "loss": 0.636, + "step": 3324 + }, + { + "epoch": 1.067587092631241, + "grad_norm": 0.7659774422645569, + "learning_rate": 0.00016886077371733283, + "loss": 0.5238, + "step": 3325 + }, + { + "epoch": 1.0679081714560925, + "grad_norm": 0.9733467698097229, + "learning_rate": 0.0001688354575693754, + "loss": 0.7139, + "step": 3326 + }, + { + "epoch": 1.068229250280944, + "grad_norm": 0.7653968334197998, + "learning_rate": 0.0001688101330340263, + "loss": 0.4363, + "step": 3327 + }, + { + "epoch": 1.0685503291057954, + "grad_norm": 1.0636018514633179, + "learning_rate": 0.0001687848001143711, + "loss": 0.7243, + "step": 3328 + }, + { + "epoch": 1.068871407930647, + "grad_norm": 1.1847553253173828, + "learning_rate": 0.00016875945881349676, + "loss": 0.7352, + "step": 3329 + }, + { + "epoch": 1.0691924867554985, + "grad_norm": 1.2986525297164917, + "learning_rate": 0.00016873410913449091, + "loss": 0.7651, + "step": 3330 + }, + { + "epoch": 1.06951356558035, + "grad_norm": 0.921399712562561, + "learning_rate": 0.0001687087510804423, + "loss": 0.6639, + "step": 3331 + }, + { + "epoch": 1.0698346444052014, + "grad_norm": 0.900202751159668, + "learning_rate": 0.00016868338465444085, + "loss": 0.6414, + "step": 3332 + }, + { + "epoch": 1.070155723230053, + "grad_norm": 0.7372778654098511, + "learning_rate": 0.00016865800985957726, + "loss": 0.518, + "step": 3333 + }, + { + "epoch": 1.0704768020549045, + "grad_norm": 1.0439753532409668, + "learning_rate": 0.0001686326266989435, + "loss": 0.7912, + "step": 3334 + }, + { + "epoch": 1.070797880879756, + "grad_norm": 0.9654948711395264, + "learning_rate": 0.0001686072351756323, + "loss": 0.7365, + "step": 3335 + }, + { + "epoch": 1.0711189597046076, + "grad_norm": 0.9803206324577332, + "learning_rate": 0.00016858183529273765, + "loss": 0.5448, + "step": 3336 + }, + { + "epoch": 1.071440038529459, + "grad_norm": 0.9181073307991028, + "learning_rate": 0.00016855642705335437, + "loss": 0.6209, + "step": 3337 + }, + { + "epoch": 1.0717611173543105, + "grad_norm": 1.7982044219970703, + "learning_rate": 0.0001685310104605784, + "loss": 0.6342, + "step": 3338 + }, + { + "epoch": 1.072082196179162, + "grad_norm": 1.0803323984146118, + "learning_rate": 0.0001685055855175067, + "loss": 0.8042, + "step": 3339 + }, + { + "epoch": 1.0724032750040136, + "grad_norm": 0.936269223690033, + "learning_rate": 0.0001684801522272372, + "loss": 0.589, + "step": 3340 + }, + { + "epoch": 1.0727243538288649, + "grad_norm": 0.9068983197212219, + "learning_rate": 0.00016845471059286887, + "loss": 0.5997, + "step": 3341 + }, + { + "epoch": 1.0730454326537164, + "grad_norm": 1.2561771869659424, + "learning_rate": 0.0001684292606175017, + "loss": 0.6738, + "step": 3342 + }, + { + "epoch": 1.073366511478568, + "grad_norm": 2.2465908527374268, + "learning_rate": 0.00016840380230423668, + "loss": 0.7389, + "step": 3343 + }, + { + "epoch": 1.0736875903034195, + "grad_norm": 1.6561983823776245, + "learning_rate": 0.0001683783356561759, + "loss": 0.8363, + "step": 3344 + }, + { + "epoch": 1.074008669128271, + "grad_norm": 0.843169629573822, + "learning_rate": 0.00016835286067642228, + "loss": 0.6381, + "step": 3345 + }, + { + "epoch": 1.0743297479531224, + "grad_norm": 0.7259941697120667, + "learning_rate": 0.00016832737736807994, + "loss": 0.5703, + "step": 3346 + }, + { + "epoch": 1.074650826777974, + "grad_norm": 1.018009901046753, + "learning_rate": 0.00016830188573425389, + "loss": 0.7034, + "step": 3347 + }, + { + "epoch": 1.0749719056028255, + "grad_norm": 0.8844952583312988, + "learning_rate": 0.00016827638577805026, + "loss": 0.6634, + "step": 3348 + }, + { + "epoch": 1.075292984427677, + "grad_norm": 0.9090389609336853, + "learning_rate": 0.0001682508775025762, + "loss": 0.5809, + "step": 3349 + }, + { + "epoch": 1.0756140632525284, + "grad_norm": 0.8599204421043396, + "learning_rate": 0.00016822536091093965, + "loss": 0.6162, + "step": 3350 + }, + { + "epoch": 1.07593514207738, + "grad_norm": 1.0319000482559204, + "learning_rate": 0.00016819983600624986, + "loss": 0.6512, + "step": 3351 + }, + { + "epoch": 1.0762562209022315, + "grad_norm": 0.8953613042831421, + "learning_rate": 0.0001681743027916169, + "loss": 0.4827, + "step": 3352 + }, + { + "epoch": 1.076577299727083, + "grad_norm": 0.7418705821037292, + "learning_rate": 0.000168148761270152, + "loss": 0.4673, + "step": 3353 + }, + { + "epoch": 1.0768983785519346, + "grad_norm": 1.4244052171707153, + "learning_rate": 0.0001681232114449672, + "loss": 0.728, + "step": 3354 + }, + { + "epoch": 1.077219457376786, + "grad_norm": 0.9894974827766418, + "learning_rate": 0.00016809765331917575, + "loss": 0.6679, + "step": 3355 + }, + { + "epoch": 1.0775405362016375, + "grad_norm": 0.8581247925758362, + "learning_rate": 0.0001680720868958918, + "loss": 0.5517, + "step": 3356 + }, + { + "epoch": 1.077861615026489, + "grad_norm": 0.8083166480064392, + "learning_rate": 0.00016804651217823053, + "loss": 0.5014, + "step": 3357 + }, + { + "epoch": 1.0781826938513406, + "grad_norm": 1.293942928314209, + "learning_rate": 0.00016802092916930818, + "loss": 0.724, + "step": 3358 + }, + { + "epoch": 1.078503772676192, + "grad_norm": 0.8754002451896667, + "learning_rate": 0.00016799533787224192, + "loss": 0.5268, + "step": 3359 + }, + { + "epoch": 1.0788248515010435, + "grad_norm": 0.9752049446105957, + "learning_rate": 0.00016796973829015, + "loss": 0.5621, + "step": 3360 + }, + { + "epoch": 1.079145930325895, + "grad_norm": 0.9150457978248596, + "learning_rate": 0.00016794413042615168, + "loss": 0.4815, + "step": 3361 + }, + { + "epoch": 1.0794670091507466, + "grad_norm": 0.992889940738678, + "learning_rate": 0.00016791851428336711, + "loss": 0.5026, + "step": 3362 + }, + { + "epoch": 1.079788087975598, + "grad_norm": 0.6349382996559143, + "learning_rate": 0.00016789288986491762, + "loss": 0.3832, + "step": 3363 + }, + { + "epoch": 1.0801091668004494, + "grad_norm": 0.9240784645080566, + "learning_rate": 0.00016786725717392545, + "loss": 0.4269, + "step": 3364 + }, + { + "epoch": 1.080430245625301, + "grad_norm": 0.9478742480278015, + "learning_rate": 0.00016784161621351382, + "loss": 0.9252, + "step": 3365 + }, + { + "epoch": 1.0807513244501525, + "grad_norm": 0.8908936381340027, + "learning_rate": 0.0001678159669868071, + "loss": 0.9994, + "step": 3366 + }, + { + "epoch": 1.081072403275004, + "grad_norm": 0.7920323610305786, + "learning_rate": 0.00016779030949693044, + "loss": 0.5493, + "step": 3367 + }, + { + "epoch": 1.0813934820998554, + "grad_norm": 1.6557127237319946, + "learning_rate": 0.00016776464374701025, + "loss": 0.638, + "step": 3368 + }, + { + "epoch": 1.081714560924707, + "grad_norm": 0.9681761860847473, + "learning_rate": 0.00016773896974017373, + "loss": 0.5296, + "step": 3369 + }, + { + "epoch": 1.0820356397495585, + "grad_norm": 0.812778651714325, + "learning_rate": 0.00016771328747954925, + "loss": 0.3148, + "step": 3370 + }, + { + "epoch": 1.08235671857441, + "grad_norm": 0.8901128768920898, + "learning_rate": 0.00016768759696826608, + "loss": 0.6074, + "step": 3371 + }, + { + "epoch": 1.0826777973992616, + "grad_norm": 1.0377326011657715, + "learning_rate": 0.00016766189820945456, + "loss": 0.5944, + "step": 3372 + }, + { + "epoch": 1.082998876224113, + "grad_norm": 0.9749375581741333, + "learning_rate": 0.00016763619120624594, + "loss": 0.6133, + "step": 3373 + }, + { + "epoch": 1.0833199550489645, + "grad_norm": 1.1712234020233154, + "learning_rate": 0.00016761047596177263, + "loss": 0.735, + "step": 3374 + }, + { + "epoch": 1.083641033873816, + "grad_norm": 0.9546951651573181, + "learning_rate": 0.00016758475247916787, + "loss": 0.6357, + "step": 3375 + }, + { + "epoch": 1.0839621126986676, + "grad_norm": 1.6145797967910767, + "learning_rate": 0.00016755902076156604, + "loss": 0.8267, + "step": 3376 + }, + { + "epoch": 1.084283191523519, + "grad_norm": 0.9280446767807007, + "learning_rate": 0.00016753328081210245, + "loss": 0.7201, + "step": 3377 + }, + { + "epoch": 1.0846042703483705, + "grad_norm": 0.7923296689987183, + "learning_rate": 0.00016750753263391346, + "loss": 0.6021, + "step": 3378 + }, + { + "epoch": 1.084925349173222, + "grad_norm": 0.9804905652999878, + "learning_rate": 0.00016748177623013638, + "loss": 0.544, + "step": 3379 + }, + { + "epoch": 1.0852464279980736, + "grad_norm": 0.815140962600708, + "learning_rate": 0.00016745601160390958, + "loss": 0.6108, + "step": 3380 + }, + { + "epoch": 1.0855675068229251, + "grad_norm": 0.842607855796814, + "learning_rate": 0.00016743023875837233, + "loss": 0.6777, + "step": 3381 + }, + { + "epoch": 1.0858885856477765, + "grad_norm": 0.7987306118011475, + "learning_rate": 0.00016740445769666509, + "loss": 0.672, + "step": 3382 + }, + { + "epoch": 1.086209664472628, + "grad_norm": 0.8180364966392517, + "learning_rate": 0.0001673786684219291, + "loss": 0.5932, + "step": 3383 + }, + { + "epoch": 1.0865307432974796, + "grad_norm": 1.0452377796173096, + "learning_rate": 0.00016735287093730676, + "loss": 0.5845, + "step": 3384 + }, + { + "epoch": 1.0868518221223311, + "grad_norm": 1.3078027963638306, + "learning_rate": 0.00016732706524594137, + "loss": 0.632, + "step": 3385 + }, + { + "epoch": 1.0871729009471824, + "grad_norm": 1.3132222890853882, + "learning_rate": 0.00016730125135097735, + "loss": 0.5767, + "step": 3386 + }, + { + "epoch": 1.087493979772034, + "grad_norm": 1.1473840475082397, + "learning_rate": 0.00016727542925555996, + "loss": 0.7189, + "step": 3387 + }, + { + "epoch": 1.0878150585968855, + "grad_norm": 1.0360926389694214, + "learning_rate": 0.00016724959896283559, + "loss": 0.6505, + "step": 3388 + }, + { + "epoch": 1.088136137421737, + "grad_norm": 0.9349396228790283, + "learning_rate": 0.00016722376047595164, + "loss": 0.7034, + "step": 3389 + }, + { + "epoch": 1.0884572162465886, + "grad_norm": 1.2072162628173828, + "learning_rate": 0.0001671979137980563, + "loss": 0.6266, + "step": 3390 + }, + { + "epoch": 1.08877829507144, + "grad_norm": 1.2379400730133057, + "learning_rate": 0.00016717205893229903, + "loss": 0.6909, + "step": 3391 + }, + { + "epoch": 1.0890993738962915, + "grad_norm": 0.8755012154579163, + "learning_rate": 0.00016714619588183014, + "loss": 0.7037, + "step": 3392 + }, + { + "epoch": 1.089420452721143, + "grad_norm": 1.0528205633163452, + "learning_rate": 0.00016712032464980095, + "loss": 0.6186, + "step": 3393 + }, + { + "epoch": 1.0897415315459946, + "grad_norm": 0.9427685141563416, + "learning_rate": 0.0001670944452393638, + "loss": 0.7993, + "step": 3394 + }, + { + "epoch": 1.090062610370846, + "grad_norm": 0.9150214791297913, + "learning_rate": 0.000167068557653672, + "loss": 0.739, + "step": 3395 + }, + { + "epoch": 1.0903836891956975, + "grad_norm": 2.4165000915527344, + "learning_rate": 0.0001670426618958799, + "loss": 0.6721, + "step": 3396 + }, + { + "epoch": 1.090704768020549, + "grad_norm": 0.821657121181488, + "learning_rate": 0.00016701675796914286, + "loss": 0.6333, + "step": 3397 + }, + { + "epoch": 1.0910258468454006, + "grad_norm": 0.8281720876693726, + "learning_rate": 0.0001669908458766171, + "loss": 0.6536, + "step": 3398 + }, + { + "epoch": 1.0913469256702522, + "grad_norm": 0.7724263668060303, + "learning_rate": 0.00016696492562145996, + "loss": 0.5401, + "step": 3399 + }, + { + "epoch": 1.0916680044951035, + "grad_norm": 1.0343469381332397, + "learning_rate": 0.00016693899720682977, + "loss": 0.6455, + "step": 3400 + }, + { + "epoch": 1.091989083319955, + "grad_norm": 0.9886199831962585, + "learning_rate": 0.00016691306063588583, + "loss": 0.4821, + "step": 3401 + }, + { + "epoch": 1.0923101621448066, + "grad_norm": 0.8337252736091614, + "learning_rate": 0.00016688711591178842, + "loss": 0.5141, + "step": 3402 + }, + { + "epoch": 1.0926312409696581, + "grad_norm": 0.8266648054122925, + "learning_rate": 0.00016686116303769882, + "loss": 0.5598, + "step": 3403 + }, + { + "epoch": 1.0929523197945095, + "grad_norm": 0.9328741431236267, + "learning_rate": 0.0001668352020167793, + "loss": 0.7067, + "step": 3404 + }, + { + "epoch": 1.093273398619361, + "grad_norm": 1.163390040397644, + "learning_rate": 0.00016680923285219317, + "loss": 0.6493, + "step": 3405 + }, + { + "epoch": 1.0935944774442126, + "grad_norm": 1.0952308177947998, + "learning_rate": 0.00016678325554710468, + "loss": 0.6423, + "step": 3406 + }, + { + "epoch": 1.0939155562690641, + "grad_norm": 1.341279149055481, + "learning_rate": 0.00016675727010467906, + "loss": 0.751, + "step": 3407 + }, + { + "epoch": 1.0942366350939157, + "grad_norm": 0.7064843773841858, + "learning_rate": 0.0001667312765280826, + "loss": 0.4315, + "step": 3408 + }, + { + "epoch": 1.094557713918767, + "grad_norm": 1.5312671661376953, + "learning_rate": 0.00016670527482048246, + "loss": 0.5555, + "step": 3409 + }, + { + "epoch": 1.0948787927436185, + "grad_norm": 0.8741896152496338, + "learning_rate": 0.00016667926498504696, + "loss": 0.573, + "step": 3410 + }, + { + "epoch": 1.09519987156847, + "grad_norm": 0.6594826579093933, + "learning_rate": 0.00016665324702494524, + "loss": 0.4327, + "step": 3411 + }, + { + "epoch": 1.0955209503933216, + "grad_norm": 0.7997058033943176, + "learning_rate": 0.0001666272209433476, + "loss": 0.4166, + "step": 3412 + }, + { + "epoch": 1.095842029218173, + "grad_norm": 1.526983380317688, + "learning_rate": 0.00016660118674342517, + "loss": 0.6392, + "step": 3413 + }, + { + "epoch": 1.0961631080430245, + "grad_norm": 0.7481285929679871, + "learning_rate": 0.00016657514442835014, + "loss": 0.4221, + "step": 3414 + }, + { + "epoch": 1.096484186867876, + "grad_norm": 0.7330780625343323, + "learning_rate": 0.00016654909400129575, + "loss": 0.8458, + "step": 3415 + }, + { + "epoch": 1.0968052656927276, + "grad_norm": 0.798095703125, + "learning_rate": 0.00016652303546543608, + "loss": 1.141, + "step": 3416 + }, + { + "epoch": 1.0971263445175792, + "grad_norm": 1.091076374053955, + "learning_rate": 0.00016649696882394633, + "loss": 0.6236, + "step": 3417 + }, + { + "epoch": 1.0974474233424305, + "grad_norm": 1.0143516063690186, + "learning_rate": 0.00016647089408000266, + "loss": 0.5477, + "step": 3418 + }, + { + "epoch": 1.097768502167282, + "grad_norm": 0.8810601234436035, + "learning_rate": 0.00016644481123678217, + "loss": 0.3834, + "step": 3419 + }, + { + "epoch": 1.0980895809921336, + "grad_norm": 1.0156632661819458, + "learning_rate": 0.00016641872029746297, + "loss": 0.3836, + "step": 3420 + }, + { + "epoch": 1.0984106598169852, + "grad_norm": 1.0656507015228271, + "learning_rate": 0.00016639262126522418, + "loss": 0.5248, + "step": 3421 + }, + { + "epoch": 1.0987317386418365, + "grad_norm": 0.9195857048034668, + "learning_rate": 0.00016636651414324587, + "loss": 0.3556, + "step": 3422 + }, + { + "epoch": 1.099052817466688, + "grad_norm": 1.0913866758346558, + "learning_rate": 0.00016634039893470912, + "loss": 0.8998, + "step": 3423 + }, + { + "epoch": 1.0993738962915396, + "grad_norm": 1.01368248462677, + "learning_rate": 0.000166314275642796, + "loss": 0.8026, + "step": 3424 + }, + { + "epoch": 1.0996949751163911, + "grad_norm": 0.8792101740837097, + "learning_rate": 0.00016628814427068953, + "loss": 0.7112, + "step": 3425 + }, + { + "epoch": 1.1000160539412427, + "grad_norm": 1.1131765842437744, + "learning_rate": 0.00016626200482157378, + "loss": 0.7527, + "step": 3426 + }, + { + "epoch": 1.100337132766094, + "grad_norm": 0.7965254783630371, + "learning_rate": 0.00016623585729863368, + "loss": 0.5228, + "step": 3427 + }, + { + "epoch": 1.1006582115909456, + "grad_norm": 0.7073742747306824, + "learning_rate": 0.00016620970170505534, + "loss": 0.5549, + "step": 3428 + }, + { + "epoch": 1.1009792904157971, + "grad_norm": 1.1967341899871826, + "learning_rate": 0.00016618353804402568, + "loss": 0.8142, + "step": 3429 + }, + { + "epoch": 1.1013003692406487, + "grad_norm": 1.0835646390914917, + "learning_rate": 0.00016615736631873262, + "loss": 0.7728, + "step": 3430 + }, + { + "epoch": 1.1016214480655, + "grad_norm": 0.8416442275047302, + "learning_rate": 0.00016613118653236518, + "loss": 0.5987, + "step": 3431 + }, + { + "epoch": 1.1019425268903515, + "grad_norm": 3.7327215671539307, + "learning_rate": 0.00016610499868811328, + "loss": 0.8418, + "step": 3432 + }, + { + "epoch": 1.102263605715203, + "grad_norm": 0.9980570077896118, + "learning_rate": 0.00016607880278916777, + "loss": 0.5245, + "step": 3433 + }, + { + "epoch": 1.1025846845400546, + "grad_norm": 0.9758525490760803, + "learning_rate": 0.0001660525988387206, + "loss": 0.6669, + "step": 3434 + }, + { + "epoch": 1.1029057633649062, + "grad_norm": 1.14530611038208, + "learning_rate": 0.00016602638683996463, + "loss": 0.7346, + "step": 3435 + }, + { + "epoch": 1.1032268421897575, + "grad_norm": 1.0736594200134277, + "learning_rate": 0.0001660001667960937, + "loss": 0.8106, + "step": 3436 + }, + { + "epoch": 1.103547921014609, + "grad_norm": 1.2492066621780396, + "learning_rate": 0.00016597393871030264, + "loss": 0.6462, + "step": 3437 + }, + { + "epoch": 1.1038689998394606, + "grad_norm": 1.3556599617004395, + "learning_rate": 0.0001659477025857872, + "loss": 0.8159, + "step": 3438 + }, + { + "epoch": 1.1041900786643122, + "grad_norm": 0.9458972811698914, + "learning_rate": 0.00016592145842574433, + "loss": 0.6553, + "step": 3439 + }, + { + "epoch": 1.1045111574891635, + "grad_norm": 1.0294822454452515, + "learning_rate": 0.0001658952062333717, + "loss": 0.6278, + "step": 3440 + }, + { + "epoch": 1.104832236314015, + "grad_norm": 1.8948813676834106, + "learning_rate": 0.00016586894601186805, + "loss": 0.7584, + "step": 3441 + }, + { + "epoch": 1.1051533151388666, + "grad_norm": 1.0939890146255493, + "learning_rate": 0.00016584267776443317, + "loss": 0.7916, + "step": 3442 + }, + { + "epoch": 1.1054743939637182, + "grad_norm": 1.3582258224487305, + "learning_rate": 0.00016581640149426768, + "loss": 0.6503, + "step": 3443 + }, + { + "epoch": 1.1057954727885697, + "grad_norm": 1.082184910774231, + "learning_rate": 0.00016579011720457333, + "loss": 0.6415, + "step": 3444 + }, + { + "epoch": 1.106116551613421, + "grad_norm": 0.8474713563919067, + "learning_rate": 0.00016576382489855274, + "loss": 0.5673, + "step": 3445 + }, + { + "epoch": 1.1064376304382726, + "grad_norm": 0.9523148536682129, + "learning_rate": 0.0001657375245794096, + "loss": 0.6361, + "step": 3446 + }, + { + "epoch": 1.1067587092631241, + "grad_norm": 1.021826982498169, + "learning_rate": 0.00016571121625034847, + "loss": 0.7183, + "step": 3447 + }, + { + "epoch": 1.1070797880879757, + "grad_norm": 1.3967097997665405, + "learning_rate": 0.00016568489991457497, + "loss": 0.7846, + "step": 3448 + }, + { + "epoch": 1.107400866912827, + "grad_norm": 0.7168581485748291, + "learning_rate": 0.00016565857557529566, + "loss": 0.4492, + "step": 3449 + }, + { + "epoch": 1.1077219457376786, + "grad_norm": 0.7148603200912476, + "learning_rate": 0.00016563224323571806, + "loss": 0.4595, + "step": 3450 + }, + { + "epoch": 1.1080430245625301, + "grad_norm": 1.2017083168029785, + "learning_rate": 0.00016560590289905073, + "loss": 0.5782, + "step": 3451 + }, + { + "epoch": 1.1083641033873817, + "grad_norm": 0.8505964279174805, + "learning_rate": 0.00016557955456850313, + "loss": 0.5918, + "step": 3452 + }, + { + "epoch": 1.108685182212233, + "grad_norm": 1.3165613412857056, + "learning_rate": 0.00016555319824728575, + "loss": 0.6645, + "step": 3453 + }, + { + "epoch": 1.1090062610370846, + "grad_norm": 0.9442883729934692, + "learning_rate": 0.00016552683393860997, + "loss": 0.6143, + "step": 3454 + }, + { + "epoch": 1.109327339861936, + "grad_norm": 1.0663771629333496, + "learning_rate": 0.00016550046164568827, + "loss": 0.5655, + "step": 3455 + }, + { + "epoch": 1.1096484186867877, + "grad_norm": 0.8024634122848511, + "learning_rate": 0.00016547408137173396, + "loss": 0.4356, + "step": 3456 + }, + { + "epoch": 1.1099694975116392, + "grad_norm": 1.0509380102157593, + "learning_rate": 0.00016544769311996148, + "loss": 0.5454, + "step": 3457 + }, + { + "epoch": 1.1102905763364905, + "grad_norm": 1.2511743307113647, + "learning_rate": 0.00016542129689358612, + "loss": 0.7829, + "step": 3458 + }, + { + "epoch": 1.110611655161342, + "grad_norm": 0.8298420906066895, + "learning_rate": 0.00016539489269582416, + "loss": 0.5684, + "step": 3459 + }, + { + "epoch": 1.1109327339861936, + "grad_norm": 0.8872331380844116, + "learning_rate": 0.00016536848052989291, + "loss": 0.6568, + "step": 3460 + }, + { + "epoch": 1.1112538128110452, + "grad_norm": 1.0351537466049194, + "learning_rate": 0.00016534206039901057, + "loss": 0.5094, + "step": 3461 + }, + { + "epoch": 1.1115748916358965, + "grad_norm": 0.9417381286621094, + "learning_rate": 0.00016531563230639637, + "loss": 0.4403, + "step": 3462 + }, + { + "epoch": 1.111895970460748, + "grad_norm": 1.471907615661621, + "learning_rate": 0.0001652891962552705, + "loss": 0.5832, + "step": 3463 + }, + { + "epoch": 1.1122170492855996, + "grad_norm": 0.8464019894599915, + "learning_rate": 0.00016526275224885411, + "loss": 0.4628, + "step": 3464 + }, + { + "epoch": 1.1125381281104512, + "grad_norm": 0.7151861190795898, + "learning_rate": 0.00016523630029036931, + "loss": 0.6354, + "step": 3465 + }, + { + "epoch": 1.1128592069353027, + "grad_norm": 0.8638876080513, + "learning_rate": 0.00016520984038303924, + "loss": 0.9545, + "step": 3466 + }, + { + "epoch": 1.113180285760154, + "grad_norm": 0.7509087920188904, + "learning_rate": 0.0001651833725300879, + "loss": 0.8626, + "step": 3467 + }, + { + "epoch": 1.1135013645850056, + "grad_norm": 0.7917301058769226, + "learning_rate": 0.00016515689673474033, + "loss": 0.5097, + "step": 3468 + }, + { + "epoch": 1.1138224434098571, + "grad_norm": 1.0008916854858398, + "learning_rate": 0.00016513041300022255, + "loss": 0.4359, + "step": 3469 + }, + { + "epoch": 1.1141435222347087, + "grad_norm": 1.335162878036499, + "learning_rate": 0.0001651039213297615, + "loss": 0.3395, + "step": 3470 + }, + { + "epoch": 1.11446460105956, + "grad_norm": 0.8309035897254944, + "learning_rate": 0.0001650774217265851, + "loss": 0.4172, + "step": 3471 + }, + { + "epoch": 1.1147856798844116, + "grad_norm": 1.0509357452392578, + "learning_rate": 0.00016505091419392228, + "loss": 0.9272, + "step": 3472 + }, + { + "epoch": 1.1151067587092631, + "grad_norm": 1.047833800315857, + "learning_rate": 0.00016502439873500289, + "loss": 0.8088, + "step": 3473 + }, + { + "epoch": 1.1154278375341147, + "grad_norm": 0.9323020577430725, + "learning_rate": 0.00016499787535305776, + "loss": 0.713, + "step": 3474 + }, + { + "epoch": 1.1157489163589662, + "grad_norm": 0.902013897895813, + "learning_rate": 0.00016497134405131866, + "loss": 0.644, + "step": 3475 + }, + { + "epoch": 1.1160699951838176, + "grad_norm": 0.8789449334144592, + "learning_rate": 0.00016494480483301836, + "loss": 0.6663, + "step": 3476 + }, + { + "epoch": 1.116391074008669, + "grad_norm": 0.7771266102790833, + "learning_rate": 0.00016491825770139062, + "loss": 0.7559, + "step": 3477 + }, + { + "epoch": 1.1167121528335207, + "grad_norm": 0.9561981558799744, + "learning_rate": 0.00016489170265967008, + "loss": 0.7239, + "step": 3478 + }, + { + "epoch": 1.1170332316583722, + "grad_norm": 0.9018192887306213, + "learning_rate": 0.00016486513971109243, + "loss": 0.8175, + "step": 3479 + }, + { + "epoch": 1.1173543104832235, + "grad_norm": 1.1967219114303589, + "learning_rate": 0.0001648385688588942, + "loss": 0.7613, + "step": 3480 + }, + { + "epoch": 1.117675389308075, + "grad_norm": 0.9554547667503357, + "learning_rate": 0.0001648119901063131, + "loss": 0.4639, + "step": 3481 + }, + { + "epoch": 1.1179964681329266, + "grad_norm": 1.2637308835983276, + "learning_rate": 0.00016478540345658759, + "loss": 0.7384, + "step": 3482 + }, + { + "epoch": 1.1183175469577782, + "grad_norm": 0.9255183935165405, + "learning_rate": 0.00016475880891295716, + "loss": 0.7321, + "step": 3483 + }, + { + "epoch": 1.1186386257826297, + "grad_norm": 1.0068658590316772, + "learning_rate": 0.0001647322064786623, + "loss": 0.6394, + "step": 3484 + }, + { + "epoch": 1.118959704607481, + "grad_norm": 1.1133503913879395, + "learning_rate": 0.00016470559615694446, + "loss": 0.713, + "step": 3485 + }, + { + "epoch": 1.1192807834323326, + "grad_norm": 0.8424758911132812, + "learning_rate": 0.000164678977951046, + "loss": 0.692, + "step": 3486 + }, + { + "epoch": 1.1196018622571842, + "grad_norm": 1.4674954414367676, + "learning_rate": 0.0001646523518642102, + "loss": 0.7578, + "step": 3487 + }, + { + "epoch": 1.1199229410820357, + "grad_norm": 1.0364352464675903, + "learning_rate": 0.00016462571789968152, + "loss": 0.6651, + "step": 3488 + }, + { + "epoch": 1.120244019906887, + "grad_norm": 0.8905995488166809, + "learning_rate": 0.0001645990760607051, + "loss": 0.5398, + "step": 3489 + }, + { + "epoch": 1.1205650987317386, + "grad_norm": 0.9402886629104614, + "learning_rate": 0.00016457242635052724, + "loss": 0.6423, + "step": 3490 + }, + { + "epoch": 1.1208861775565901, + "grad_norm": 1.093013882637024, + "learning_rate": 0.00016454576877239507, + "loss": 0.629, + "step": 3491 + }, + { + "epoch": 1.1212072563814417, + "grad_norm": 1.6441494226455688, + "learning_rate": 0.00016451910332955679, + "loss": 0.8024, + "step": 3492 + }, + { + "epoch": 1.1215283352062932, + "grad_norm": 1.1009982824325562, + "learning_rate": 0.00016449243002526144, + "loss": 0.6887, + "step": 3493 + }, + { + "epoch": 1.1218494140311446, + "grad_norm": 0.9255393743515015, + "learning_rate": 0.00016446574886275913, + "loss": 0.6147, + "step": 3494 + }, + { + "epoch": 1.1221704928559961, + "grad_norm": 1.203102469444275, + "learning_rate": 0.0001644390598453009, + "loss": 0.7035, + "step": 3495 + }, + { + "epoch": 1.1224915716808477, + "grad_norm": 1.5952414274215698, + "learning_rate": 0.00016441236297613866, + "loss": 0.753, + "step": 3496 + }, + { + "epoch": 1.1228126505056992, + "grad_norm": 1.0285307168960571, + "learning_rate": 0.0001643856582585254, + "loss": 0.6581, + "step": 3497 + }, + { + "epoch": 1.1231337293305506, + "grad_norm": 1.1580029726028442, + "learning_rate": 0.00016435894569571496, + "loss": 0.75, + "step": 3498 + }, + { + "epoch": 1.123454808155402, + "grad_norm": 2.223442792892456, + "learning_rate": 0.0001643322252909622, + "loss": 0.751, + "step": 3499 + }, + { + "epoch": 1.1237758869802537, + "grad_norm": 1.224037528038025, + "learning_rate": 0.00016430549704752294, + "loss": 0.6832, + "step": 3500 + }, + { + "epoch": 1.1240969658051052, + "grad_norm": 0.9943028092384338, + "learning_rate": 0.00016427876096865394, + "loss": 0.639, + "step": 3501 + }, + { + "epoch": 1.1244180446299565, + "grad_norm": 1.723598599433899, + "learning_rate": 0.00016425201705761288, + "loss": 0.7863, + "step": 3502 + }, + { + "epoch": 1.124739123454808, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.00016422526531765846, + "loss": 0.53, + "step": 3503 + }, + { + "epoch": 1.1250602022796596, + "grad_norm": 0.9602160453796387, + "learning_rate": 0.00016419850575205024, + "loss": 0.4901, + "step": 3504 + }, + { + "epoch": 1.1253812811045112, + "grad_norm": 0.8210904002189636, + "learning_rate": 0.00016417173836404887, + "loss": 0.4663, + "step": 3505 + }, + { + "epoch": 1.1257023599293627, + "grad_norm": 1.026781439781189, + "learning_rate": 0.00016414496315691581, + "loss": 0.6553, + "step": 3506 + }, + { + "epoch": 1.126023438754214, + "grad_norm": 0.9842190146446228, + "learning_rate": 0.00016411818013391355, + "loss": 0.4576, + "step": 3507 + }, + { + "epoch": 1.1263445175790656, + "grad_norm": 0.8347097635269165, + "learning_rate": 0.00016409138929830553, + "loss": 0.478, + "step": 3508 + }, + { + "epoch": 1.1266655964039172, + "grad_norm": 1.0178107023239136, + "learning_rate": 0.00016406459065335615, + "loss": 0.621, + "step": 3509 + }, + { + "epoch": 1.1269866752287687, + "grad_norm": 0.8920689225196838, + "learning_rate": 0.00016403778420233075, + "loss": 0.4913, + "step": 3510 + }, + { + "epoch": 1.12730775405362, + "grad_norm": 0.9431245923042297, + "learning_rate": 0.00016401096994849557, + "loss": 0.5671, + "step": 3511 + }, + { + "epoch": 1.1276288328784716, + "grad_norm": 1.142845869064331, + "learning_rate": 0.00016398414789511786, + "loss": 0.6351, + "step": 3512 + }, + { + "epoch": 1.1279499117033231, + "grad_norm": 0.4318004250526428, + "learning_rate": 0.0001639573180454658, + "loss": 0.3718, + "step": 3513 + }, + { + "epoch": 1.1282709905281747, + "grad_norm": 0.7192524671554565, + "learning_rate": 0.00016393048040280855, + "loss": 0.411, + "step": 3514 + }, + { + "epoch": 1.1285920693530263, + "grad_norm": 0.7344934344291687, + "learning_rate": 0.00016390363497041622, + "loss": 0.7704, + "step": 3515 + }, + { + "epoch": 1.1289131481778776, + "grad_norm": 0.746320366859436, + "learning_rate": 0.00016387678175155978, + "loss": 1.1028, + "step": 3516 + }, + { + "epoch": 1.1292342270027291, + "grad_norm": 0.9115157127380371, + "learning_rate": 0.00016384992074951123, + "loss": 1.1808, + "step": 3517 + }, + { + "epoch": 1.1295553058275807, + "grad_norm": 1.0850014686584473, + "learning_rate": 0.00016382305196754356, + "loss": 0.6813, + "step": 3518 + }, + { + "epoch": 1.1298763846524322, + "grad_norm": 1.0016835927963257, + "learning_rate": 0.00016379617540893056, + "loss": 0.5984, + "step": 3519 + }, + { + "epoch": 1.1301974634772836, + "grad_norm": 0.8297609090805054, + "learning_rate": 0.0001637692910769471, + "loss": 0.3999, + "step": 3520 + }, + { + "epoch": 1.130518542302135, + "grad_norm": 0.8285101056098938, + "learning_rate": 0.000163742398974869, + "loss": 0.4791, + "step": 3521 + }, + { + "epoch": 1.1308396211269867, + "grad_norm": 0.9352186918258667, + "learning_rate": 0.00016371549910597287, + "loss": 0.5488, + "step": 3522 + }, + { + "epoch": 1.1311606999518382, + "grad_norm": 1.118675947189331, + "learning_rate": 0.0001636885914735365, + "loss": 0.7107, + "step": 3523 + }, + { + "epoch": 1.1314817787766898, + "grad_norm": 0.8197215795516968, + "learning_rate": 0.00016366167608083843, + "loss": 0.6317, + "step": 3524 + }, + { + "epoch": 1.131802857601541, + "grad_norm": 0.8849331736564636, + "learning_rate": 0.00016363475293115824, + "loss": 0.7617, + "step": 3525 + }, + { + "epoch": 1.1321239364263926, + "grad_norm": 1.0144376754760742, + "learning_rate": 0.0001636078220277764, + "loss": 0.7364, + "step": 3526 + }, + { + "epoch": 1.1324450152512442, + "grad_norm": 0.8314408659934998, + "learning_rate": 0.00016358088337397442, + "loss": 0.759, + "step": 3527 + }, + { + "epoch": 1.1327660940760957, + "grad_norm": 0.8000319004058838, + "learning_rate": 0.00016355393697303465, + "loss": 0.6728, + "step": 3528 + }, + { + "epoch": 1.133087172900947, + "grad_norm": 0.9634177684783936, + "learning_rate": 0.00016352698282824044, + "loss": 0.7623, + "step": 3529 + }, + { + "epoch": 1.1334082517257986, + "grad_norm": 0.7649347186088562, + "learning_rate": 0.00016350002094287609, + "loss": 0.5196, + "step": 3530 + }, + { + "epoch": 1.1337293305506502, + "grad_norm": 0.8428080081939697, + "learning_rate": 0.00016347305132022677, + "loss": 0.6598, + "step": 3531 + }, + { + "epoch": 1.1340504093755017, + "grad_norm": 0.9351022243499756, + "learning_rate": 0.0001634460739635787, + "loss": 0.667, + "step": 3532 + }, + { + "epoch": 1.1343714882003533, + "grad_norm": 0.854694128036499, + "learning_rate": 0.00016341908887621895, + "loss": 0.7199, + "step": 3533 + }, + { + "epoch": 1.1346925670252046, + "grad_norm": 0.8774638175964355, + "learning_rate": 0.00016339209606143563, + "loss": 0.4986, + "step": 3534 + }, + { + "epoch": 1.1350136458500562, + "grad_norm": 0.8318768739700317, + "learning_rate": 0.00016336509552251766, + "loss": 0.7203, + "step": 3535 + }, + { + "epoch": 1.1353347246749077, + "grad_norm": 1.0216394662857056, + "learning_rate": 0.000163338087262755, + "loss": 0.7023, + "step": 3536 + }, + { + "epoch": 1.1356558034997593, + "grad_norm": 1.4008989334106445, + "learning_rate": 0.00016331107128543857, + "loss": 0.8295, + "step": 3537 + }, + { + "epoch": 1.1359768823246106, + "grad_norm": 0.8423093557357788, + "learning_rate": 0.00016328404759386014, + "loss": 0.6274, + "step": 3538 + }, + { + "epoch": 1.1362979611494621, + "grad_norm": 1.1864322423934937, + "learning_rate": 0.00016325701619131246, + "loss": 0.6257, + "step": 3539 + }, + { + "epoch": 1.1366190399743137, + "grad_norm": 1.03271484375, + "learning_rate": 0.00016322997708108922, + "loss": 0.712, + "step": 3540 + }, + { + "epoch": 1.1369401187991652, + "grad_norm": 0.9949981570243835, + "learning_rate": 0.0001632029302664851, + "loss": 0.7167, + "step": 3541 + }, + { + "epoch": 1.1372611976240168, + "grad_norm": 0.9992634057998657, + "learning_rate": 0.00016317587575079563, + "loss": 0.6267, + "step": 3542 + }, + { + "epoch": 1.1375822764488681, + "grad_norm": 0.9867181181907654, + "learning_rate": 0.00016314881353731732, + "loss": 0.6953, + "step": 3543 + }, + { + "epoch": 1.1379033552737197, + "grad_norm": 1.025295615196228, + "learning_rate": 0.00016312174362934765, + "loss": 0.8544, + "step": 3544 + }, + { + "epoch": 1.1382244340985712, + "grad_norm": 0.9153071641921997, + "learning_rate": 0.00016309466603018496, + "loss": 0.6237, + "step": 3545 + }, + { + "epoch": 1.1385455129234228, + "grad_norm": 0.8474112749099731, + "learning_rate": 0.00016306758074312864, + "loss": 0.6815, + "step": 3546 + }, + { + "epoch": 1.138866591748274, + "grad_norm": 0.9534016847610474, + "learning_rate": 0.0001630404877714789, + "loss": 0.7636, + "step": 3547 + }, + { + "epoch": 1.1391876705731256, + "grad_norm": 1.3331577777862549, + "learning_rate": 0.00016301338711853693, + "loss": 0.8063, + "step": 3548 + }, + { + "epoch": 1.1395087493979772, + "grad_norm": 1.318224549293518, + "learning_rate": 0.00016298627878760487, + "loss": 0.7616, + "step": 3549 + }, + { + "epoch": 1.1398298282228287, + "grad_norm": 1.084550142288208, + "learning_rate": 0.00016295916278198584, + "loss": 0.7024, + "step": 3550 + }, + { + "epoch": 1.1401509070476803, + "grad_norm": 1.946920394897461, + "learning_rate": 0.00016293203910498376, + "loss": 0.8216, + "step": 3551 + }, + { + "epoch": 1.1404719858725316, + "grad_norm": 0.9112229943275452, + "learning_rate": 0.0001629049077599036, + "loss": 0.59, + "step": 3552 + }, + { + "epoch": 1.1407930646973832, + "grad_norm": 0.9161878824234009, + "learning_rate": 0.0001628777687500513, + "loss": 0.4761, + "step": 3553 + }, + { + "epoch": 1.1411141435222347, + "grad_norm": 0.8824045658111572, + "learning_rate": 0.00016285062207873355, + "loss": 0.6317, + "step": 3554 + }, + { + "epoch": 1.1414352223470863, + "grad_norm": 0.897005021572113, + "learning_rate": 0.00016282346774925817, + "loss": 0.5112, + "step": 3555 + }, + { + "epoch": 1.1417563011719376, + "grad_norm": 0.7413773536682129, + "learning_rate": 0.00016279630576493382, + "loss": 0.5975, + "step": 3556 + }, + { + "epoch": 1.1420773799967892, + "grad_norm": 0.8787861466407776, + "learning_rate": 0.00016276913612907007, + "loss": 0.508, + "step": 3557 + }, + { + "epoch": 1.1423984588216407, + "grad_norm": 1.0276175737380981, + "learning_rate": 0.0001627419588449775, + "loss": 0.6004, + "step": 3558 + }, + { + "epoch": 1.1427195376464923, + "grad_norm": 1.1329381465911865, + "learning_rate": 0.00016271477391596753, + "loss": 0.5258, + "step": 3559 + }, + { + "epoch": 1.1430406164713438, + "grad_norm": 0.9113026857376099, + "learning_rate": 0.0001626875813453526, + "loss": 0.5028, + "step": 3560 + }, + { + "epoch": 1.1433616952961951, + "grad_norm": 1.3290276527404785, + "learning_rate": 0.00016266038113644607, + "loss": 0.6505, + "step": 3561 + }, + { + "epoch": 1.1436827741210467, + "grad_norm": 0.9289746284484863, + "learning_rate": 0.00016263317329256213, + "loss": 0.449, + "step": 3562 + }, + { + "epoch": 1.1440038529458982, + "grad_norm": 1.1701115369796753, + "learning_rate": 0.00016260595781701604, + "loss": 0.6017, + "step": 3563 + }, + { + "epoch": 1.1443249317707498, + "grad_norm": 1.0579955577850342, + "learning_rate": 0.0001625787347131239, + "loss": 0.3807, + "step": 3564 + }, + { + "epoch": 1.1446460105956011, + "grad_norm": 0.7256002426147461, + "learning_rate": 0.00016255150398420271, + "loss": 0.6831, + "step": 3565 + }, + { + "epoch": 1.1449670894204527, + "grad_norm": 0.8824403882026672, + "learning_rate": 0.00016252426563357055, + "loss": 0.8835, + "step": 3566 + }, + { + "epoch": 1.1452881682453042, + "grad_norm": 1.1135727167129517, + "learning_rate": 0.00016249701966454625, + "loss": 0.4474, + "step": 3567 + }, + { + "epoch": 1.1456092470701558, + "grad_norm": 1.1596729755401611, + "learning_rate": 0.0001624697660804497, + "loss": 0.4549, + "step": 3568 + }, + { + "epoch": 1.1459303258950073, + "grad_norm": 1.1575850248336792, + "learning_rate": 0.00016244250488460158, + "loss": 0.3866, + "step": 3569 + }, + { + "epoch": 1.1462514047198586, + "grad_norm": 1.0404236316680908, + "learning_rate": 0.00016241523608032373, + "loss": 0.7547, + "step": 3570 + }, + { + "epoch": 1.1465724835447102, + "grad_norm": 1.020094633102417, + "learning_rate": 0.00016238795967093864, + "loss": 0.752, + "step": 3571 + }, + { + "epoch": 1.1468935623695617, + "grad_norm": 1.119242787361145, + "learning_rate": 0.00016236067565976992, + "loss": 0.755, + "step": 3572 + }, + { + "epoch": 1.1472146411944133, + "grad_norm": 0.830894947052002, + "learning_rate": 0.00016233338405014202, + "loss": 0.639, + "step": 3573 + }, + { + "epoch": 1.1475357200192646, + "grad_norm": 0.7884930968284607, + "learning_rate": 0.00016230608484538034, + "loss": 0.546, + "step": 3574 + }, + { + "epoch": 1.1478567988441162, + "grad_norm": 1.13563072681427, + "learning_rate": 0.00016227877804881127, + "loss": 0.7434, + "step": 3575 + }, + { + "epoch": 1.1481778776689677, + "grad_norm": 0.8976322412490845, + "learning_rate": 0.00016225146366376198, + "loss": 0.5961, + "step": 3576 + }, + { + "epoch": 1.1484989564938193, + "grad_norm": 0.8841371536254883, + "learning_rate": 0.00016222414169356065, + "loss": 0.5675, + "step": 3577 + }, + { + "epoch": 1.1488200353186708, + "grad_norm": 1.0855551958084106, + "learning_rate": 0.00016219681214153643, + "loss": 0.8424, + "step": 3578 + }, + { + "epoch": 1.1491411141435222, + "grad_norm": 1.0877726078033447, + "learning_rate": 0.0001621694750110193, + "loss": 0.8544, + "step": 3579 + }, + { + "epoch": 1.1494621929683737, + "grad_norm": 0.8400183320045471, + "learning_rate": 0.0001621421303053402, + "loss": 0.6311, + "step": 3580 + }, + { + "epoch": 1.1497832717932253, + "grad_norm": 0.8696182370185852, + "learning_rate": 0.00016211477802783103, + "loss": 0.5355, + "step": 3581 + }, + { + "epoch": 1.1501043506180768, + "grad_norm": 1.010580062866211, + "learning_rate": 0.0001620874181818246, + "loss": 0.7799, + "step": 3582 + }, + { + "epoch": 1.1504254294429281, + "grad_norm": 0.9656292200088501, + "learning_rate": 0.00016206005077065458, + "loss": 0.7286, + "step": 3583 + }, + { + "epoch": 1.1507465082677797, + "grad_norm": 1.5450822114944458, + "learning_rate": 0.00016203267579765563, + "loss": 0.791, + "step": 3584 + }, + { + "epoch": 1.1510675870926312, + "grad_norm": 0.836439847946167, + "learning_rate": 0.00016200529326616328, + "loss": 0.6037, + "step": 3585 + }, + { + "epoch": 1.1513886659174828, + "grad_norm": 1.2512813806533813, + "learning_rate": 0.00016197790317951403, + "loss": 0.5621, + "step": 3586 + }, + { + "epoch": 1.1517097447423343, + "grad_norm": 1.2249475717544556, + "learning_rate": 0.00016195050554104528, + "loss": 0.7086, + "step": 3587 + }, + { + "epoch": 1.1520308235671857, + "grad_norm": 1.0801141262054443, + "learning_rate": 0.00016192310035409536, + "loss": 0.7006, + "step": 3588 + }, + { + "epoch": 1.1523519023920372, + "grad_norm": 1.20231294631958, + "learning_rate": 0.00016189568762200348, + "loss": 0.6001, + "step": 3589 + }, + { + "epoch": 1.1526729812168888, + "grad_norm": 1.2221317291259766, + "learning_rate": 0.00016186826734810979, + "loss": 0.8367, + "step": 3590 + }, + { + "epoch": 1.1529940600417403, + "grad_norm": 0.8664124011993408, + "learning_rate": 0.0001618408395357554, + "loss": 0.646, + "step": 3591 + }, + { + "epoch": 1.1533151388665916, + "grad_norm": 1.5447945594787598, + "learning_rate": 0.00016181340418828233, + "loss": 0.7546, + "step": 3592 + }, + { + "epoch": 1.1536362176914432, + "grad_norm": 0.9267856478691101, + "learning_rate": 0.00016178596130903344, + "loss": 0.7593, + "step": 3593 + }, + { + "epoch": 1.1539572965162948, + "grad_norm": 1.0048587322235107, + "learning_rate": 0.0001617585109013526, + "loss": 0.6133, + "step": 3594 + }, + { + "epoch": 1.1542783753411463, + "grad_norm": 1.1687082052230835, + "learning_rate": 0.00016173105296858452, + "loss": 0.5192, + "step": 3595 + }, + { + "epoch": 1.1545994541659979, + "grad_norm": 0.8441829085350037, + "learning_rate": 0.00016170358751407487, + "loss": 0.6674, + "step": 3596 + }, + { + "epoch": 1.1549205329908492, + "grad_norm": 0.8441125154495239, + "learning_rate": 0.00016167611454117025, + "loss": 0.5651, + "step": 3597 + }, + { + "epoch": 1.1552416118157007, + "grad_norm": 0.8785204291343689, + "learning_rate": 0.0001616486340532182, + "loss": 0.6461, + "step": 3598 + }, + { + "epoch": 1.1555626906405523, + "grad_norm": 0.8196426033973694, + "learning_rate": 0.00016162114605356703, + "loss": 0.6182, + "step": 3599 + }, + { + "epoch": 1.1558837694654038, + "grad_norm": 1.4259721040725708, + "learning_rate": 0.0001615936505455662, + "loss": 0.7115, + "step": 3600 + }, + { + "epoch": 1.1562048482902552, + "grad_norm": 1.1216281652450562, + "learning_rate": 0.0001615661475325658, + "loss": 0.5116, + "step": 3601 + }, + { + "epoch": 1.1565259271151067, + "grad_norm": 0.9022599458694458, + "learning_rate": 0.00016153863701791717, + "loss": 0.5675, + "step": 3602 + }, + { + "epoch": 1.1568470059399583, + "grad_norm": 1.1778627634048462, + "learning_rate": 0.00016151111900497225, + "loss": 0.6868, + "step": 3603 + }, + { + "epoch": 1.1571680847648098, + "grad_norm": 1.2097725868225098, + "learning_rate": 0.00016148359349708402, + "loss": 0.6071, + "step": 3604 + }, + { + "epoch": 1.1574891635896614, + "grad_norm": 1.0817495584487915, + "learning_rate": 0.00016145606049760644, + "loss": 0.871, + "step": 3605 + }, + { + "epoch": 1.1578102424145127, + "grad_norm": 1.1181918382644653, + "learning_rate": 0.00016142852000989433, + "loss": 0.6375, + "step": 3606 + }, + { + "epoch": 1.1581313212393642, + "grad_norm": 1.0407376289367676, + "learning_rate": 0.00016140097203730337, + "loss": 0.6119, + "step": 3607 + }, + { + "epoch": 1.1584524000642158, + "grad_norm": 0.9020042419433594, + "learning_rate": 0.00016137341658319023, + "loss": 0.5714, + "step": 3608 + }, + { + "epoch": 1.1587734788890673, + "grad_norm": 1.4991035461425781, + "learning_rate": 0.00016134585365091243, + "loss": 0.8278, + "step": 3609 + }, + { + "epoch": 1.1590945577139187, + "grad_norm": 1.3282997608184814, + "learning_rate": 0.00016131828324382846, + "loss": 0.5376, + "step": 3610 + }, + { + "epoch": 1.1594156365387702, + "grad_norm": 3.6863720417022705, + "learning_rate": 0.00016129070536529766, + "loss": 0.5749, + "step": 3611 + }, + { + "epoch": 1.1597367153636218, + "grad_norm": 0.7200700640678406, + "learning_rate": 0.00016126312001868033, + "loss": 0.4114, + "step": 3612 + }, + { + "epoch": 1.1600577941884733, + "grad_norm": 0.7452567219734192, + "learning_rate": 0.00016123552720733765, + "loss": 0.5016, + "step": 3613 + }, + { + "epoch": 1.1603788730133249, + "grad_norm": 0.543973982334137, + "learning_rate": 0.00016120792693463174, + "loss": 0.4231, + "step": 3614 + }, + { + "epoch": 1.1606999518381762, + "grad_norm": 0.718130886554718, + "learning_rate": 0.00016118031920392558, + "loss": 0.6174, + "step": 3615 + }, + { + "epoch": 1.1610210306630278, + "grad_norm": 0.9145506024360657, + "learning_rate": 0.00016115270401858314, + "loss": 0.9531, + "step": 3616 + }, + { + "epoch": 1.1613421094878793, + "grad_norm": 0.9525867700576782, + "learning_rate": 0.00016112508138196917, + "loss": 0.6047, + "step": 3617 + }, + { + "epoch": 1.1616631883127309, + "grad_norm": 0.9359449744224548, + "learning_rate": 0.00016109745129744946, + "loss": 0.4499, + "step": 3618 + }, + { + "epoch": 1.1619842671375822, + "grad_norm": 0.8167291283607483, + "learning_rate": 0.00016106981376839066, + "loss": 0.4337, + "step": 3619 + }, + { + "epoch": 1.1623053459624337, + "grad_norm": 1.0462777614593506, + "learning_rate": 0.00016104216879816026, + "loss": 0.5782, + "step": 3620 + }, + { + "epoch": 1.1626264247872853, + "grad_norm": 0.8575799465179443, + "learning_rate": 0.0001610145163901268, + "loss": 0.7717, + "step": 3621 + }, + { + "epoch": 1.1629475036121368, + "grad_norm": 0.911267876625061, + "learning_rate": 0.00016098685654765955, + "loss": 0.786, + "step": 3622 + }, + { + "epoch": 1.1632685824369884, + "grad_norm": 0.5476121306419373, + "learning_rate": 0.0001609591892741288, + "loss": 0.4105, + "step": 3623 + }, + { + "epoch": 1.1635896612618397, + "grad_norm": 1.0486372709274292, + "learning_rate": 0.0001609315145729058, + "loss": 0.7131, + "step": 3624 + }, + { + "epoch": 1.1639107400866913, + "grad_norm": 0.9732680320739746, + "learning_rate": 0.00016090383244736256, + "loss": 0.6468, + "step": 3625 + }, + { + "epoch": 1.1642318189115428, + "grad_norm": 0.9579371213912964, + "learning_rate": 0.00016087614290087208, + "loss": 0.651, + "step": 3626 + }, + { + "epoch": 1.1645528977363944, + "grad_norm": 0.8852554559707642, + "learning_rate": 0.0001608484459368082, + "loss": 0.734, + "step": 3627 + }, + { + "epoch": 1.1648739765612457, + "grad_norm": 1.0034711360931396, + "learning_rate": 0.00016082074155854582, + "loss": 0.6897, + "step": 3628 + }, + { + "epoch": 1.1651950553860972, + "grad_norm": 1.2633333206176758, + "learning_rate": 0.00016079302976946055, + "loss": 0.5851, + "step": 3629 + }, + { + "epoch": 1.1655161342109488, + "grad_norm": 0.9554821848869324, + "learning_rate": 0.000160765310572929, + "loss": 0.6042, + "step": 3630 + }, + { + "epoch": 1.1658372130358003, + "grad_norm": 1.3010461330413818, + "learning_rate": 0.00016073758397232868, + "loss": 0.7993, + "step": 3631 + }, + { + "epoch": 1.166158291860652, + "grad_norm": 2.045257091522217, + "learning_rate": 0.000160709849971038, + "loss": 0.8856, + "step": 3632 + }, + { + "epoch": 1.1664793706855032, + "grad_norm": 1.0188322067260742, + "learning_rate": 0.00016068210857243624, + "loss": 0.7296, + "step": 3633 + }, + { + "epoch": 1.1668004495103548, + "grad_norm": 0.7480521202087402, + "learning_rate": 0.0001606543597799036, + "loss": 0.4887, + "step": 3634 + }, + { + "epoch": 1.1671215283352063, + "grad_norm": 1.0140467882156372, + "learning_rate": 0.00016062660359682124, + "loss": 0.8066, + "step": 3635 + }, + { + "epoch": 1.1674426071600579, + "grad_norm": 1.1531801223754883, + "learning_rate": 0.0001605988400265711, + "loss": 0.7233, + "step": 3636 + }, + { + "epoch": 1.1677636859849092, + "grad_norm": 0.8829801678657532, + "learning_rate": 0.00016057106907253616, + "loss": 0.6, + "step": 3637 + }, + { + "epoch": 1.1680847648097608, + "grad_norm": 1.2912678718566895, + "learning_rate": 0.00016054329073810015, + "loss": 0.6598, + "step": 3638 + }, + { + "epoch": 1.1684058436346123, + "grad_norm": 2.6909449100494385, + "learning_rate": 0.0001605155050266478, + "loss": 0.8908, + "step": 3639 + }, + { + "epoch": 1.1687269224594639, + "grad_norm": 1.1661655902862549, + "learning_rate": 0.00016048771194156477, + "loss": 0.8248, + "step": 3640 + }, + { + "epoch": 1.1690480012843154, + "grad_norm": 0.8896608948707581, + "learning_rate": 0.0001604599114862375, + "loss": 0.6519, + "step": 3641 + }, + { + "epoch": 1.1693690801091667, + "grad_norm": 1.0213063955307007, + "learning_rate": 0.0001604321036640534, + "loss": 0.598, + "step": 3642 + }, + { + "epoch": 1.1696901589340183, + "grad_norm": 1.0079537630081177, + "learning_rate": 0.00016040428847840079, + "loss": 0.5429, + "step": 3643 + }, + { + "epoch": 1.1700112377588698, + "grad_norm": 1.1655068397521973, + "learning_rate": 0.00016037646593266883, + "loss": 0.6184, + "step": 3644 + }, + { + "epoch": 1.1703323165837214, + "grad_norm": 1.1363699436187744, + "learning_rate": 0.00016034863603024767, + "loss": 0.6852, + "step": 3645 + }, + { + "epoch": 1.1706533954085727, + "grad_norm": 2.0418121814727783, + "learning_rate": 0.00016032079877452825, + "loss": 0.6902, + "step": 3646 + }, + { + "epoch": 1.1709744742334243, + "grad_norm": 0.8260238170623779, + "learning_rate": 0.00016029295416890248, + "loss": 0.5518, + "step": 3647 + }, + { + "epoch": 1.1712955530582758, + "grad_norm": 0.9947139620780945, + "learning_rate": 0.00016026510221676311, + "loss": 0.6918, + "step": 3648 + }, + { + "epoch": 1.1716166318831274, + "grad_norm": 1.0895559787750244, + "learning_rate": 0.00016023724292150385, + "loss": 0.8454, + "step": 3649 + }, + { + "epoch": 1.171937710707979, + "grad_norm": 1.3528664112091064, + "learning_rate": 0.00016020937628651927, + "loss": 0.6834, + "step": 3650 + }, + { + "epoch": 1.1722587895328302, + "grad_norm": 1.0099211931228638, + "learning_rate": 0.00016018150231520486, + "loss": 0.5097, + "step": 3651 + }, + { + "epoch": 1.1725798683576818, + "grad_norm": 1.2328240871429443, + "learning_rate": 0.0001601536210109569, + "loss": 0.6222, + "step": 3652 + }, + { + "epoch": 1.1729009471825333, + "grad_norm": 0.7499989867210388, + "learning_rate": 0.0001601257323771727, + "loss": 0.4985, + "step": 3653 + }, + { + "epoch": 1.173222026007385, + "grad_norm": 1.5503602027893066, + "learning_rate": 0.00016009783641725034, + "loss": 0.7534, + "step": 3654 + }, + { + "epoch": 1.1735431048322362, + "grad_norm": 1.1560394763946533, + "learning_rate": 0.00016006993313458896, + "loss": 0.5315, + "step": 3655 + }, + { + "epoch": 1.1738641836570878, + "grad_norm": 1.0473573207855225, + "learning_rate": 0.00016004202253258842, + "loss": 0.7029, + "step": 3656 + }, + { + "epoch": 1.1741852624819393, + "grad_norm": 0.7268290519714355, + "learning_rate": 0.00016001410461464956, + "loss": 0.3893, + "step": 3657 + }, + { + "epoch": 1.1745063413067909, + "grad_norm": 0.8399918079376221, + "learning_rate": 0.0001599861793841741, + "loss": 0.6149, + "step": 3658 + }, + { + "epoch": 1.1748274201316424, + "grad_norm": 0.9458096027374268, + "learning_rate": 0.00015995824684456465, + "loss": 0.6191, + "step": 3659 + }, + { + "epoch": 1.1751484989564938, + "grad_norm": 0.9515113234519958, + "learning_rate": 0.00015993030699922468, + "loss": 0.6762, + "step": 3660 + }, + { + "epoch": 1.1754695777813453, + "grad_norm": 0.9861279726028442, + "learning_rate": 0.0001599023598515586, + "loss": 0.4427, + "step": 3661 + }, + { + "epoch": 1.1757906566061969, + "grad_norm": 0.9347848892211914, + "learning_rate": 0.00015987440540497167, + "loss": 0.4939, + "step": 3662 + }, + { + "epoch": 1.1761117354310484, + "grad_norm": 0.8265044093132019, + "learning_rate": 0.00015984644366287006, + "loss": 0.525, + "step": 3663 + }, + { + "epoch": 1.1764328142558997, + "grad_norm": 1.6481767892837524, + "learning_rate": 0.00015981847462866083, + "loss": 0.4591, + "step": 3664 + }, + { + "epoch": 1.1767538930807513, + "grad_norm": 0.738932728767395, + "learning_rate": 0.0001597904983057519, + "loss": 0.5946, + "step": 3665 + }, + { + "epoch": 1.1770749719056028, + "grad_norm": 0.8453285694122314, + "learning_rate": 0.00015976251469755214, + "loss": 1.0439, + "step": 3666 + }, + { + "epoch": 1.1773960507304544, + "grad_norm": 1.1691863536834717, + "learning_rate": 0.00015973452380747122, + "loss": 0.7068, + "step": 3667 + }, + { + "epoch": 1.177717129555306, + "grad_norm": 0.9889507293701172, + "learning_rate": 0.00015970652563891978, + "loss": 0.6272, + "step": 3668 + }, + { + "epoch": 1.1780382083801573, + "grad_norm": 0.9118053317070007, + "learning_rate": 0.00015967852019530926, + "loss": 0.5458, + "step": 3669 + }, + { + "epoch": 1.1783592872050088, + "grad_norm": 0.9585251808166504, + "learning_rate": 0.00015965050748005215, + "loss": 0.3483, + "step": 3670 + }, + { + "epoch": 1.1786803660298604, + "grad_norm": 0.7852297425270081, + "learning_rate": 0.0001596224874965616, + "loss": 0.3958, + "step": 3671 + }, + { + "epoch": 1.179001444854712, + "grad_norm": 1.148110270500183, + "learning_rate": 0.0001595944602482518, + "loss": 0.7532, + "step": 3672 + }, + { + "epoch": 1.1793225236795633, + "grad_norm": 1.0729384422302246, + "learning_rate": 0.0001595664257385378, + "loss": 0.7822, + "step": 3673 + }, + { + "epoch": 1.1796436025044148, + "grad_norm": 1.0979061126708984, + "learning_rate": 0.00015953838397083552, + "loss": 0.8229, + "step": 3674 + }, + { + "epoch": 1.1799646813292664, + "grad_norm": 0.9034024477005005, + "learning_rate": 0.00015951033494856175, + "loss": 0.6591, + "step": 3675 + }, + { + "epoch": 1.180285760154118, + "grad_norm": 0.832550048828125, + "learning_rate": 0.00015948227867513415, + "loss": 0.6285, + "step": 3676 + }, + { + "epoch": 1.1806068389789695, + "grad_norm": 0.8613409399986267, + "learning_rate": 0.00015945421515397133, + "loss": 0.5975, + "step": 3677 + }, + { + "epoch": 1.1809279178038208, + "grad_norm": 0.762203574180603, + "learning_rate": 0.00015942614438849275, + "loss": 0.5549, + "step": 3678 + }, + { + "epoch": 1.1812489966286723, + "grad_norm": 0.8697448968887329, + "learning_rate": 0.0001593980663821187, + "loss": 0.7776, + "step": 3679 + }, + { + "epoch": 1.1815700754535239, + "grad_norm": 0.8074071407318115, + "learning_rate": 0.00015936998113827048, + "loss": 0.6768, + "step": 3680 + }, + { + "epoch": 1.1818911542783754, + "grad_norm": 0.7951146364212036, + "learning_rate": 0.00015934188866037016, + "loss": 0.6422, + "step": 3681 + }, + { + "epoch": 1.1822122331032268, + "grad_norm": 0.7514532804489136, + "learning_rate": 0.0001593137889518407, + "loss": 0.5805, + "step": 3682 + }, + { + "epoch": 1.1825333119280783, + "grad_norm": 0.7481774687767029, + "learning_rate": 0.00015928568201610595, + "loss": 0.5634, + "step": 3683 + }, + { + "epoch": 1.1828543907529299, + "grad_norm": 1.1757893562316895, + "learning_rate": 0.00015925756785659069, + "loss": 0.8346, + "step": 3684 + }, + { + "epoch": 1.1831754695777814, + "grad_norm": 1.0502216815948486, + "learning_rate": 0.00015922944647672052, + "loss": 0.6632, + "step": 3685 + }, + { + "epoch": 1.183496548402633, + "grad_norm": 0.7584315538406372, + "learning_rate": 0.00015920131787992197, + "loss": 0.5628, + "step": 3686 + }, + { + "epoch": 1.1838176272274843, + "grad_norm": 0.8802182674407959, + "learning_rate": 0.0001591731820696224, + "loss": 0.6358, + "step": 3687 + }, + { + "epoch": 1.1841387060523358, + "grad_norm": 0.9516767263412476, + "learning_rate": 0.0001591450390492501, + "loss": 0.6534, + "step": 3688 + }, + { + "epoch": 1.1844597848771874, + "grad_norm": 0.8391969799995422, + "learning_rate": 0.0001591168888222342, + "loss": 0.6248, + "step": 3689 + }, + { + "epoch": 1.184780863702039, + "grad_norm": 1.1127198934555054, + "learning_rate": 0.00015908873139200473, + "loss": 0.6813, + "step": 3690 + }, + { + "epoch": 1.1851019425268903, + "grad_norm": 1.2551233768463135, + "learning_rate": 0.00015906056676199255, + "loss": 0.577, + "step": 3691 + }, + { + "epoch": 1.1854230213517418, + "grad_norm": 1.1656824350357056, + "learning_rate": 0.00015903239493562948, + "loss": 0.8192, + "step": 3692 + }, + { + "epoch": 1.1857441001765934, + "grad_norm": 0.7559661269187927, + "learning_rate": 0.00015900421591634814, + "loss": 0.5647, + "step": 3693 + }, + { + "epoch": 1.186065179001445, + "grad_norm": 0.8689810037612915, + "learning_rate": 0.00015897602970758206, + "loss": 0.6108, + "step": 3694 + }, + { + "epoch": 1.1863862578262965, + "grad_norm": 1.167493462562561, + "learning_rate": 0.00015894783631276567, + "loss": 0.6969, + "step": 3695 + }, + { + "epoch": 1.1867073366511478, + "grad_norm": 1.027427077293396, + "learning_rate": 0.0001589196357353342, + "loss": 0.6776, + "step": 3696 + }, + { + "epoch": 1.1870284154759994, + "grad_norm": 1.0420182943344116, + "learning_rate": 0.00015889142797872387, + "loss": 0.606, + "step": 3697 + }, + { + "epoch": 1.187349494300851, + "grad_norm": 0.8472828269004822, + "learning_rate": 0.0001588632130463717, + "loss": 0.5962, + "step": 3698 + }, + { + "epoch": 1.1876705731257025, + "grad_norm": 0.9795169830322266, + "learning_rate": 0.00015883499094171554, + "loss": 0.59, + "step": 3699 + }, + { + "epoch": 1.1879916519505538, + "grad_norm": 1.1912764310836792, + "learning_rate": 0.0001588067616681942, + "loss": 0.7445, + "step": 3700 + }, + { + "epoch": 1.1883127307754053, + "grad_norm": 1.2817906141281128, + "learning_rate": 0.00015877852522924732, + "loss": 0.447, + "step": 3701 + }, + { + "epoch": 1.1886338096002569, + "grad_norm": 1.1116948127746582, + "learning_rate": 0.00015875028162831546, + "loss": 0.7283, + "step": 3702 + }, + { + "epoch": 1.1889548884251084, + "grad_norm": 1.298180341720581, + "learning_rate": 0.00015872203086883996, + "loss": 0.6075, + "step": 3703 + }, + { + "epoch": 1.18927596724996, + "grad_norm": 1.0509965419769287, + "learning_rate": 0.00015869377295426316, + "loss": 0.5656, + "step": 3704 + }, + { + "epoch": 1.1895970460748113, + "grad_norm": 1.1119554042816162, + "learning_rate": 0.00015866550788802813, + "loss": 0.5898, + "step": 3705 + }, + { + "epoch": 1.1899181248996629, + "grad_norm": 1.1349931955337524, + "learning_rate": 0.00015863723567357892, + "loss": 0.7535, + "step": 3706 + }, + { + "epoch": 1.1902392037245144, + "grad_norm": 1.1311873197555542, + "learning_rate": 0.00015860895631436043, + "loss": 0.6378, + "step": 3707 + }, + { + "epoch": 1.1905602825493657, + "grad_norm": 1.291495680809021, + "learning_rate": 0.0001585806698138184, + "loss": 0.6692, + "step": 3708 + }, + { + "epoch": 1.1908813613742173, + "grad_norm": 0.741792619228363, + "learning_rate": 0.0001585523761753994, + "loss": 0.3739, + "step": 3709 + }, + { + "epoch": 1.1912024401990688, + "grad_norm": 1.0837442874908447, + "learning_rate": 0.00015852407540255104, + "loss": 0.5723, + "step": 3710 + }, + { + "epoch": 1.1915235190239204, + "grad_norm": 0.8327803015708923, + "learning_rate": 0.00015849576749872157, + "loss": 0.565, + "step": 3711 + }, + { + "epoch": 1.191844597848772, + "grad_norm": 0.62799072265625, + "learning_rate": 0.00015846745246736026, + "loss": 0.3977, + "step": 3712 + }, + { + "epoch": 1.1921656766736235, + "grad_norm": 0.8305913209915161, + "learning_rate": 0.00015843913031191723, + "loss": 0.4562, + "step": 3713 + }, + { + "epoch": 1.1924867554984748, + "grad_norm": 0.7770129442214966, + "learning_rate": 0.00015841080103584342, + "loss": 0.4003, + "step": 3714 + }, + { + "epoch": 1.1928078343233264, + "grad_norm": 0.9295758605003357, + "learning_rate": 0.0001583824646425907, + "loss": 0.803, + "step": 3715 + }, + { + "epoch": 1.193128913148178, + "grad_norm": 0.8279706239700317, + "learning_rate": 0.00015835412113561175, + "loss": 0.3903, + "step": 3716 + }, + { + "epoch": 1.1934499919730293, + "grad_norm": 0.8809469938278198, + "learning_rate": 0.00015832577051836015, + "loss": 0.771, + "step": 3717 + }, + { + "epoch": 1.1937710707978808, + "grad_norm": 1.3099849224090576, + "learning_rate": 0.00015829741279429035, + "loss": 0.7353, + "step": 3718 + }, + { + "epoch": 1.1940921496227324, + "grad_norm": 0.8764384388923645, + "learning_rate": 0.00015826904796685762, + "loss": 0.5436, + "step": 3719 + }, + { + "epoch": 1.194413228447584, + "grad_norm": 0.8666921257972717, + "learning_rate": 0.00015824067603951812, + "loss": 0.323, + "step": 3720 + }, + { + "epoch": 1.1947343072724355, + "grad_norm": 0.9187259674072266, + "learning_rate": 0.00015821229701572896, + "loss": 0.5078, + "step": 3721 + }, + { + "epoch": 1.1950553860972868, + "grad_norm": 0.9329090118408203, + "learning_rate": 0.00015818391089894796, + "loss": 0.4697, + "step": 3722 + }, + { + "epoch": 1.1953764649221383, + "grad_norm": 0.9224849939346313, + "learning_rate": 0.00015815551769263387, + "loss": 0.6722, + "step": 3723 + }, + { + "epoch": 1.1956975437469899, + "grad_norm": 0.9811240434646606, + "learning_rate": 0.0001581271174002464, + "loss": 0.7558, + "step": 3724 + }, + { + "epoch": 1.1960186225718414, + "grad_norm": 0.8810230493545532, + "learning_rate": 0.000158098710025246, + "loss": 0.6703, + "step": 3725 + }, + { + "epoch": 1.1963397013966928, + "grad_norm": 0.852672815322876, + "learning_rate": 0.00015807029557109398, + "loss": 0.6907, + "step": 3726 + }, + { + "epoch": 1.1966607802215443, + "grad_norm": 0.9756046533584595, + "learning_rate": 0.0001580418740412526, + "loss": 0.7458, + "step": 3727 + }, + { + "epoch": 1.1969818590463959, + "grad_norm": 0.7994744181632996, + "learning_rate": 0.00015801344543918495, + "loss": 0.572, + "step": 3728 + }, + { + "epoch": 1.1973029378712474, + "grad_norm": 0.8432713747024536, + "learning_rate": 0.00015798500976835493, + "loss": 0.567, + "step": 3729 + }, + { + "epoch": 1.197624016696099, + "grad_norm": 0.8624154925346375, + "learning_rate": 0.00015795656703222736, + "loss": 0.5077, + "step": 3730 + }, + { + "epoch": 1.1979450955209503, + "grad_norm": 1.059381127357483, + "learning_rate": 0.0001579281172342679, + "loss": 0.6805, + "step": 3731 + }, + { + "epoch": 1.1982661743458018, + "grad_norm": 1.291218638420105, + "learning_rate": 0.00015789966037794306, + "loss": 0.816, + "step": 3732 + }, + { + "epoch": 1.1985872531706534, + "grad_norm": 1.0364362001419067, + "learning_rate": 0.00015787119646672025, + "loss": 0.7035, + "step": 3733 + }, + { + "epoch": 1.198908331995505, + "grad_norm": 0.8102630972862244, + "learning_rate": 0.00015784272550406765, + "loss": 0.4892, + "step": 3734 + }, + { + "epoch": 1.1992294108203563, + "grad_norm": 0.8558127284049988, + "learning_rate": 0.00015781424749345446, + "loss": 0.6971, + "step": 3735 + }, + { + "epoch": 1.1995504896452078, + "grad_norm": 0.9906354546546936, + "learning_rate": 0.00015778576243835054, + "loss": 0.5872, + "step": 3736 + }, + { + "epoch": 1.1998715684700594, + "grad_norm": 1.178027629852295, + "learning_rate": 0.00015775727034222675, + "loss": 0.7676, + "step": 3737 + }, + { + "epoch": 1.200192647294911, + "grad_norm": 0.6666367053985596, + "learning_rate": 0.0001577287712085548, + "loss": 0.4359, + "step": 3738 + }, + { + "epoch": 1.2005137261197625, + "grad_norm": 0.7421877980232239, + "learning_rate": 0.00015770026504080718, + "loss": 0.582, + "step": 3739 + }, + { + "epoch": 1.2008348049446138, + "grad_norm": 1.3498107194900513, + "learning_rate": 0.00015767175184245726, + "loss": 0.656, + "step": 3740 + }, + { + "epoch": 1.2011558837694654, + "grad_norm": 0.896021842956543, + "learning_rate": 0.00015764323161697935, + "loss": 0.4453, + "step": 3741 + }, + { + "epoch": 1.201476962594317, + "grad_norm": 0.9143558740615845, + "learning_rate": 0.00015761470436784846, + "loss": 0.4917, + "step": 3742 + }, + { + "epoch": 1.2017980414191685, + "grad_norm": 1.4460296630859375, + "learning_rate": 0.0001575861700985407, + "loss": 0.8317, + "step": 3743 + }, + { + "epoch": 1.2021191202440198, + "grad_norm": 1.7859466075897217, + "learning_rate": 0.0001575576288125327, + "loss": 0.6328, + "step": 3744 + }, + { + "epoch": 1.2024401990688713, + "grad_norm": 0.773352324962616, + "learning_rate": 0.00015752908051330227, + "loss": 0.4811, + "step": 3745 + }, + { + "epoch": 1.202761277893723, + "grad_norm": 0.9597437381744385, + "learning_rate": 0.00015750052520432787, + "loss": 0.6322, + "step": 3746 + }, + { + "epoch": 1.2030823567185744, + "grad_norm": 1.0898431539535522, + "learning_rate": 0.00015747196288908887, + "loss": 0.6503, + "step": 3747 + }, + { + "epoch": 1.203403435543426, + "grad_norm": 1.4335774183273315, + "learning_rate": 0.00015744339357106558, + "loss": 0.502, + "step": 3748 + }, + { + "epoch": 1.2037245143682773, + "grad_norm": 1.5681631565093994, + "learning_rate": 0.000157414817253739, + "loss": 0.7586, + "step": 3749 + }, + { + "epoch": 1.2040455931931289, + "grad_norm": 1.2081352472305298, + "learning_rate": 0.00015738623394059107, + "loss": 0.743, + "step": 3750 + }, + { + "epoch": 1.2043666720179804, + "grad_norm": 0.9064257740974426, + "learning_rate": 0.0001573576436351046, + "loss": 0.6028, + "step": 3751 + }, + { + "epoch": 1.204687750842832, + "grad_norm": 0.9491926431655884, + "learning_rate": 0.00015732904634076328, + "loss": 0.5614, + "step": 3752 + }, + { + "epoch": 1.2050088296676833, + "grad_norm": 1.0897586345672607, + "learning_rate": 0.00015730044206105156, + "loss": 0.5848, + "step": 3753 + }, + { + "epoch": 1.2053299084925349, + "grad_norm": 0.8406479358673096, + "learning_rate": 0.00015727183079945476, + "loss": 0.4931, + "step": 3754 + }, + { + "epoch": 1.2056509873173864, + "grad_norm": 0.9714513421058655, + "learning_rate": 0.0001572432125594591, + "loss": 0.4901, + "step": 3755 + }, + { + "epoch": 1.205972066142238, + "grad_norm": 1.135326623916626, + "learning_rate": 0.00015721458734455163, + "loss": 0.6236, + "step": 3756 + }, + { + "epoch": 1.2062931449670895, + "grad_norm": 2.5608773231506348, + "learning_rate": 0.00015718595515822027, + "loss": 0.5221, + "step": 3757 + }, + { + "epoch": 1.2066142237919408, + "grad_norm": 0.9237856864929199, + "learning_rate": 0.0001571573160039537, + "loss": 0.5451, + "step": 3758 + }, + { + "epoch": 1.2069353026167924, + "grad_norm": 1.0552841424942017, + "learning_rate": 0.00015712866988524155, + "loss": 0.5602, + "step": 3759 + }, + { + "epoch": 1.207256381441644, + "grad_norm": 1.123316764831543, + "learning_rate": 0.0001571000168055743, + "loss": 0.6103, + "step": 3760 + }, + { + "epoch": 1.2075774602664955, + "grad_norm": 0.7353083491325378, + "learning_rate": 0.0001570713567684432, + "loss": 0.4778, + "step": 3761 + }, + { + "epoch": 1.2078985390913468, + "grad_norm": 1.4208050966262817, + "learning_rate": 0.00015704268977734037, + "loss": 0.529, + "step": 3762 + }, + { + "epoch": 1.2082196179161984, + "grad_norm": 1.0792194604873657, + "learning_rate": 0.00015701401583575884, + "loss": 0.5751, + "step": 3763 + }, + { + "epoch": 1.20854069674105, + "grad_norm": 0.7297836542129517, + "learning_rate": 0.00015698533494719238, + "loss": 0.4981, + "step": 3764 + }, + { + "epoch": 1.2088617755659015, + "grad_norm": 0.9594284892082214, + "learning_rate": 0.00015695664711513576, + "loss": 0.9049, + "step": 3765 + }, + { + "epoch": 1.209182854390753, + "grad_norm": 0.8230898976325989, + "learning_rate": 0.00015692795234308445, + "loss": 0.964, + "step": 3766 + }, + { + "epoch": 1.2095039332156043, + "grad_norm": 0.7228332757949829, + "learning_rate": 0.0001568992506345348, + "loss": 1.1967, + "step": 3767 + }, + { + "epoch": 1.209825012040456, + "grad_norm": 0.8351823687553406, + "learning_rate": 0.00015687054199298408, + "loss": 0.5122, + "step": 3768 + }, + { + "epoch": 1.2101460908653074, + "grad_norm": 1.0156910419464111, + "learning_rate": 0.0001568418264219303, + "loss": 0.4224, + "step": 3769 + }, + { + "epoch": 1.210467169690159, + "grad_norm": 0.9522204995155334, + "learning_rate": 0.0001568131039248724, + "loss": 0.4744, + "step": 3770 + }, + { + "epoch": 1.2107882485150103, + "grad_norm": 0.835364818572998, + "learning_rate": 0.00015678437450531013, + "loss": 0.3386, + "step": 3771 + }, + { + "epoch": 1.2111093273398619, + "grad_norm": 1.0940889120101929, + "learning_rate": 0.00015675563816674407, + "loss": 0.4991, + "step": 3772 + }, + { + "epoch": 1.2114304061647134, + "grad_norm": 0.7849900722503662, + "learning_rate": 0.00015672689491267567, + "loss": 0.3609, + "step": 3773 + }, + { + "epoch": 1.211751484989565, + "grad_norm": 1.0691943168640137, + "learning_rate": 0.00015669814474660718, + "loss": 0.8524, + "step": 3774 + }, + { + "epoch": 1.2120725638144165, + "grad_norm": 2.4347715377807617, + "learning_rate": 0.0001566693876720417, + "loss": 0.6352, + "step": 3775 + }, + { + "epoch": 1.2123936426392679, + "grad_norm": 0.8152544498443604, + "learning_rate": 0.00015664062369248328, + "loss": 0.5616, + "step": 3776 + }, + { + "epoch": 1.2127147214641194, + "grad_norm": 0.7303499579429626, + "learning_rate": 0.00015661185281143667, + "loss": 0.546, + "step": 3777 + }, + { + "epoch": 1.213035800288971, + "grad_norm": 0.9726660251617432, + "learning_rate": 0.0001565830750324075, + "loss": 0.5368, + "step": 3778 + }, + { + "epoch": 1.2133568791138225, + "grad_norm": 0.8433571457862854, + "learning_rate": 0.0001565542903589023, + "loss": 0.5876, + "step": 3779 + }, + { + "epoch": 1.2136779579386738, + "grad_norm": 0.7719855308532715, + "learning_rate": 0.00015652549879442834, + "loss": 0.6917, + "step": 3780 + }, + { + "epoch": 1.2139990367635254, + "grad_norm": 0.931829571723938, + "learning_rate": 0.0001564967003424938, + "loss": 0.7677, + "step": 3781 + }, + { + "epoch": 1.214320115588377, + "grad_norm": 1.2775667905807495, + "learning_rate": 0.00015646789500660773, + "loss": 0.8883, + "step": 3782 + }, + { + "epoch": 1.2146411944132285, + "grad_norm": 0.9605757594108582, + "learning_rate": 0.00015643908279027992, + "loss": 0.6806, + "step": 3783 + }, + { + "epoch": 1.21496227323808, + "grad_norm": 0.9084275960922241, + "learning_rate": 0.00015641026369702106, + "loss": 0.6906, + "step": 3784 + }, + { + "epoch": 1.2152833520629314, + "grad_norm": 0.9735769629478455, + "learning_rate": 0.00015638143773034267, + "loss": 0.6756, + "step": 3785 + }, + { + "epoch": 1.215604430887783, + "grad_norm": 1.068384051322937, + "learning_rate": 0.00015635260489375714, + "loss": 0.7305, + "step": 3786 + }, + { + "epoch": 1.2159255097126345, + "grad_norm": 0.9031590223312378, + "learning_rate": 0.00015632376519077767, + "loss": 0.6657, + "step": 3787 + }, + { + "epoch": 1.216246588537486, + "grad_norm": 1.1685518026351929, + "learning_rate": 0.0001562949186249182, + "loss": 0.8324, + "step": 3788 + }, + { + "epoch": 1.2165676673623373, + "grad_norm": 0.9389743804931641, + "learning_rate": 0.0001562660651996937, + "loss": 0.6689, + "step": 3789 + }, + { + "epoch": 1.216888746187189, + "grad_norm": 0.8513121008872986, + "learning_rate": 0.0001562372049186198, + "loss": 0.5915, + "step": 3790 + }, + { + "epoch": 1.2172098250120404, + "grad_norm": 1.3301939964294434, + "learning_rate": 0.00015620833778521307, + "loss": 0.5882, + "step": 3791 + }, + { + "epoch": 1.217530903836892, + "grad_norm": 1.1245044469833374, + "learning_rate": 0.00015617946380299088, + "loss": 0.6076, + "step": 3792 + }, + { + "epoch": 1.2178519826617435, + "grad_norm": 0.8256964087486267, + "learning_rate": 0.00015615058297547145, + "loss": 0.5895, + "step": 3793 + }, + { + "epoch": 1.2181730614865949, + "grad_norm": 1.246702790260315, + "learning_rate": 0.0001561216953061738, + "loss": 0.6992, + "step": 3794 + }, + { + "epoch": 1.2184941403114464, + "grad_norm": 0.8872213959693909, + "learning_rate": 0.0001560928007986178, + "loss": 0.547, + "step": 3795 + }, + { + "epoch": 1.218815219136298, + "grad_norm": 1.465101718902588, + "learning_rate": 0.0001560638994563242, + "loss": 0.879, + "step": 3796 + }, + { + "epoch": 1.2191362979611495, + "grad_norm": 2.137561321258545, + "learning_rate": 0.00015603499128281448, + "loss": 0.5223, + "step": 3797 + }, + { + "epoch": 1.2194573767860009, + "grad_norm": 1.957757830619812, + "learning_rate": 0.00015600607628161103, + "loss": 0.5569, + "step": 3798 + }, + { + "epoch": 1.2197784556108524, + "grad_norm": 0.9652926921844482, + "learning_rate": 0.00015597715445623712, + "loss": 0.6782, + "step": 3799 + }, + { + "epoch": 1.220099534435704, + "grad_norm": 1.3139662742614746, + "learning_rate": 0.0001559482258102167, + "loss": 0.5627, + "step": 3800 + }, + { + "epoch": 1.2204206132605555, + "grad_norm": 1.0370103120803833, + "learning_rate": 0.0001559192903470747, + "loss": 0.5761, + "step": 3801 + }, + { + "epoch": 1.220741692085407, + "grad_norm": 1.4218648672103882, + "learning_rate": 0.00015589034807033677, + "loss": 0.6787, + "step": 3802 + }, + { + "epoch": 1.2210627709102584, + "grad_norm": 1.046571969985962, + "learning_rate": 0.00015586139898352946, + "loss": 0.5636, + "step": 3803 + }, + { + "epoch": 1.22138384973511, + "grad_norm": 0.8225265145301819, + "learning_rate": 0.00015583244309018014, + "loss": 0.5093, + "step": 3804 + }, + { + "epoch": 1.2217049285599615, + "grad_norm": 0.8345786929130554, + "learning_rate": 0.000155803480393817, + "loss": 0.6215, + "step": 3805 + }, + { + "epoch": 1.222026007384813, + "grad_norm": 1.0300137996673584, + "learning_rate": 0.00015577451089796905, + "loss": 0.5036, + "step": 3806 + }, + { + "epoch": 1.2223470862096644, + "grad_norm": 1.6953532695770264, + "learning_rate": 0.00015574553460616608, + "loss": 0.5542, + "step": 3807 + }, + { + "epoch": 1.222668165034516, + "grad_norm": 1.2669715881347656, + "learning_rate": 0.00015571655152193885, + "loss": 0.5519, + "step": 3808 + }, + { + "epoch": 1.2229892438593675, + "grad_norm": 0.9114670753479004, + "learning_rate": 0.00015568756164881882, + "loss": 0.5563, + "step": 3809 + }, + { + "epoch": 1.223310322684219, + "grad_norm": 1.259053349494934, + "learning_rate": 0.00015565856499033832, + "loss": 0.5622, + "step": 3810 + }, + { + "epoch": 1.2236314015090706, + "grad_norm": 1.13980233669281, + "learning_rate": 0.0001556295615500305, + "loss": 0.528, + "step": 3811 + }, + { + "epoch": 1.223952480333922, + "grad_norm": 1.1629769802093506, + "learning_rate": 0.0001556005513314293, + "loss": 0.389, + "step": 3812 + }, + { + "epoch": 1.2242735591587735, + "grad_norm": 1.1004542112350464, + "learning_rate": 0.00015557153433806966, + "loss": 0.43, + "step": 3813 + }, + { + "epoch": 1.224594637983625, + "grad_norm": 1.0557912588119507, + "learning_rate": 0.00015554251057348713, + "loss": 0.5082, + "step": 3814 + }, + { + "epoch": 1.2249157168084766, + "grad_norm": 0.8889212012290955, + "learning_rate": 0.0001555134800412181, + "loss": 0.9576, + "step": 3815 + }, + { + "epoch": 1.2252367956333279, + "grad_norm": 0.7801089882850647, + "learning_rate": 0.00015548444274479995, + "loss": 1.078, + "step": 3816 + }, + { + "epoch": 1.2255578744581794, + "grad_norm": 0.8572964668273926, + "learning_rate": 0.00015545539868777074, + "loss": 0.5412, + "step": 3817 + }, + { + "epoch": 1.225878953283031, + "grad_norm": 1.0506659746170044, + "learning_rate": 0.00015542634787366942, + "loss": 0.5478, + "step": 3818 + }, + { + "epoch": 1.2262000321078825, + "grad_norm": 0.9632019996643066, + "learning_rate": 0.00015539729030603574, + "loss": 0.4899, + "step": 3819 + }, + { + "epoch": 1.226521110932734, + "grad_norm": 0.9268298149108887, + "learning_rate": 0.00015536822598841024, + "loss": 0.2964, + "step": 3820 + }, + { + "epoch": 1.2268421897575854, + "grad_norm": 0.7900720834732056, + "learning_rate": 0.00015533915492433443, + "loss": 0.4777, + "step": 3821 + }, + { + "epoch": 1.227163268582437, + "grad_norm": 0.9437267780303955, + "learning_rate": 0.0001553100771173504, + "loss": 0.3384, + "step": 3822 + }, + { + "epoch": 1.2274843474072885, + "grad_norm": 0.995335578918457, + "learning_rate": 0.00015528099257100127, + "loss": 0.5097, + "step": 3823 + }, + { + "epoch": 1.22780542623214, + "grad_norm": 1.3662070035934448, + "learning_rate": 0.00015525190128883083, + "loss": 0.7474, + "step": 3824 + }, + { + "epoch": 1.2281265050569914, + "grad_norm": 1.1693106889724731, + "learning_rate": 0.00015522280327438388, + "loss": 0.6907, + "step": 3825 + }, + { + "epoch": 1.228447583881843, + "grad_norm": 0.9598641395568848, + "learning_rate": 0.0001551936985312058, + "loss": 0.651, + "step": 3826 + }, + { + "epoch": 1.2287686627066945, + "grad_norm": 0.9100416898727417, + "learning_rate": 0.00015516458706284303, + "loss": 0.6725, + "step": 3827 + }, + { + "epoch": 1.229089741531546, + "grad_norm": 0.8291442394256592, + "learning_rate": 0.00015513546887284264, + "loss": 0.621, + "step": 3828 + }, + { + "epoch": 1.2294108203563976, + "grad_norm": 0.8551065325737, + "learning_rate": 0.0001551063439647526, + "loss": 0.593, + "step": 3829 + }, + { + "epoch": 1.229731899181249, + "grad_norm": 0.6962476968765259, + "learning_rate": 0.00015507721234212172, + "loss": 0.6304, + "step": 3830 + }, + { + "epoch": 1.2300529780061005, + "grad_norm": 0.8959445357322693, + "learning_rate": 0.00015504807400849958, + "loss": 0.7921, + "step": 3831 + }, + { + "epoch": 1.230374056830952, + "grad_norm": 0.88429856300354, + "learning_rate": 0.0001550189289674366, + "loss": 0.6881, + "step": 3832 + }, + { + "epoch": 1.2306951356558036, + "grad_norm": 1.0589004755020142, + "learning_rate": 0.000154989777222484, + "loss": 0.6597, + "step": 3833 + }, + { + "epoch": 1.231016214480655, + "grad_norm": 0.932532548904419, + "learning_rate": 0.00015496061877719384, + "loss": 0.6299, + "step": 3834 + }, + { + "epoch": 1.2313372933055065, + "grad_norm": 1.763346552848816, + "learning_rate": 0.000154931453635119, + "loss": 0.6717, + "step": 3835 + }, + { + "epoch": 1.231658372130358, + "grad_norm": 2.0481505393981934, + "learning_rate": 0.0001549022817998132, + "loss": 0.6489, + "step": 3836 + }, + { + "epoch": 1.2319794509552096, + "grad_norm": 0.8745352029800415, + "learning_rate": 0.00015487310327483086, + "loss": 0.6171, + "step": 3837 + }, + { + "epoch": 1.232300529780061, + "grad_norm": 0.8354154825210571, + "learning_rate": 0.00015484391806372733, + "loss": 0.669, + "step": 3838 + }, + { + "epoch": 1.2326216086049124, + "grad_norm": 0.9111486673355103, + "learning_rate": 0.00015481472617005876, + "loss": 0.7414, + "step": 3839 + }, + { + "epoch": 1.232942687429764, + "grad_norm": 1.0252684354782104, + "learning_rate": 0.00015478552759738207, + "loss": 0.724, + "step": 3840 + }, + { + "epoch": 1.2332637662546155, + "grad_norm": 1.018760323524475, + "learning_rate": 0.00015475632234925504, + "loss": 0.5888, + "step": 3841 + }, + { + "epoch": 1.233584845079467, + "grad_norm": 1.5923256874084473, + "learning_rate": 0.00015472711042923621, + "loss": 0.6708, + "step": 3842 + }, + { + "epoch": 1.2339059239043184, + "grad_norm": 1.3504046201705933, + "learning_rate": 0.00015469789184088497, + "loss": 0.6927, + "step": 3843 + }, + { + "epoch": 1.23422700272917, + "grad_norm": 0.9088150262832642, + "learning_rate": 0.00015466866658776155, + "loss": 0.6003, + "step": 3844 + }, + { + "epoch": 1.2345480815540215, + "grad_norm": 0.9223833680152893, + "learning_rate": 0.00015463943467342693, + "loss": 0.7096, + "step": 3845 + }, + { + "epoch": 1.234869160378873, + "grad_norm": 0.9848785400390625, + "learning_rate": 0.0001546101961014429, + "loss": 0.6357, + "step": 3846 + }, + { + "epoch": 1.2351902392037246, + "grad_norm": 0.9770264029502869, + "learning_rate": 0.00015458095087537218, + "loss": 0.5513, + "step": 3847 + }, + { + "epoch": 1.235511318028576, + "grad_norm": 0.8450601696968079, + "learning_rate": 0.00015455169899877813, + "loss": 0.571, + "step": 3848 + }, + { + "epoch": 1.2358323968534275, + "grad_norm": 0.8427115678787231, + "learning_rate": 0.00015452244047522502, + "loss": 0.637, + "step": 3849 + }, + { + "epoch": 1.236153475678279, + "grad_norm": 0.7641344666481018, + "learning_rate": 0.00015449317530827794, + "loss": 0.564, + "step": 3850 + }, + { + "epoch": 1.2364745545031306, + "grad_norm": 1.1168718338012695, + "learning_rate": 0.00015446390350150273, + "loss": 0.8165, + "step": 3851 + }, + { + "epoch": 1.236795633327982, + "grad_norm": 1.0445823669433594, + "learning_rate": 0.0001544346250584661, + "loss": 0.5067, + "step": 3852 + }, + { + "epoch": 1.2371167121528335, + "grad_norm": 0.9036487340927124, + "learning_rate": 0.00015440533998273547, + "loss": 0.5537, + "step": 3853 + }, + { + "epoch": 1.237437790977685, + "grad_norm": 0.7309536933898926, + "learning_rate": 0.00015437604827787927, + "loss": 0.4998, + "step": 3854 + }, + { + "epoch": 1.2377588698025366, + "grad_norm": 0.7521160840988159, + "learning_rate": 0.0001543467499474665, + "loss": 0.4568, + "step": 3855 + }, + { + "epoch": 1.2380799486273881, + "grad_norm": 1.4212712049484253, + "learning_rate": 0.00015431744499506706, + "loss": 0.6532, + "step": 3856 + }, + { + "epoch": 1.2384010274522395, + "grad_norm": 0.758643627166748, + "learning_rate": 0.00015428813342425177, + "loss": 0.436, + "step": 3857 + }, + { + "epoch": 1.238722106277091, + "grad_norm": 0.8695588707923889, + "learning_rate": 0.00015425881523859207, + "loss": 0.4969, + "step": 3858 + }, + { + "epoch": 1.2390431851019426, + "grad_norm": 1.0201971530914307, + "learning_rate": 0.0001542294904416603, + "loss": 0.5688, + "step": 3859 + }, + { + "epoch": 1.239364263926794, + "grad_norm": 0.7259702682495117, + "learning_rate": 0.00015420015903702962, + "loss": 0.4379, + "step": 3860 + }, + { + "epoch": 1.2396853427516454, + "grad_norm": 1.1150838136672974, + "learning_rate": 0.000154170821028274, + "loss": 0.6326, + "step": 3861 + }, + { + "epoch": 1.240006421576497, + "grad_norm": 0.9569064974784851, + "learning_rate": 0.00015414147641896813, + "loss": 0.4673, + "step": 3862 + }, + { + "epoch": 1.2403275004013485, + "grad_norm": 0.6631338000297546, + "learning_rate": 0.00015411212521268758, + "loss": 0.4138, + "step": 3863 + }, + { + "epoch": 1.2406485792262, + "grad_norm": 1.1545629501342773, + "learning_rate": 0.00015408276741300873, + "loss": 0.4498, + "step": 3864 + }, + { + "epoch": 1.2409696580510516, + "grad_norm": 3.294052839279175, + "learning_rate": 0.00015405340302350871, + "loss": 0.7942, + "step": 3865 + }, + { + "epoch": 1.241290736875903, + "grad_norm": 0.7048175930976868, + "learning_rate": 0.0001540240320477655, + "loss": 0.727, + "step": 3866 + }, + { + "epoch": 1.2416118157007545, + "grad_norm": 0.6912996172904968, + "learning_rate": 0.00015399465448935788, + "loss": 0.7886, + "step": 3867 + }, + { + "epoch": 1.241932894525606, + "grad_norm": 0.8170081377029419, + "learning_rate": 0.00015396527035186537, + "loss": 0.4395, + "step": 3868 + }, + { + "epoch": 1.2422539733504576, + "grad_norm": 0.9392108917236328, + "learning_rate": 0.00015393587963886835, + "loss": 0.3322, + "step": 3869 + }, + { + "epoch": 1.242575052175309, + "grad_norm": 1.0908229351043701, + "learning_rate": 0.00015390648235394803, + "loss": 0.3907, + "step": 3870 + }, + { + "epoch": 1.2428961310001605, + "grad_norm": 1.2291988134384155, + "learning_rate": 0.0001538770785006863, + "loss": 0.5189, + "step": 3871 + }, + { + "epoch": 1.243217209825012, + "grad_norm": 0.9118694067001343, + "learning_rate": 0.00015384766808266602, + "loss": 0.702, + "step": 3872 + }, + { + "epoch": 1.2435382886498636, + "grad_norm": 1.1265658140182495, + "learning_rate": 0.00015381825110347074, + "loss": 0.6291, + "step": 3873 + }, + { + "epoch": 1.2438593674747151, + "grad_norm": 0.9491468667984009, + "learning_rate": 0.00015378882756668478, + "loss": 0.7169, + "step": 3874 + }, + { + "epoch": 1.2441804462995665, + "grad_norm": 0.8333759307861328, + "learning_rate": 0.00015375939747589335, + "loss": 0.5432, + "step": 3875 + }, + { + "epoch": 1.244501525124418, + "grad_norm": 0.7121776342391968, + "learning_rate": 0.0001537299608346824, + "loss": 0.4806, + "step": 3876 + }, + { + "epoch": 1.2448226039492696, + "grad_norm": 0.9570683836936951, + "learning_rate": 0.0001537005176466387, + "loss": 0.6031, + "step": 3877 + }, + { + "epoch": 1.2451436827741211, + "grad_norm": 0.8433529734611511, + "learning_rate": 0.00015367106791534983, + "loss": 0.7327, + "step": 3878 + }, + { + "epoch": 1.2454647615989725, + "grad_norm": 0.8592220544815063, + "learning_rate": 0.0001536416116444041, + "loss": 0.5449, + "step": 3879 + }, + { + "epoch": 1.245785840423824, + "grad_norm": 0.9075251221656799, + "learning_rate": 0.00015361214883739076, + "loss": 0.6982, + "step": 3880 + }, + { + "epoch": 1.2461069192486756, + "grad_norm": 1.0835230350494385, + "learning_rate": 0.00015358267949789966, + "loss": 0.6304, + "step": 3881 + }, + { + "epoch": 1.246427998073527, + "grad_norm": 0.9230034947395325, + "learning_rate": 0.00015355320362952162, + "loss": 0.631, + "step": 3882 + }, + { + "epoch": 1.2467490768983787, + "grad_norm": 1.1106213331222534, + "learning_rate": 0.00015352372123584814, + "loss": 0.7395, + "step": 3883 + }, + { + "epoch": 1.24707015572323, + "grad_norm": 1.3075162172317505, + "learning_rate": 0.00015349423232047162, + "loss": 0.6999, + "step": 3884 + }, + { + "epoch": 1.2473912345480815, + "grad_norm": 1.0641080141067505, + "learning_rate": 0.00015346473688698513, + "loss": 0.6968, + "step": 3885 + }, + { + "epoch": 1.247712313372933, + "grad_norm": 0.8962423801422119, + "learning_rate": 0.00015343523493898265, + "loss": 0.5366, + "step": 3886 + }, + { + "epoch": 1.2480333921977846, + "grad_norm": 1.0747464895248413, + "learning_rate": 0.00015340572648005888, + "loss": 0.5723, + "step": 3887 + }, + { + "epoch": 1.248354471022636, + "grad_norm": 1.0071834325790405, + "learning_rate": 0.0001533762115138093, + "loss": 0.6652, + "step": 3888 + }, + { + "epoch": 1.2486755498474875, + "grad_norm": 1.272632360458374, + "learning_rate": 0.0001533466900438303, + "loss": 0.7782, + "step": 3889 + }, + { + "epoch": 1.248996628672339, + "grad_norm": 1.1216950416564941, + "learning_rate": 0.00015331716207371888, + "loss": 0.6813, + "step": 3890 + }, + { + "epoch": 1.2493177074971906, + "grad_norm": 0.9694088697433472, + "learning_rate": 0.000153287627607073, + "loss": 0.5781, + "step": 3891 + }, + { + "epoch": 1.2496387863220422, + "grad_norm": 1.916099190711975, + "learning_rate": 0.00015325808664749135, + "loss": 0.8175, + "step": 3892 + }, + { + "epoch": 1.2499598651468935, + "grad_norm": 0.9841503500938416, + "learning_rate": 0.0001532285391985734, + "loss": 0.6193, + "step": 3893 + }, + { + "epoch": 1.250280943971745, + "grad_norm": 1.1670957803726196, + "learning_rate": 0.0001531989852639194, + "loss": 0.7807, + "step": 3894 + }, + { + "epoch": 1.2506020227965966, + "grad_norm": 1.1303762197494507, + "learning_rate": 0.0001531694248471304, + "loss": 0.7768, + "step": 3895 + }, + { + "epoch": 1.250923101621448, + "grad_norm": 0.9170710444450378, + "learning_rate": 0.00015313985795180828, + "loss": 0.6192, + "step": 3896 + }, + { + "epoch": 1.2512441804462995, + "grad_norm": 1.1706234216690063, + "learning_rate": 0.00015311028458155567, + "loss": 0.6049, + "step": 3897 + }, + { + "epoch": 1.251565259271151, + "grad_norm": 1.188920021057129, + "learning_rate": 0.00015308070473997598, + "loss": 0.6907, + "step": 3898 + }, + { + "epoch": 1.2518863380960026, + "grad_norm": 0.8956981301307678, + "learning_rate": 0.0001530511184306734, + "loss": 0.5518, + "step": 3899 + }, + { + "epoch": 1.2522074169208541, + "grad_norm": 0.978665828704834, + "learning_rate": 0.00015302152565725298, + "loss": 0.5264, + "step": 3900 + }, + { + "epoch": 1.2525284957457057, + "grad_norm": 0.822583794593811, + "learning_rate": 0.0001529919264233205, + "loss": 0.5521, + "step": 3901 + }, + { + "epoch": 1.252849574570557, + "grad_norm": 0.7783530354499817, + "learning_rate": 0.00015296232073248251, + "loss": 0.4333, + "step": 3902 + }, + { + "epoch": 1.2531706533954086, + "grad_norm": 0.7691320776939392, + "learning_rate": 0.00015293270858834644, + "loss": 0.4828, + "step": 3903 + }, + { + "epoch": 1.2534917322202601, + "grad_norm": 1.0390291213989258, + "learning_rate": 0.00015290308999452032, + "loss": 0.7419, + "step": 3904 + }, + { + "epoch": 1.2538128110451114, + "grad_norm": 0.8240483999252319, + "learning_rate": 0.00015287346495461315, + "loss": 0.5811, + "step": 3905 + }, + { + "epoch": 1.254133889869963, + "grad_norm": 1.1478774547576904, + "learning_rate": 0.00015284383347223472, + "loss": 0.7649, + "step": 3906 + }, + { + "epoch": 1.2544549686948145, + "grad_norm": 1.2211809158325195, + "learning_rate": 0.00015281419555099546, + "loss": 0.6535, + "step": 3907 + }, + { + "epoch": 1.254776047519666, + "grad_norm": 1.166593074798584, + "learning_rate": 0.00015278455119450664, + "loss": 0.6659, + "step": 3908 + }, + { + "epoch": 1.2550971263445176, + "grad_norm": 0.8463481068611145, + "learning_rate": 0.00015275490040638038, + "loss": 0.508, + "step": 3909 + }, + { + "epoch": 1.2554182051693692, + "grad_norm": 1.5510456562042236, + "learning_rate": 0.00015272524319022955, + "loss": 0.3536, + "step": 3910 + }, + { + "epoch": 1.2557392839942205, + "grad_norm": 0.9114112854003906, + "learning_rate": 0.00015269557954966778, + "loss": 0.5686, + "step": 3911 + }, + { + "epoch": 1.256060362819072, + "grad_norm": 0.8683449029922485, + "learning_rate": 0.00015266590948830947, + "loss": 0.4499, + "step": 3912 + }, + { + "epoch": 1.2563814416439236, + "grad_norm": 0.9325652122497559, + "learning_rate": 0.00015263623300976978, + "loss": 0.4684, + "step": 3913 + }, + { + "epoch": 1.256702520468775, + "grad_norm": 0.4808101952075958, + "learning_rate": 0.00015260655011766484, + "loss": 0.3881, + "step": 3914 + }, + { + "epoch": 1.2570235992936265, + "grad_norm": 0.9658359289169312, + "learning_rate": 0.00015257686081561134, + "loss": 0.9497, + "step": 3915 + }, + { + "epoch": 1.257344678118478, + "grad_norm": 0.7216698527336121, + "learning_rate": 0.00015254716510722677, + "loss": 0.7033, + "step": 3916 + }, + { + "epoch": 1.2576657569433296, + "grad_norm": 0.9119216799736023, + "learning_rate": 0.0001525174629961296, + "loss": 0.6078, + "step": 3917 + }, + { + "epoch": 1.2579868357681812, + "grad_norm": 0.9573872685432434, + "learning_rate": 0.00015248775448593882, + "loss": 0.5487, + "step": 3918 + }, + { + "epoch": 1.2583079145930327, + "grad_norm": 0.8556592464447021, + "learning_rate": 0.00015245803958027434, + "loss": 0.3048, + "step": 3919 + }, + { + "epoch": 1.258628993417884, + "grad_norm": 0.7970437407493591, + "learning_rate": 0.00015242831828275692, + "loss": 0.5525, + "step": 3920 + }, + { + "epoch": 1.2589500722427356, + "grad_norm": 1.1400879621505737, + "learning_rate": 0.00015239859059700794, + "loss": 0.7523, + "step": 3921 + }, + { + "epoch": 1.2592711510675871, + "grad_norm": 1.1350511312484741, + "learning_rate": 0.00015236885652664963, + "loss": 0.6163, + "step": 3922 + }, + { + "epoch": 1.2595922298924385, + "grad_norm": 1.0047470331192017, + "learning_rate": 0.00015233911607530497, + "loss": 0.5577, + "step": 3923 + }, + { + "epoch": 1.25991330871729, + "grad_norm": 1.0919958353042603, + "learning_rate": 0.0001523093692465978, + "loss": 0.5031, + "step": 3924 + }, + { + "epoch": 1.2602343875421416, + "grad_norm": 0.7224966883659363, + "learning_rate": 0.00015227961604415266, + "loss": 0.5719, + "step": 3925 + }, + { + "epoch": 1.2605554663669931, + "grad_norm": 0.9928240776062012, + "learning_rate": 0.0001522498564715949, + "loss": 0.6879, + "step": 3926 + }, + { + "epoch": 1.2608765451918447, + "grad_norm": 0.8768775463104248, + "learning_rate": 0.0001522200905325506, + "loss": 0.6634, + "step": 3927 + }, + { + "epoch": 1.2611976240166962, + "grad_norm": 0.8131898641586304, + "learning_rate": 0.00015219031823064666, + "loss": 0.5647, + "step": 3928 + }, + { + "epoch": 1.2615187028415475, + "grad_norm": 1.0961014032363892, + "learning_rate": 0.0001521605395695108, + "loss": 0.6579, + "step": 3929 + }, + { + "epoch": 1.261839781666399, + "grad_norm": 0.9365900754928589, + "learning_rate": 0.0001521307545527714, + "loss": 0.6051, + "step": 3930 + }, + { + "epoch": 1.2621608604912506, + "grad_norm": 1.2082931995391846, + "learning_rate": 0.00015210096318405767, + "loss": 0.7062, + "step": 3931 + }, + { + "epoch": 1.262481939316102, + "grad_norm": 0.8848322033882141, + "learning_rate": 0.0001520711654669996, + "loss": 0.6358, + "step": 3932 + }, + { + "epoch": 1.2628030181409535, + "grad_norm": 0.8963106274604797, + "learning_rate": 0.00015204136140522799, + "loss": 0.5787, + "step": 3933 + }, + { + "epoch": 1.263124096965805, + "grad_norm": 1.1440925598144531, + "learning_rate": 0.0001520115510023743, + "loss": 0.6726, + "step": 3934 + }, + { + "epoch": 1.2634451757906566, + "grad_norm": 1.4616795778274536, + "learning_rate": 0.00015198173426207094, + "loss": 0.8507, + "step": 3935 + }, + { + "epoch": 1.2637662546155082, + "grad_norm": 1.1262779235839844, + "learning_rate": 0.00015195191118795096, + "loss": 0.6682, + "step": 3936 + }, + { + "epoch": 1.2640873334403597, + "grad_norm": 0.878693699836731, + "learning_rate": 0.00015192208178364816, + "loss": 0.6938, + "step": 3937 + }, + { + "epoch": 1.264408412265211, + "grad_norm": 1.1921688318252563, + "learning_rate": 0.00015189224605279718, + "loss": 0.6734, + "step": 3938 + }, + { + "epoch": 1.2647294910900626, + "grad_norm": 1.2111103534698486, + "learning_rate": 0.00015186240399903342, + "loss": 0.6716, + "step": 3939 + }, + { + "epoch": 1.2650505699149142, + "grad_norm": 1.1788506507873535, + "learning_rate": 0.00015183255562599307, + "loss": 0.6973, + "step": 3940 + }, + { + "epoch": 1.2653716487397655, + "grad_norm": 1.2426005601882935, + "learning_rate": 0.00015180270093731303, + "loss": 0.5777, + "step": 3941 + }, + { + "epoch": 1.265692727564617, + "grad_norm": 1.2214686870574951, + "learning_rate": 0.000151772839936631, + "loss": 0.8486, + "step": 3942 + }, + { + "epoch": 1.2660138063894686, + "grad_norm": 0.9162314534187317, + "learning_rate": 0.0001517429726275855, + "loss": 0.4804, + "step": 3943 + }, + { + "epoch": 1.2663348852143201, + "grad_norm": 1.108119249343872, + "learning_rate": 0.00015171309901381572, + "loss": 0.7479, + "step": 3944 + }, + { + "epoch": 1.2666559640391717, + "grad_norm": 0.9174817204475403, + "learning_rate": 0.00015168321909896172, + "loss": 0.6093, + "step": 3945 + }, + { + "epoch": 1.2669770428640232, + "grad_norm": 1.142285704612732, + "learning_rate": 0.0001516533328866642, + "loss": 0.6486, + "step": 3946 + }, + { + "epoch": 1.2672981216888746, + "grad_norm": 1.0875240564346313, + "learning_rate": 0.00015162344038056476, + "loss": 0.7344, + "step": 3947 + }, + { + "epoch": 1.2676192005137261, + "grad_norm": 0.7915586233139038, + "learning_rate": 0.00015159354158430572, + "loss": 0.5285, + "step": 3948 + }, + { + "epoch": 1.2679402793385777, + "grad_norm": 1.0847499370574951, + "learning_rate": 0.00015156363650153012, + "loss": 0.5974, + "step": 3949 + }, + { + "epoch": 1.268261358163429, + "grad_norm": 0.7492148876190186, + "learning_rate": 0.00015153372513588182, + "loss": 0.5093, + "step": 3950 + }, + { + "epoch": 1.2685824369882805, + "grad_norm": 1.150233507156372, + "learning_rate": 0.00015150380749100545, + "loss": 0.6276, + "step": 3951 + }, + { + "epoch": 1.268903515813132, + "grad_norm": 0.8965975642204285, + "learning_rate": 0.00015147388357054632, + "loss": 0.6621, + "step": 3952 + }, + { + "epoch": 1.2692245946379836, + "grad_norm": 1.1043106317520142, + "learning_rate": 0.00015144395337815064, + "loss": 0.6958, + "step": 3953 + }, + { + "epoch": 1.2695456734628352, + "grad_norm": 1.1139799356460571, + "learning_rate": 0.0001514140169174653, + "loss": 0.613, + "step": 3954 + }, + { + "epoch": 1.2698667522876868, + "grad_norm": 1.0345196723937988, + "learning_rate": 0.00015138407419213796, + "loss": 0.5999, + "step": 3955 + }, + { + "epoch": 1.270187831112538, + "grad_norm": 1.0387682914733887, + "learning_rate": 0.00015135412520581702, + "loss": 0.6301, + "step": 3956 + }, + { + "epoch": 1.2705089099373896, + "grad_norm": 0.9722599387168884, + "learning_rate": 0.0001513241699621517, + "loss": 0.6064, + "step": 3957 + }, + { + "epoch": 1.2708299887622412, + "grad_norm": 1.886247158050537, + "learning_rate": 0.00015129420846479196, + "loss": 0.7651, + "step": 3958 + }, + { + "epoch": 1.2711510675870925, + "grad_norm": 1.071523666381836, + "learning_rate": 0.0001512642407173885, + "loss": 0.5445, + "step": 3959 + }, + { + "epoch": 1.271472146411944, + "grad_norm": 0.7643309235572815, + "learning_rate": 0.00015123426672359285, + "loss": 0.3537, + "step": 3960 + }, + { + "epoch": 1.2717932252367956, + "grad_norm": 1.0264235734939575, + "learning_rate": 0.00015120428648705717, + "loss": 0.6581, + "step": 3961 + }, + { + "epoch": 1.2721143040616472, + "grad_norm": 1.0826537609100342, + "learning_rate": 0.00015117430001143452, + "loss": 0.6621, + "step": 3962 + }, + { + "epoch": 1.2724353828864987, + "grad_norm": 0.5251657366752625, + "learning_rate": 0.0001511443073003786, + "loss": 0.3996, + "step": 3963 + }, + { + "epoch": 1.2727564617113503, + "grad_norm": 0.6575037240982056, + "learning_rate": 0.000151114308357544, + "loss": 0.3963, + "step": 3964 + }, + { + "epoch": 1.2730775405362016, + "grad_norm": 0.7141722440719604, + "learning_rate": 0.000151084303186586, + "loss": 0.6904, + "step": 3965 + }, + { + "epoch": 1.2733986193610531, + "grad_norm": 0.8512976169586182, + "learning_rate": 0.0001510542917911606, + "loss": 0.999, + "step": 3966 + }, + { + "epoch": 1.2737196981859047, + "grad_norm": 0.7313768863677979, + "learning_rate": 0.0001510242741749246, + "loss": 1.1832, + "step": 3967 + }, + { + "epoch": 1.274040777010756, + "grad_norm": 0.7449411153793335, + "learning_rate": 0.00015099425034153553, + "loss": 0.6666, + "step": 3968 + }, + { + "epoch": 1.2743618558356076, + "grad_norm": 0.7838384509086609, + "learning_rate": 0.00015096422029465178, + "loss": 0.4504, + "step": 3969 + }, + { + "epoch": 1.2746829346604591, + "grad_norm": 0.6540324091911316, + "learning_rate": 0.00015093418403793238, + "loss": 0.3508, + "step": 3970 + }, + { + "epoch": 1.2750040134853107, + "grad_norm": 0.8694151043891907, + "learning_rate": 0.00015090414157503714, + "loss": 0.3151, + "step": 3971 + }, + { + "epoch": 1.2753250923101622, + "grad_norm": 1.0265659093856812, + "learning_rate": 0.00015087409290962667, + "loss": 0.3798, + "step": 3972 + }, + { + "epoch": 1.2756461711350138, + "grad_norm": 0.7983294129371643, + "learning_rate": 0.0001508440380453623, + "loss": 0.7452, + "step": 3973 + }, + { + "epoch": 1.275967249959865, + "grad_norm": 1.3019341230392456, + "learning_rate": 0.0001508139769859061, + "loss": 0.6089, + "step": 3974 + }, + { + "epoch": 1.2762883287847167, + "grad_norm": 0.756151556968689, + "learning_rate": 0.00015078390973492092, + "loss": 0.6198, + "step": 3975 + }, + { + "epoch": 1.2766094076095682, + "grad_norm": 0.797984778881073, + "learning_rate": 0.00015075383629607042, + "loss": 0.5604, + "step": 3976 + }, + { + "epoch": 1.2769304864344195, + "grad_norm": 0.7493342161178589, + "learning_rate": 0.00015072375667301893, + "loss": 0.6128, + "step": 3977 + }, + { + "epoch": 1.277251565259271, + "grad_norm": 0.8932711482048035, + "learning_rate": 0.00015069367086943154, + "loss": 0.5593, + "step": 3978 + }, + { + "epoch": 1.2775726440841226, + "grad_norm": 0.9008491039276123, + "learning_rate": 0.0001506635788889741, + "loss": 0.4842, + "step": 3979 + }, + { + "epoch": 1.2778937229089742, + "grad_norm": 0.8058199882507324, + "learning_rate": 0.00015063348073531324, + "loss": 0.6554, + "step": 3980 + }, + { + "epoch": 1.2782148017338257, + "grad_norm": 1.0209277868270874, + "learning_rate": 0.00015060337641211637, + "loss": 0.5969, + "step": 3981 + }, + { + "epoch": 1.2785358805586773, + "grad_norm": 1.4035515785217285, + "learning_rate": 0.0001505732659230516, + "loss": 0.7034, + "step": 3982 + }, + { + "epoch": 1.2788569593835286, + "grad_norm": 0.963071882724762, + "learning_rate": 0.0001505431492717878, + "loss": 0.5722, + "step": 3983 + }, + { + "epoch": 1.2791780382083802, + "grad_norm": 1.2736893892288208, + "learning_rate": 0.0001505130264619945, + "loss": 0.6295, + "step": 3984 + }, + { + "epoch": 1.2794991170332317, + "grad_norm": 0.9955981373786926, + "learning_rate": 0.0001504828974973422, + "loss": 0.7137, + "step": 3985 + }, + { + "epoch": 1.279820195858083, + "grad_norm": 1.3226053714752197, + "learning_rate": 0.00015045276238150192, + "loss": 0.7224, + "step": 3986 + }, + { + "epoch": 1.2801412746829346, + "grad_norm": 1.3493560552597046, + "learning_rate": 0.00015042262111814565, + "loss": 0.932, + "step": 3987 + }, + { + "epoch": 1.2804623535077861, + "grad_norm": 0.8732056617736816, + "learning_rate": 0.00015039247371094588, + "loss": 0.5749, + "step": 3988 + }, + { + "epoch": 1.2807834323326377, + "grad_norm": 0.9701531529426575, + "learning_rate": 0.0001503623201635761, + "loss": 0.6856, + "step": 3989 + }, + { + "epoch": 1.2811045111574892, + "grad_norm": 1.385581135749817, + "learning_rate": 0.00015033216047971031, + "loss": 0.7327, + "step": 3990 + }, + { + "epoch": 1.2814255899823408, + "grad_norm": 0.7699181437492371, + "learning_rate": 0.00015030199466302353, + "loss": 0.5799, + "step": 3991 + }, + { + "epoch": 1.2817466688071921, + "grad_norm": 1.5015044212341309, + "learning_rate": 0.00015027182271719122, + "loss": 0.8638, + "step": 3992 + }, + { + "epoch": 1.2820677476320437, + "grad_norm": 0.9158933758735657, + "learning_rate": 0.00015024164464588982, + "loss": 0.7239, + "step": 3993 + }, + { + "epoch": 1.2823888264568952, + "grad_norm": 1.1398961544036865, + "learning_rate": 0.0001502114604527964, + "loss": 0.7259, + "step": 3994 + }, + { + "epoch": 1.2827099052817466, + "grad_norm": 0.9500359892845154, + "learning_rate": 0.00015018127014158886, + "loss": 0.5439, + "step": 3995 + }, + { + "epoch": 1.283030984106598, + "grad_norm": 0.9452093243598938, + "learning_rate": 0.00015015107371594573, + "loss": 0.617, + "step": 3996 + }, + { + "epoch": 1.2833520629314497, + "grad_norm": 0.881150484085083, + "learning_rate": 0.00015012087117954642, + "loss": 0.4604, + "step": 3997 + }, + { + "epoch": 1.2836731417563012, + "grad_norm": 1.126312494277954, + "learning_rate": 0.000150090662536071, + "loss": 0.8014, + "step": 3998 + }, + { + "epoch": 1.2839942205811528, + "grad_norm": 1.0232257843017578, + "learning_rate": 0.0001500604477892003, + "loss": 0.7495, + "step": 3999 + }, + { + "epoch": 1.2843152994060043, + "grad_norm": 0.9484618902206421, + "learning_rate": 0.00015003022694261585, + "loss": 0.6574, + "step": 4000 + }, + { + "epoch": 1.2846363782308556, + "grad_norm": 0.8673555850982666, + "learning_rate": 0.00015000000000000001, + "loss": 0.6955, + "step": 4001 + }, + { + "epoch": 1.2849574570557072, + "grad_norm": 0.842471718788147, + "learning_rate": 0.00014996976696503587, + "loss": 0.5209, + "step": 4002 + }, + { + "epoch": 1.2852785358805587, + "grad_norm": 0.6719858050346375, + "learning_rate": 0.00014993952784140717, + "loss": 0.4186, + "step": 4003 + }, + { + "epoch": 1.28559961470541, + "grad_norm": 1.7661453485488892, + "learning_rate": 0.00014990928263279848, + "loss": 0.5979, + "step": 4004 + }, + { + "epoch": 1.2859206935302616, + "grad_norm": 1.1179763078689575, + "learning_rate": 0.00014987903134289508, + "loss": 0.5886, + "step": 4005 + }, + { + "epoch": 1.2862417723551132, + "grad_norm": 0.9625594615936279, + "learning_rate": 0.00014984877397538303, + "loss": 0.5743, + "step": 4006 + }, + { + "epoch": 1.2865628511799647, + "grad_norm": 1.0250779390335083, + "learning_rate": 0.0001498185105339491, + "loss": 0.5355, + "step": 4007 + }, + { + "epoch": 1.2868839300048163, + "grad_norm": 1.0908957719802856, + "learning_rate": 0.00014978824102228076, + "loss": 0.5967, + "step": 4008 + }, + { + "epoch": 1.2872050088296678, + "grad_norm": 0.8703522086143494, + "learning_rate": 0.00014975796544406625, + "loss": 0.5883, + "step": 4009 + }, + { + "epoch": 1.2875260876545191, + "grad_norm": 0.976186990737915, + "learning_rate": 0.0001497276838029946, + "loss": 0.5918, + "step": 4010 + }, + { + "epoch": 1.2878471664793707, + "grad_norm": 1.2151916027069092, + "learning_rate": 0.00014969739610275556, + "loss": 0.4542, + "step": 4011 + }, + { + "epoch": 1.2881682453042222, + "grad_norm": 0.8926727175712585, + "learning_rate": 0.0001496671023470395, + "loss": 0.5784, + "step": 4012 + }, + { + "epoch": 1.2884893241290736, + "grad_norm": 1.0263932943344116, + "learning_rate": 0.0001496368025395377, + "loss": 0.5809, + "step": 4013 + }, + { + "epoch": 1.2888104029539251, + "grad_norm": 1.3325320482254028, + "learning_rate": 0.00014960649668394207, + "loss": 0.4552, + "step": 4014 + }, + { + "epoch": 1.2891314817787767, + "grad_norm": 1.0666171312332153, + "learning_rate": 0.00014957618478394529, + "loss": 1.0576, + "step": 4015 + }, + { + "epoch": 1.2894525606036282, + "grad_norm": 0.7908462285995483, + "learning_rate": 0.00014954586684324078, + "loss": 1.0942, + "step": 4016 + }, + { + "epoch": 1.2897736394284798, + "grad_norm": 0.7149842381477356, + "learning_rate": 0.00014951554286552266, + "loss": 0.8738, + "step": 4017 + }, + { + "epoch": 1.2900947182533313, + "grad_norm": 0.8624144196510315, + "learning_rate": 0.00014948521285448586, + "loss": 0.9059, + "step": 4018 + }, + { + "epoch": 1.2904157970781827, + "grad_norm": 0.8445031642913818, + "learning_rate": 0.00014945487681382598, + "loss": 0.6081, + "step": 4019 + }, + { + "epoch": 1.2907368759030342, + "grad_norm": 0.8286342620849609, + "learning_rate": 0.00014942453474723935, + "loss": 0.4396, + "step": 4020 + }, + { + "epoch": 1.2910579547278858, + "grad_norm": 0.7982895374298096, + "learning_rate": 0.0001493941866584231, + "loss": 0.3624, + "step": 4021 + }, + { + "epoch": 1.291379033552737, + "grad_norm": 0.7270027995109558, + "learning_rate": 0.00014936383255107505, + "loss": 0.3002, + "step": 4022 + }, + { + "epoch": 1.2917001123775886, + "grad_norm": 0.838603138923645, + "learning_rate": 0.0001493334724288937, + "loss": 0.7561, + "step": 4023 + }, + { + "epoch": 1.2920211912024402, + "grad_norm": 1.2956212759017944, + "learning_rate": 0.0001493031062955784, + "loss": 0.6119, + "step": 4024 + }, + { + "epoch": 1.2923422700272917, + "grad_norm": 0.8435970544815063, + "learning_rate": 0.00014927273415482915, + "loss": 0.5659, + "step": 4025 + }, + { + "epoch": 1.2926633488521433, + "grad_norm": 0.8424569368362427, + "learning_rate": 0.00014924235601034672, + "loss": 0.5801, + "step": 4026 + }, + { + "epoch": 1.2929844276769948, + "grad_norm": 1.0110530853271484, + "learning_rate": 0.00014921197186583255, + "loss": 0.6738, + "step": 4027 + }, + { + "epoch": 1.2933055065018462, + "grad_norm": 0.7541561126708984, + "learning_rate": 0.0001491815817249889, + "loss": 0.5595, + "step": 4028 + }, + { + "epoch": 1.2936265853266977, + "grad_norm": 0.9687687158584595, + "learning_rate": 0.0001491511855915187, + "loss": 0.6474, + "step": 4029 + }, + { + "epoch": 1.2939476641515493, + "grad_norm": 0.9450034499168396, + "learning_rate": 0.00014912078346912563, + "loss": 0.5455, + "step": 4030 + }, + { + "epoch": 1.2942687429764006, + "grad_norm": 0.8591203689575195, + "learning_rate": 0.00014909037536151409, + "loss": 0.5695, + "step": 4031 + }, + { + "epoch": 1.2945898218012521, + "grad_norm": 1.0376840829849243, + "learning_rate": 0.0001490599612723892, + "loss": 0.7787, + "step": 4032 + }, + { + "epoch": 1.2949109006261037, + "grad_norm": 0.9652031064033508, + "learning_rate": 0.00014902954120545687, + "loss": 0.6709, + "step": 4033 + }, + { + "epoch": 1.2952319794509553, + "grad_norm": 0.8163144588470459, + "learning_rate": 0.00014899911516442365, + "loss": 0.6559, + "step": 4034 + }, + { + "epoch": 1.2955530582758068, + "grad_norm": 1.0613020658493042, + "learning_rate": 0.00014896868315299693, + "loss": 0.5966, + "step": 4035 + }, + { + "epoch": 1.2958741371006581, + "grad_norm": 0.9142425656318665, + "learning_rate": 0.00014893824517488464, + "loss": 0.6798, + "step": 4036 + }, + { + "epoch": 1.2961952159255097, + "grad_norm": 0.8986964225769043, + "learning_rate": 0.00014890780123379564, + "loss": 0.7447, + "step": 4037 + }, + { + "epoch": 1.2965162947503612, + "grad_norm": 1.133459448814392, + "learning_rate": 0.0001488773513334394, + "loss": 0.567, + "step": 4038 + }, + { + "epoch": 1.2968373735752128, + "grad_norm": 1.0823416709899902, + "learning_rate": 0.0001488468954775262, + "loss": 0.7199, + "step": 4039 + }, + { + "epoch": 1.297158452400064, + "grad_norm": 0.8796799182891846, + "learning_rate": 0.00014881643366976692, + "loss": 0.5962, + "step": 4040 + }, + { + "epoch": 1.2974795312249157, + "grad_norm": 1.0232266187667847, + "learning_rate": 0.0001487859659138733, + "loss": 0.7782, + "step": 4041 + }, + { + "epoch": 1.2978006100497672, + "grad_norm": 1.0571743249893188, + "learning_rate": 0.00014875549221355768, + "loss": 0.6525, + "step": 4042 + }, + { + "epoch": 1.2981216888746188, + "grad_norm": 1.7555642127990723, + "learning_rate": 0.00014872501257253323, + "loss": 0.8595, + "step": 4043 + }, + { + "epoch": 1.2984427676994703, + "grad_norm": 1.0583701133728027, + "learning_rate": 0.00014869452699451383, + "loss": 0.6666, + "step": 4044 + }, + { + "epoch": 1.2987638465243216, + "grad_norm": 1.1438575983047485, + "learning_rate": 0.000148664035483214, + "loss": 0.8125, + "step": 4045 + }, + { + "epoch": 1.2990849253491732, + "grad_norm": 1.0455180406570435, + "learning_rate": 0.00014863353804234905, + "loss": 0.5961, + "step": 4046 + }, + { + "epoch": 1.2994060041740247, + "grad_norm": 1.0730552673339844, + "learning_rate": 0.00014860303467563503, + "loss": 0.4922, + "step": 4047 + }, + { + "epoch": 1.2997270829988763, + "grad_norm": 1.0210821628570557, + "learning_rate": 0.00014857252538678865, + "loss": 0.6711, + "step": 4048 + }, + { + "epoch": 1.3000481618237276, + "grad_norm": 1.0794029235839844, + "learning_rate": 0.0001485420101795274, + "loss": 0.6992, + "step": 4049 + }, + { + "epoch": 1.3003692406485792, + "grad_norm": 1.1778931617736816, + "learning_rate": 0.00014851148905756947, + "loss": 0.5997, + "step": 4050 + }, + { + "epoch": 1.3006903194734307, + "grad_norm": 1.5340932607650757, + "learning_rate": 0.00014848096202463372, + "loss": 0.4667, + "step": 4051 + }, + { + "epoch": 1.3010113982982823, + "grad_norm": 1.405383825302124, + "learning_rate": 0.0001484504290844398, + "loss": 0.7285, + "step": 4052 + }, + { + "epoch": 1.3013324771231338, + "grad_norm": 0.9958581328392029, + "learning_rate": 0.00014841989024070809, + "loss": 0.651, + "step": 4053 + }, + { + "epoch": 1.3016535559479852, + "grad_norm": 1.1035085916519165, + "learning_rate": 0.00014838934549715963, + "loss": 0.6727, + "step": 4054 + }, + { + "epoch": 1.3019746347728367, + "grad_norm": 0.9837401509284973, + "learning_rate": 0.00014835879485751617, + "loss": 0.6048, + "step": 4055 + }, + { + "epoch": 1.3022957135976883, + "grad_norm": 1.033929467201233, + "learning_rate": 0.00014832823832550024, + "loss": 0.709, + "step": 4056 + }, + { + "epoch": 1.3026167924225398, + "grad_norm": 1.9119707345962524, + "learning_rate": 0.00014829767590483506, + "loss": 0.7885, + "step": 4057 + }, + { + "epoch": 1.3029378712473911, + "grad_norm": 1.1973304748535156, + "learning_rate": 0.0001482671075992446, + "loss": 0.6681, + "step": 4058 + }, + { + "epoch": 1.3032589500722427, + "grad_norm": 0.8669224977493286, + "learning_rate": 0.00014823653341245353, + "loss": 0.5783, + "step": 4059 + }, + { + "epoch": 1.3035800288970942, + "grad_norm": 1.0361067056655884, + "learning_rate": 0.00014820595334818712, + "loss": 0.7104, + "step": 4060 + }, + { + "epoch": 1.3039011077219458, + "grad_norm": 0.9638859033584595, + "learning_rate": 0.00014817536741017152, + "loss": 0.609, + "step": 4061 + }, + { + "epoch": 1.3042221865467973, + "grad_norm": 0.9824711084365845, + "learning_rate": 0.00014814477560213358, + "loss": 0.526, + "step": 4062 + }, + { + "epoch": 1.3045432653716487, + "grad_norm": 0.6022173762321472, + "learning_rate": 0.00014811417792780075, + "loss": 0.4001, + "step": 4063 + }, + { + "epoch": 1.3048643441965002, + "grad_norm": 0.7990144491195679, + "learning_rate": 0.00014808357439090127, + "loss": 0.4216, + "step": 4064 + }, + { + "epoch": 1.3051854230213518, + "grad_norm": 0.676643431186676, + "learning_rate": 0.00014805296499516407, + "loss": 0.7422, + "step": 4065 + }, + { + "epoch": 1.3055065018462033, + "grad_norm": 0.8250537514686584, + "learning_rate": 0.0001480223497443189, + "loss": 1.0526, + "step": 4066 + }, + { + "epoch": 1.3058275806710546, + "grad_norm": 1.0050231218338013, + "learning_rate": 0.00014799172864209608, + "loss": 0.6552, + "step": 4067 + }, + { + "epoch": 1.3061486594959062, + "grad_norm": 0.8684215545654297, + "learning_rate": 0.00014796110169222666, + "loss": 0.4027, + "step": 4068 + }, + { + "epoch": 1.3064697383207577, + "grad_norm": 0.89919513463974, + "learning_rate": 0.0001479304688984425, + "loss": 0.4916, + "step": 4069 + }, + { + "epoch": 1.3067908171456093, + "grad_norm": 0.8932663798332214, + "learning_rate": 0.00014789983026447612, + "loss": 0.2584, + "step": 4070 + }, + { + "epoch": 1.3071118959704608, + "grad_norm": 0.8465712666511536, + "learning_rate": 0.0001478691857940607, + "loss": 0.3093, + "step": 4071 + }, + { + "epoch": 1.3074329747953122, + "grad_norm": 0.7077059149742126, + "learning_rate": 0.00014783853549093018, + "loss": 0.4673, + "step": 4072 + }, + { + "epoch": 1.3077540536201637, + "grad_norm": 1.0644139051437378, + "learning_rate": 0.00014780787935881923, + "loss": 0.7199, + "step": 4073 + }, + { + "epoch": 1.3080751324450153, + "grad_norm": 0.7375410795211792, + "learning_rate": 0.0001477772174014632, + "loss": 0.5636, + "step": 4074 + }, + { + "epoch": 1.3083962112698668, + "grad_norm": 0.9191805720329285, + "learning_rate": 0.00014774654962259812, + "loss": 0.663, + "step": 4075 + }, + { + "epoch": 1.3087172900947182, + "grad_norm": 0.9933133721351624, + "learning_rate": 0.00014771587602596084, + "loss": 0.6267, + "step": 4076 + }, + { + "epoch": 1.3090383689195697, + "grad_norm": 0.7604119777679443, + "learning_rate": 0.0001476851966152888, + "loss": 0.5479, + "step": 4077 + }, + { + "epoch": 1.3093594477444213, + "grad_norm": 0.8366871476173401, + "learning_rate": 0.0001476545113943202, + "loss": 0.643, + "step": 4078 + }, + { + "epoch": 1.3096805265692728, + "grad_norm": 0.8511562943458557, + "learning_rate": 0.0001476238203667939, + "loss": 0.5903, + "step": 4079 + }, + { + "epoch": 1.3100016053941244, + "grad_norm": 0.9097726941108704, + "learning_rate": 0.0001475931235364496, + "loss": 0.6136, + "step": 4080 + }, + { + "epoch": 1.3103226842189757, + "grad_norm": 1.0176063776016235, + "learning_rate": 0.00014756242090702756, + "loss": 0.6633, + "step": 4081 + }, + { + "epoch": 1.3106437630438272, + "grad_norm": 1.1305146217346191, + "learning_rate": 0.00014753171248226875, + "loss": 0.7559, + "step": 4082 + }, + { + "epoch": 1.3109648418686788, + "grad_norm": 0.7558575868606567, + "learning_rate": 0.00014750099826591498, + "loss": 0.5253, + "step": 4083 + }, + { + "epoch": 1.3112859206935303, + "grad_norm": 0.850307047367096, + "learning_rate": 0.00014747027826170867, + "loss": 0.6287, + "step": 4084 + }, + { + "epoch": 1.3116069995183817, + "grad_norm": 1.0060899257659912, + "learning_rate": 0.00014743955247339293, + "loss": 0.7239, + "step": 4085 + }, + { + "epoch": 1.3119280783432332, + "grad_norm": 0.9706814885139465, + "learning_rate": 0.0001474088209047116, + "loss": 0.6527, + "step": 4086 + }, + { + "epoch": 1.3122491571680848, + "grad_norm": 0.847439706325531, + "learning_rate": 0.00014737808355940932, + "loss": 0.6468, + "step": 4087 + }, + { + "epoch": 1.3125702359929363, + "grad_norm": 0.9411894679069519, + "learning_rate": 0.0001473473404412312, + "loss": 0.6667, + "step": 4088 + }, + { + "epoch": 1.3128913148177879, + "grad_norm": 1.1132397651672363, + "learning_rate": 0.00014731659155392332, + "loss": 0.5735, + "step": 4089 + }, + { + "epoch": 1.3132123936426392, + "grad_norm": 1.0457525253295898, + "learning_rate": 0.00014728583690123224, + "loss": 0.6922, + "step": 4090 + }, + { + "epoch": 1.3135334724674907, + "grad_norm": 0.8525384068489075, + "learning_rate": 0.00014725507648690543, + "loss": 0.634, + "step": 4091 + }, + { + "epoch": 1.3138545512923423, + "grad_norm": 0.7392247319221497, + "learning_rate": 0.00014722431031469083, + "loss": 0.491, + "step": 4092 + }, + { + "epoch": 1.3141756301171938, + "grad_norm": 1.0142507553100586, + "learning_rate": 0.0001471935383883373, + "loss": 0.7623, + "step": 4093 + }, + { + "epoch": 1.3144967089420452, + "grad_norm": 1.3004919290542603, + "learning_rate": 0.00014716276071159422, + "loss": 0.7285, + "step": 4094 + }, + { + "epoch": 1.3148177877668967, + "grad_norm": 1.129610538482666, + "learning_rate": 0.00014713197728821183, + "loss": 0.7317, + "step": 4095 + }, + { + "epoch": 1.3151388665917483, + "grad_norm": 0.9293155670166016, + "learning_rate": 0.000147101188121941, + "loss": 0.6089, + "step": 4096 + }, + { + "epoch": 1.3154599454165998, + "grad_norm": 0.9089670777320862, + "learning_rate": 0.0001470703932165333, + "loss": 0.6274, + "step": 4097 + }, + { + "epoch": 1.3157810242414514, + "grad_norm": 1.2431045770645142, + "learning_rate": 0.0001470395925757409, + "loss": 0.6852, + "step": 4098 + }, + { + "epoch": 1.3161021030663027, + "grad_norm": 0.9798834919929504, + "learning_rate": 0.00014700878620331684, + "loss": 0.7175, + "step": 4099 + }, + { + "epoch": 1.3164231818911543, + "grad_norm": 0.9327998757362366, + "learning_rate": 0.0001469779741030148, + "loss": 0.6003, + "step": 4100 + }, + { + "epoch": 1.3167442607160058, + "grad_norm": 1.2201324701309204, + "learning_rate": 0.00014694715627858908, + "loss": 0.5154, + "step": 4101 + }, + { + "epoch": 1.3170653395408574, + "grad_norm": 0.912707507610321, + "learning_rate": 0.0001469163327337948, + "loss": 0.4929, + "step": 4102 + }, + { + "epoch": 1.3173864183657087, + "grad_norm": 1.2306265830993652, + "learning_rate": 0.0001468855034723877, + "loss": 0.6194, + "step": 4103 + }, + { + "epoch": 1.3177074971905602, + "grad_norm": 1.114705204963684, + "learning_rate": 0.00014685466849812418, + "loss": 0.6703, + "step": 4104 + }, + { + "epoch": 1.3180285760154118, + "grad_norm": 0.8249489068984985, + "learning_rate": 0.00014682382781476146, + "loss": 0.533, + "step": 4105 + }, + { + "epoch": 1.3183496548402633, + "grad_norm": 1.0677366256713867, + "learning_rate": 0.00014679298142605734, + "loss": 0.6168, + "step": 4106 + }, + { + "epoch": 1.318670733665115, + "grad_norm": 0.7703533172607422, + "learning_rate": 0.0001467621293357704, + "loss": 0.4699, + "step": 4107 + }, + { + "epoch": 1.3189918124899662, + "grad_norm": 1.1350317001342773, + "learning_rate": 0.0001467312715476598, + "loss": 0.5877, + "step": 4108 + }, + { + "epoch": 1.3193128913148178, + "grad_norm": 0.9653196930885315, + "learning_rate": 0.00014670040806548555, + "loss": 0.5712, + "step": 4109 + }, + { + "epoch": 1.3196339701396693, + "grad_norm": 1.1715449094772339, + "learning_rate": 0.0001466695388930082, + "loss": 0.5182, + "step": 4110 + }, + { + "epoch": 1.3199550489645209, + "grad_norm": 0.7922108769416809, + "learning_rate": 0.00014663866403398913, + "loss": 0.5427, + "step": 4111 + }, + { + "epoch": 1.3202761277893722, + "grad_norm": 0.9558143615722656, + "learning_rate": 0.0001466077834921903, + "loss": 0.5072, + "step": 4112 + }, + { + "epoch": 1.3205972066142238, + "grad_norm": 0.6734508872032166, + "learning_rate": 0.00014657689727137443, + "loss": 0.4252, + "step": 4113 + }, + { + "epoch": 1.3209182854390753, + "grad_norm": 0.826819658279419, + "learning_rate": 0.0001465460053753049, + "loss": 0.4521, + "step": 4114 + }, + { + "epoch": 1.3212393642639269, + "grad_norm": 0.8778970241546631, + "learning_rate": 0.00014651510780774583, + "loss": 0.5193, + "step": 4115 + }, + { + "epoch": 1.3215604430887784, + "grad_norm": 0.7205748558044434, + "learning_rate": 0.000146484204572462, + "loss": 0.8583, + "step": 4116 + }, + { + "epoch": 1.3218815219136297, + "grad_norm": 0.5651400685310364, + "learning_rate": 0.0001464532956732188, + "loss": 0.4929, + "step": 4117 + }, + { + "epoch": 1.3222026007384813, + "grad_norm": 0.8473796248435974, + "learning_rate": 0.0001464223811137824, + "loss": 0.5065, + "step": 4118 + }, + { + "epoch": 1.3225236795633328, + "grad_norm": 0.8395222425460815, + "learning_rate": 0.0001463914608979197, + "loss": 0.63, + "step": 4119 + }, + { + "epoch": 1.3228447583881842, + "grad_norm": 0.8078770041465759, + "learning_rate": 0.00014636053502939823, + "loss": 0.3171, + "step": 4120 + }, + { + "epoch": 1.3231658372130357, + "grad_norm": 0.7320200204849243, + "learning_rate": 0.00014632960351198618, + "loss": 0.4024, + "step": 4121 + }, + { + "epoch": 1.3234869160378873, + "grad_norm": 0.9218945503234863, + "learning_rate": 0.00014629866634945248, + "loss": 0.3652, + "step": 4122 + }, + { + "epoch": 1.3238079948627388, + "grad_norm": 0.6042816042900085, + "learning_rate": 0.0001462677235455667, + "loss": 0.2944, + "step": 4123 + }, + { + "epoch": 1.3241290736875904, + "grad_norm": 1.0619382858276367, + "learning_rate": 0.00014623677510409918, + "loss": 0.7914, + "step": 4124 + }, + { + "epoch": 1.324450152512442, + "grad_norm": 1.0162999629974365, + "learning_rate": 0.00014620582102882089, + "loss": 0.7339, + "step": 4125 + }, + { + "epoch": 1.3247712313372932, + "grad_norm": 1.0029937028884888, + "learning_rate": 0.00014617486132350343, + "loss": 0.63, + "step": 4126 + }, + { + "epoch": 1.3250923101621448, + "grad_norm": 0.9867237210273743, + "learning_rate": 0.00014614389599191917, + "loss": 0.7598, + "step": 4127 + }, + { + "epoch": 1.3254133889869963, + "grad_norm": 0.8086850643157959, + "learning_rate": 0.00014611292503784117, + "loss": 0.5079, + "step": 4128 + }, + { + "epoch": 1.3257344678118477, + "grad_norm": 0.8222152590751648, + "learning_rate": 0.0001460819484650431, + "loss": 0.551, + "step": 4129 + }, + { + "epoch": 1.3260555466366992, + "grad_norm": 0.84171462059021, + "learning_rate": 0.0001460509662772994, + "loss": 0.6227, + "step": 4130 + }, + { + "epoch": 1.3263766254615508, + "grad_norm": 0.8350204825401306, + "learning_rate": 0.00014601997847838518, + "loss": 0.626, + "step": 4131 + }, + { + "epoch": 1.3266977042864023, + "grad_norm": 1.4779672622680664, + "learning_rate": 0.00014598898507207615, + "loss": 0.6658, + "step": 4132 + }, + { + "epoch": 1.3270187831112539, + "grad_norm": 0.7730790376663208, + "learning_rate": 0.00014595798606214882, + "loss": 0.5676, + "step": 4133 + }, + { + "epoch": 1.3273398619361054, + "grad_norm": 1.5839117765426636, + "learning_rate": 0.00014592698145238028, + "loss": 0.7933, + "step": 4134 + }, + { + "epoch": 1.3276609407609568, + "grad_norm": 0.9410939812660217, + "learning_rate": 0.00014589597124654833, + "loss": 0.6305, + "step": 4135 + }, + { + "epoch": 1.3279820195858083, + "grad_norm": 1.0541006326675415, + "learning_rate": 0.00014586495544843152, + "loss": 0.7623, + "step": 4136 + }, + { + "epoch": 1.3283030984106599, + "grad_norm": 0.8780885934829712, + "learning_rate": 0.000145833934061809, + "loss": 0.6705, + "step": 4137 + }, + { + "epoch": 1.3286241772355112, + "grad_norm": 0.790573239326477, + "learning_rate": 0.00014580290709046066, + "loss": 0.6865, + "step": 4138 + }, + { + "epoch": 1.3289452560603627, + "grad_norm": 0.7406685948371887, + "learning_rate": 0.000145771874538167, + "loss": 0.4454, + "step": 4139 + }, + { + "epoch": 1.3292663348852143, + "grad_norm": 1.0333536863327026, + "learning_rate": 0.0001457408364087093, + "loss": 0.7394, + "step": 4140 + }, + { + "epoch": 1.3295874137100658, + "grad_norm": 0.9966391324996948, + "learning_rate": 0.00014570979270586945, + "loss": 0.5846, + "step": 4141 + }, + { + "epoch": 1.3299084925349174, + "grad_norm": 1.0538800954818726, + "learning_rate": 0.00014567874343342997, + "loss": 0.7812, + "step": 4142 + }, + { + "epoch": 1.330229571359769, + "grad_norm": 0.90020352602005, + "learning_rate": 0.00014564768859517418, + "loss": 0.62, + "step": 4143 + }, + { + "epoch": 1.3305506501846203, + "grad_norm": 1.0670877695083618, + "learning_rate": 0.00014561662819488597, + "loss": 0.6715, + "step": 4144 + }, + { + "epoch": 1.3308717290094718, + "grad_norm": 1.155572772026062, + "learning_rate": 0.00014558556223635003, + "loss": 0.6894, + "step": 4145 + }, + { + "epoch": 1.3311928078343234, + "grad_norm": 1.7336785793304443, + "learning_rate": 0.00014555449072335157, + "loss": 0.8215, + "step": 4146 + }, + { + "epoch": 1.3315138866591747, + "grad_norm": 1.0733205080032349, + "learning_rate": 0.00014552341365967658, + "loss": 0.7403, + "step": 4147 + }, + { + "epoch": 1.3318349654840262, + "grad_norm": 0.9724444150924683, + "learning_rate": 0.00014549233104911178, + "loss": 0.6041, + "step": 4148 + }, + { + "epoch": 1.3321560443088778, + "grad_norm": 1.006319522857666, + "learning_rate": 0.0001454612428954444, + "loss": 0.6566, + "step": 4149 + }, + { + "epoch": 1.3324771231337293, + "grad_norm": 0.736380398273468, + "learning_rate": 0.00014543014920246247, + "loss": 0.5568, + "step": 4150 + }, + { + "epoch": 1.332798201958581, + "grad_norm": 1.084572196006775, + "learning_rate": 0.00014539904997395468, + "loss": 0.5559, + "step": 4151 + }, + { + "epoch": 1.3331192807834324, + "grad_norm": 0.8436475396156311, + "learning_rate": 0.00014536794521371037, + "loss": 0.5434, + "step": 4152 + }, + { + "epoch": 1.3334403596082838, + "grad_norm": 1.179206371307373, + "learning_rate": 0.00014533683492551952, + "loss": 0.7242, + "step": 4153 + }, + { + "epoch": 1.3337614384331353, + "grad_norm": 1.3009215593338013, + "learning_rate": 0.0001453057191131729, + "loss": 0.6785, + "step": 4154 + }, + { + "epoch": 1.3340825172579869, + "grad_norm": 1.1896605491638184, + "learning_rate": 0.0001452745977804618, + "loss": 0.6612, + "step": 4155 + }, + { + "epoch": 1.3344035960828382, + "grad_norm": 1.1616253852844238, + "learning_rate": 0.00014524347093117828, + "loss": 0.6739, + "step": 4156 + }, + { + "epoch": 1.3347246749076898, + "grad_norm": 0.7783265113830566, + "learning_rate": 0.00014521233856911508, + "loss": 0.5076, + "step": 4157 + }, + { + "epoch": 1.3350457537325413, + "grad_norm": 1.7753092050552368, + "learning_rate": 0.00014518120069806557, + "loss": 0.5744, + "step": 4158 + }, + { + "epoch": 1.3353668325573929, + "grad_norm": 1.8857307434082031, + "learning_rate": 0.00014515005732182383, + "loss": 0.8165, + "step": 4159 + }, + { + "epoch": 1.3356879113822444, + "grad_norm": 0.8890649676322937, + "learning_rate": 0.00014511890844418453, + "loss": 0.3913, + "step": 4160 + }, + { + "epoch": 1.336008990207096, + "grad_norm": 0.691440999507904, + "learning_rate": 0.00014508775406894307, + "loss": 0.495, + "step": 4161 + }, + { + "epoch": 1.3363300690319473, + "grad_norm": 1.066584587097168, + "learning_rate": 0.0001450565941998956, + "loss": 0.6014, + "step": 4162 + }, + { + "epoch": 1.3366511478567988, + "grad_norm": 1.0590673685073853, + "learning_rate": 0.00014502542884083875, + "loss": 0.4447, + "step": 4163 + }, + { + "epoch": 1.3369722266816504, + "grad_norm": 0.7496399283409119, + "learning_rate": 0.00014499425799557, + "loss": 0.3962, + "step": 4164 + }, + { + "epoch": 1.3372933055065017, + "grad_norm": 0.5220407843589783, + "learning_rate": 0.0001449630816678874, + "loss": 0.4822, + "step": 4165 + }, + { + "epoch": 1.3376143843313533, + "grad_norm": 0.6849836111068726, + "learning_rate": 0.00014493189986158965, + "loss": 0.875, + "step": 4166 + }, + { + "epoch": 1.3379354631562048, + "grad_norm": 0.9034988284111023, + "learning_rate": 0.00014490071258047623, + "loss": 0.7436, + "step": 4167 + }, + { + "epoch": 1.3382565419810564, + "grad_norm": 0.7391397356987, + "learning_rate": 0.0001448695198283472, + "loss": 0.5641, + "step": 4168 + }, + { + "epoch": 1.338577620805908, + "grad_norm": 0.5824310779571533, + "learning_rate": 0.00014483832160900326, + "loss": 0.3517, + "step": 4169 + }, + { + "epoch": 1.3388986996307595, + "grad_norm": 0.7779441475868225, + "learning_rate": 0.0001448071179262458, + "loss": 0.4648, + "step": 4170 + }, + { + "epoch": 1.3392197784556108, + "grad_norm": 0.7949094176292419, + "learning_rate": 0.00014477590878387696, + "loss": 0.3886, + "step": 4171 + }, + { + "epoch": 1.3395408572804623, + "grad_norm": 0.7168573141098022, + "learning_rate": 0.0001447446941856995, + "loss": 0.5573, + "step": 4172 + }, + { + "epoch": 1.339861936105314, + "grad_norm": 0.9068509340286255, + "learning_rate": 0.00014471347413551672, + "loss": 0.7493, + "step": 4173 + }, + { + "epoch": 1.3401830149301652, + "grad_norm": 0.9333740472793579, + "learning_rate": 0.00014468224863713278, + "loss": 0.8158, + "step": 4174 + }, + { + "epoch": 1.3405040937550168, + "grad_norm": 0.8705101609230042, + "learning_rate": 0.00014465101769435234, + "loss": 0.7667, + "step": 4175 + }, + { + "epoch": 1.3408251725798683, + "grad_norm": 0.9098716974258423, + "learning_rate": 0.00014461978131098088, + "loss": 0.6398, + "step": 4176 + }, + { + "epoch": 1.3411462514047199, + "grad_norm": 0.7872979044914246, + "learning_rate": 0.00014458853949082443, + "loss": 0.5239, + "step": 4177 + }, + { + "epoch": 1.3414673302295714, + "grad_norm": 0.6730207800865173, + "learning_rate": 0.00014455729223768966, + "loss": 0.4844, + "step": 4178 + }, + { + "epoch": 1.341788409054423, + "grad_norm": 0.7260086536407471, + "learning_rate": 0.00014452603955538397, + "loss": 0.6197, + "step": 4179 + }, + { + "epoch": 1.3421094878792743, + "grad_norm": 0.8892760276794434, + "learning_rate": 0.00014449478144771543, + "loss": 0.6995, + "step": 4180 + }, + { + "epoch": 1.3424305667041259, + "grad_norm": 1.4739090204238892, + "learning_rate": 0.00014446351791849276, + "loss": 0.7409, + "step": 4181 + }, + { + "epoch": 1.3427516455289774, + "grad_norm": 0.7913824915885925, + "learning_rate": 0.0001444322489715253, + "loss": 0.6096, + "step": 4182 + }, + { + "epoch": 1.3430727243538287, + "grad_norm": 0.9372351169586182, + "learning_rate": 0.00014440097461062307, + "loss": 0.5211, + "step": 4183 + }, + { + "epoch": 1.3433938031786803, + "grad_norm": 1.0964716672897339, + "learning_rate": 0.00014436969483959676, + "loss": 0.7583, + "step": 4184 + }, + { + "epoch": 1.3437148820035318, + "grad_norm": 1.6157090663909912, + "learning_rate": 0.00014433840966225772, + "loss": 0.6351, + "step": 4185 + }, + { + "epoch": 1.3440359608283834, + "grad_norm": 0.8620708584785461, + "learning_rate": 0.00014430711908241798, + "loss": 0.5351, + "step": 4186 + }, + { + "epoch": 1.344357039653235, + "grad_norm": 0.814059317111969, + "learning_rate": 0.0001442758231038902, + "loss": 0.6808, + "step": 4187 + }, + { + "epoch": 1.3446781184780865, + "grad_norm": 1.1919108629226685, + "learning_rate": 0.0001442445217304876, + "loss": 0.8476, + "step": 4188 + }, + { + "epoch": 1.3449991973029378, + "grad_norm": 1.3358224630355835, + "learning_rate": 0.00014421321496602428, + "loss": 0.8407, + "step": 4189 + }, + { + "epoch": 1.3453202761277894, + "grad_norm": 1.180011510848999, + "learning_rate": 0.00014418190281431482, + "loss": 0.5575, + "step": 4190 + }, + { + "epoch": 1.345641354952641, + "grad_norm": 1.2531673908233643, + "learning_rate": 0.00014415058527917452, + "loss": 0.661, + "step": 4191 + }, + { + "epoch": 1.3459624337774923, + "grad_norm": 0.8697596192359924, + "learning_rate": 0.00014411926236441934, + "loss": 0.7263, + "step": 4192 + }, + { + "epoch": 1.3462835126023438, + "grad_norm": 1.6186376810073853, + "learning_rate": 0.00014408793407386588, + "loss": 0.6973, + "step": 4193 + }, + { + "epoch": 1.3466045914271954, + "grad_norm": 0.9928228259086609, + "learning_rate": 0.00014405660041133132, + "loss": 0.5972, + "step": 4194 + }, + { + "epoch": 1.346925670252047, + "grad_norm": 1.2978379726409912, + "learning_rate": 0.00014402526138063373, + "loss": 0.5707, + "step": 4195 + }, + { + "epoch": 1.3472467490768985, + "grad_norm": 1.1947479248046875, + "learning_rate": 0.00014399391698559152, + "loss": 0.7005, + "step": 4196 + }, + { + "epoch": 1.34756782790175, + "grad_norm": 1.2066709995269775, + "learning_rate": 0.000143962567230024, + "loss": 0.6688, + "step": 4197 + }, + { + "epoch": 1.3478889067266013, + "grad_norm": 0.8478728532791138, + "learning_rate": 0.000143931212117751, + "loss": 0.5476, + "step": 4198 + }, + { + "epoch": 1.3482099855514529, + "grad_norm": 1.081978678703308, + "learning_rate": 0.00014389985165259308, + "loss": 0.684, + "step": 4199 + }, + { + "epoch": 1.3485310643763044, + "grad_norm": 1.2415368556976318, + "learning_rate": 0.0001438684858383714, + "loss": 0.7176, + "step": 4200 + }, + { + "epoch": 1.3488521432011558, + "grad_norm": 1.0181922912597656, + "learning_rate": 0.00014383711467890774, + "loss": 0.6968, + "step": 4201 + }, + { + "epoch": 1.3491732220260073, + "grad_norm": 1.017606258392334, + "learning_rate": 0.00014380573817802467, + "loss": 0.474, + "step": 4202 + }, + { + "epoch": 1.3494943008508589, + "grad_norm": 1.2191901206970215, + "learning_rate": 0.00014377435633954527, + "loss": 0.5747, + "step": 4203 + }, + { + "epoch": 1.3498153796757104, + "grad_norm": 0.7296000719070435, + "learning_rate": 0.00014374296916729336, + "loss": 0.4687, + "step": 4204 + }, + { + "epoch": 1.350136458500562, + "grad_norm": 1.0375962257385254, + "learning_rate": 0.0001437115766650933, + "loss": 0.5935, + "step": 4205 + }, + { + "epoch": 1.3504575373254135, + "grad_norm": 0.8629053831100464, + "learning_rate": 0.00014368017883677024, + "loss": 0.5089, + "step": 4206 + }, + { + "epoch": 1.3507786161502648, + "grad_norm": 1.310308575630188, + "learning_rate": 0.0001436487756861499, + "loss": 0.7386, + "step": 4207 + }, + { + "epoch": 1.3510996949751164, + "grad_norm": 0.7340025901794434, + "learning_rate": 0.0001436173672170586, + "loss": 0.492, + "step": 4208 + }, + { + "epoch": 1.351420773799968, + "grad_norm": 0.8930183053016663, + "learning_rate": 0.00014358595343332342, + "loss": 0.4347, + "step": 4209 + }, + { + "epoch": 1.3517418526248193, + "grad_norm": 1.247145175933838, + "learning_rate": 0.00014355453433877204, + "loss": 0.5153, + "step": 4210 + }, + { + "epoch": 1.3520629314496708, + "grad_norm": 1.237648844718933, + "learning_rate": 0.00014352310993723277, + "loss": 0.5992, + "step": 4211 + }, + { + "epoch": 1.3523840102745224, + "grad_norm": 0.776263952255249, + "learning_rate": 0.00014349168023253456, + "loss": 0.5426, + "step": 4212 + }, + { + "epoch": 1.352705089099374, + "grad_norm": 1.0824544429779053, + "learning_rate": 0.00014346024522850703, + "loss": 0.5301, + "step": 4213 + }, + { + "epoch": 1.3530261679242255, + "grad_norm": 0.7915077209472656, + "learning_rate": 0.00014342880492898048, + "loss": 0.456, + "step": 4214 + }, + { + "epoch": 1.353347246749077, + "grad_norm": 0.845167338848114, + "learning_rate": 0.00014339735933778576, + "loss": 0.8329, + "step": 4215 + }, + { + "epoch": 1.3536683255739284, + "grad_norm": 0.7723715901374817, + "learning_rate": 0.00014336590845875446, + "loss": 0.914, + "step": 4216 + }, + { + "epoch": 1.35398940439878, + "grad_norm": 0.7588359713554382, + "learning_rate": 0.00014333445229571873, + "loss": 0.7267, + "step": 4217 + }, + { + "epoch": 1.3543104832236315, + "grad_norm": 0.733163058757782, + "learning_rate": 0.00014330299085251144, + "loss": 0.486, + "step": 4218 + }, + { + "epoch": 1.3546315620484828, + "grad_norm": 0.6390459537506104, + "learning_rate": 0.00014327152413296608, + "loss": 0.3084, + "step": 4219 + }, + { + "epoch": 1.3549526408733343, + "grad_norm": 0.8938314318656921, + "learning_rate": 0.00014324005214091676, + "loss": 0.4455, + "step": 4220 + }, + { + "epoch": 1.3552737196981859, + "grad_norm": 1.0440847873687744, + "learning_rate": 0.00014320857488019824, + "loss": 0.4297, + "step": 4221 + }, + { + "epoch": 1.3555947985230374, + "grad_norm": 0.8476107716560364, + "learning_rate": 0.00014317709235464593, + "loss": 0.5257, + "step": 4222 + }, + { + "epoch": 1.355915877347889, + "grad_norm": 0.9037560224533081, + "learning_rate": 0.0001431456045680959, + "loss": 0.6943, + "step": 4223 + }, + { + "epoch": 1.3562369561727405, + "grad_norm": 0.875007152557373, + "learning_rate": 0.00014311411152438482, + "loss": 0.6148, + "step": 4224 + }, + { + "epoch": 1.3565580349975919, + "grad_norm": 0.829646646976471, + "learning_rate": 0.00014308261322735005, + "loss": 0.7564, + "step": 4225 + }, + { + "epoch": 1.3568791138224434, + "grad_norm": 0.7148510217666626, + "learning_rate": 0.00014305110968082952, + "loss": 0.5596, + "step": 4226 + }, + { + "epoch": 1.357200192647295, + "grad_norm": 0.7443501949310303, + "learning_rate": 0.00014301960088866186, + "loss": 0.6532, + "step": 4227 + }, + { + "epoch": 1.3575212714721463, + "grad_norm": 0.9788792729377747, + "learning_rate": 0.00014298808685468635, + "loss": 0.5607, + "step": 4228 + }, + { + "epoch": 1.3578423502969978, + "grad_norm": 0.7856297492980957, + "learning_rate": 0.00014295656758274284, + "loss": 0.5779, + "step": 4229 + }, + { + "epoch": 1.3581634291218494, + "grad_norm": 0.7464038133621216, + "learning_rate": 0.00014292504307667186, + "loss": 0.4625, + "step": 4230 + }, + { + "epoch": 1.358484507946701, + "grad_norm": 0.958886981010437, + "learning_rate": 0.0001428935133403146, + "loss": 0.6014, + "step": 4231 + }, + { + "epoch": 1.3588055867715525, + "grad_norm": 1.1012500524520874, + "learning_rate": 0.00014286197837751286, + "loss": 0.9194, + "step": 4232 + }, + { + "epoch": 1.359126665596404, + "grad_norm": 0.8240073323249817, + "learning_rate": 0.00014283043819210905, + "loss": 0.5618, + "step": 4233 + }, + { + "epoch": 1.3594477444212554, + "grad_norm": 0.8680970072746277, + "learning_rate": 0.00014279889278794627, + "loss": 0.6105, + "step": 4234 + }, + { + "epoch": 1.359768823246107, + "grad_norm": 1.032063364982605, + "learning_rate": 0.00014276734216886821, + "loss": 0.7325, + "step": 4235 + }, + { + "epoch": 1.3600899020709585, + "grad_norm": 0.9061574935913086, + "learning_rate": 0.00014273578633871927, + "loss": 0.6026, + "step": 4236 + }, + { + "epoch": 1.3604109808958098, + "grad_norm": 0.7440208196640015, + "learning_rate": 0.00014270422530134432, + "loss": 0.552, + "step": 4237 + }, + { + "epoch": 1.3607320597206614, + "grad_norm": 1.1810888051986694, + "learning_rate": 0.00014267265906058914, + "loss": 0.7499, + "step": 4238 + }, + { + "epoch": 1.361053138545513, + "grad_norm": 0.903121292591095, + "learning_rate": 0.0001426410876202999, + "loss": 0.6064, + "step": 4239 + }, + { + "epoch": 1.3613742173703645, + "grad_norm": 0.9991443157196045, + "learning_rate": 0.00014260951098432343, + "loss": 0.661, + "step": 4240 + }, + { + "epoch": 1.361695296195216, + "grad_norm": 0.9029455184936523, + "learning_rate": 0.00014257792915650728, + "loss": 0.5712, + "step": 4241 + }, + { + "epoch": 1.3620163750200676, + "grad_norm": 0.899472713470459, + "learning_rate": 0.00014254634214069963, + "loss": 0.6596, + "step": 4242 + }, + { + "epoch": 1.3623374538449189, + "grad_norm": 0.7812873125076294, + "learning_rate": 0.00014251474994074928, + "loss": 0.5881, + "step": 4243 + }, + { + "epoch": 1.3626585326697704, + "grad_norm": 1.202508807182312, + "learning_rate": 0.00014248315256050557, + "loss": 0.6451, + "step": 4244 + }, + { + "epoch": 1.362979611494622, + "grad_norm": 0.7961817979812622, + "learning_rate": 0.0001424515500038186, + "loss": 0.4156, + "step": 4245 + }, + { + "epoch": 1.3633006903194733, + "grad_norm": 0.9045254588127136, + "learning_rate": 0.00014241994227453901, + "loss": 0.5355, + "step": 4246 + }, + { + "epoch": 1.3636217691443249, + "grad_norm": 1.0353204011917114, + "learning_rate": 0.00014238832937651816, + "loss": 0.6559, + "step": 4247 + }, + { + "epoch": 1.3639428479691764, + "grad_norm": 0.866054117679596, + "learning_rate": 0.00014235671131360798, + "loss": 0.562, + "step": 4248 + }, + { + "epoch": 1.364263926794028, + "grad_norm": 1.5949488878250122, + "learning_rate": 0.00014232508808966098, + "loss": 0.7606, + "step": 4249 + }, + { + "epoch": 1.3645850056188795, + "grad_norm": 1.053368091583252, + "learning_rate": 0.00014229345970853032, + "loss": 0.6, + "step": 4250 + }, + { + "epoch": 1.364906084443731, + "grad_norm": 0.8603706359863281, + "learning_rate": 0.00014226182617406996, + "loss": 0.5477, + "step": 4251 + }, + { + "epoch": 1.3652271632685824, + "grad_norm": 2.3181567192077637, + "learning_rate": 0.00014223018749013423, + "loss": 0.6106, + "step": 4252 + }, + { + "epoch": 1.365548242093434, + "grad_norm": 0.8432567715644836, + "learning_rate": 0.0001421985436605783, + "loss": 0.4574, + "step": 4253 + }, + { + "epoch": 1.3658693209182855, + "grad_norm": 1.323652982711792, + "learning_rate": 0.0001421668946892578, + "loss": 0.7721, + "step": 4254 + }, + { + "epoch": 1.3661903997431368, + "grad_norm": 0.9810237288475037, + "learning_rate": 0.0001421352405800291, + "loss": 0.5758, + "step": 4255 + }, + { + "epoch": 1.3665114785679884, + "grad_norm": 1.0755305290222168, + "learning_rate": 0.00014210358133674912, + "loss": 0.5253, + "step": 4256 + }, + { + "epoch": 1.36683255739284, + "grad_norm": 1.2368508577346802, + "learning_rate": 0.00014207191696327548, + "loss": 0.5937, + "step": 4257 + }, + { + "epoch": 1.3671536362176915, + "grad_norm": 0.9989250302314758, + "learning_rate": 0.00014204024746346637, + "loss": 0.7571, + "step": 4258 + }, + { + "epoch": 1.367474715042543, + "grad_norm": 0.9901091456413269, + "learning_rate": 0.00014200857284118066, + "loss": 0.6444, + "step": 4259 + }, + { + "epoch": 1.3677957938673946, + "grad_norm": 0.8036117553710938, + "learning_rate": 0.00014197689310027772, + "loss": 0.4396, + "step": 4260 + }, + { + "epoch": 1.368116872692246, + "grad_norm": 1.049436330795288, + "learning_rate": 0.00014194520824461771, + "loss": 0.5589, + "step": 4261 + }, + { + "epoch": 1.3684379515170975, + "grad_norm": 0.8437821865081787, + "learning_rate": 0.00014191351827806133, + "loss": 0.5044, + "step": 4262 + }, + { + "epoch": 1.368759030341949, + "grad_norm": 1.1721175909042358, + "learning_rate": 0.00014188182320446985, + "loss": 0.4725, + "step": 4263 + }, + { + "epoch": 1.3690801091668003, + "grad_norm": 0.7921808958053589, + "learning_rate": 0.00014185012302770527, + "loss": 0.4147, + "step": 4264 + }, + { + "epoch": 1.369401187991652, + "grad_norm": 1.2697014808654785, + "learning_rate": 0.00014181841775163013, + "loss": 0.9084, + "step": 4265 + }, + { + "epoch": 1.3697222668165034, + "grad_norm": 0.8648690581321716, + "learning_rate": 0.0001417867073801077, + "loss": 0.9195, + "step": 4266 + }, + { + "epoch": 1.370043345641355, + "grad_norm": 0.7991724014282227, + "learning_rate": 0.00014175499191700167, + "loss": 0.7083, + "step": 4267 + }, + { + "epoch": 1.3703644244662065, + "grad_norm": 0.8012338876724243, + "learning_rate": 0.00014172327136617656, + "loss": 0.4684, + "step": 4268 + }, + { + "epoch": 1.3706855032910579, + "grad_norm": 0.9364102482795715, + "learning_rate": 0.00014169154573149737, + "loss": 0.383, + "step": 4269 + }, + { + "epoch": 1.3710065821159094, + "grad_norm": 1.1031969785690308, + "learning_rate": 0.0001416598150168298, + "loss": 0.5984, + "step": 4270 + }, + { + "epoch": 1.371327660940761, + "grad_norm": 1.1069051027297974, + "learning_rate": 0.00014162807922604012, + "loss": 0.7745, + "step": 4271 + }, + { + "epoch": 1.3716487397656125, + "grad_norm": 1.0749973058700562, + "learning_rate": 0.00014159633836299527, + "loss": 0.7569, + "step": 4272 + }, + { + "epoch": 1.3719698185904639, + "grad_norm": 0.8237097263336182, + "learning_rate": 0.0001415645924315628, + "loss": 0.7982, + "step": 4273 + }, + { + "epoch": 1.3722908974153154, + "grad_norm": 0.9842055439949036, + "learning_rate": 0.0001415328414356108, + "loss": 0.6246, + "step": 4274 + }, + { + "epoch": 1.372611976240167, + "grad_norm": 0.9688705205917358, + "learning_rate": 0.00014150108537900805, + "loss": 0.5753, + "step": 4275 + }, + { + "epoch": 1.3729330550650185, + "grad_norm": 0.6846550703048706, + "learning_rate": 0.00014146932426562392, + "loss": 0.5259, + "step": 4276 + }, + { + "epoch": 1.37325413388987, + "grad_norm": 1.078979730606079, + "learning_rate": 0.00014143755809932845, + "loss": 0.6775, + "step": 4277 + }, + { + "epoch": 1.3735752127147214, + "grad_norm": 0.8213169574737549, + "learning_rate": 0.00014140578688399218, + "loss": 0.4362, + "step": 4278 + }, + { + "epoch": 1.373896291539573, + "grad_norm": 1.00385320186615, + "learning_rate": 0.00014137401062348638, + "loss": 0.6659, + "step": 4279 + }, + { + "epoch": 1.3742173703644245, + "grad_norm": 0.8310264945030212, + "learning_rate": 0.0001413422293216829, + "loss": 0.8067, + "step": 4280 + }, + { + "epoch": 1.374538449189276, + "grad_norm": 0.888613224029541, + "learning_rate": 0.0001413104429824542, + "loss": 0.688, + "step": 4281 + }, + { + "epoch": 1.3748595280141274, + "grad_norm": 0.9263864755630493, + "learning_rate": 0.0001412786516096733, + "loss": 0.5974, + "step": 4282 + }, + { + "epoch": 1.375180606838979, + "grad_norm": 1.2327766418457031, + "learning_rate": 0.00014124685520721392, + "loss": 0.8498, + "step": 4283 + }, + { + "epoch": 1.3755016856638305, + "grad_norm": 0.7652939558029175, + "learning_rate": 0.00014121505377895038, + "loss": 0.4643, + "step": 4284 + }, + { + "epoch": 1.375822764488682, + "grad_norm": 0.8574872016906738, + "learning_rate": 0.00014118324732875748, + "loss": 0.5679, + "step": 4285 + }, + { + "epoch": 1.3761438433135336, + "grad_norm": 1.3594255447387695, + "learning_rate": 0.00014115143586051088, + "loss": 0.7465, + "step": 4286 + }, + { + "epoch": 1.376464922138385, + "grad_norm": 0.8461220264434814, + "learning_rate": 0.00014111961937808665, + "loss": 0.4636, + "step": 4287 + }, + { + "epoch": 1.3767860009632364, + "grad_norm": 0.9702261686325073, + "learning_rate": 0.0001410877978853615, + "loss": 0.6617, + "step": 4288 + }, + { + "epoch": 1.377107079788088, + "grad_norm": 0.8296035528182983, + "learning_rate": 0.0001410559713862128, + "loss": 0.5687, + "step": 4289 + }, + { + "epoch": 1.3774281586129395, + "grad_norm": 1.2875922918319702, + "learning_rate": 0.00014102413988451856, + "loss": 0.8027, + "step": 4290 + }, + { + "epoch": 1.3777492374377909, + "grad_norm": 0.8944180607795715, + "learning_rate": 0.00014099230338415728, + "loss": 0.6401, + "step": 4291 + }, + { + "epoch": 1.3780703162626424, + "grad_norm": 0.9776332378387451, + "learning_rate": 0.00014096046188900822, + "loss": 0.7287, + "step": 4292 + }, + { + "epoch": 1.378391395087494, + "grad_norm": 0.7893264293670654, + "learning_rate": 0.00014092861540295108, + "loss": 0.5128, + "step": 4293 + }, + { + "epoch": 1.3787124739123455, + "grad_norm": 1.7183669805526733, + "learning_rate": 0.0001408967639298663, + "loss": 0.5958, + "step": 4294 + }, + { + "epoch": 1.379033552737197, + "grad_norm": 1.1158559322357178, + "learning_rate": 0.00014086490747363493, + "loss": 0.8365, + "step": 4295 + }, + { + "epoch": 1.3793546315620484, + "grad_norm": 1.0233869552612305, + "learning_rate": 0.00014083304603813848, + "loss": 0.7533, + "step": 4296 + }, + { + "epoch": 1.3796757103869, + "grad_norm": 1.1017775535583496, + "learning_rate": 0.0001408011796272593, + "loss": 0.6726, + "step": 4297 + }, + { + "epoch": 1.3799967892117515, + "grad_norm": 0.8513362407684326, + "learning_rate": 0.0001407693082448801, + "loss": 0.5752, + "step": 4298 + }, + { + "epoch": 1.380317868036603, + "grad_norm": 1.0489625930786133, + "learning_rate": 0.00014073743189488435, + "loss": 0.5832, + "step": 4299 + }, + { + "epoch": 1.3806389468614544, + "grad_norm": 0.8657923340797424, + "learning_rate": 0.00014070555058115614, + "loss": 0.6029, + "step": 4300 + }, + { + "epoch": 1.380960025686306, + "grad_norm": 1.106916904449463, + "learning_rate": 0.00014067366430758004, + "loss": 0.655, + "step": 4301 + }, + { + "epoch": 1.3812811045111575, + "grad_norm": 0.9051868915557861, + "learning_rate": 0.00014064177307804133, + "loss": 0.6185, + "step": 4302 + }, + { + "epoch": 1.381602183336009, + "grad_norm": 0.950446605682373, + "learning_rate": 0.00014060987689642581, + "loss": 0.6738, + "step": 4303 + }, + { + "epoch": 1.3819232621608606, + "grad_norm": 1.1080812215805054, + "learning_rate": 0.00014057797576662, + "loss": 0.6398, + "step": 4304 + }, + { + "epoch": 1.382244340985712, + "grad_norm": 0.6714284420013428, + "learning_rate": 0.00014054606969251095, + "loss": 0.4397, + "step": 4305 + }, + { + "epoch": 1.3825654198105635, + "grad_norm": 0.9164280891418457, + "learning_rate": 0.00014051415867798628, + "loss": 0.5544, + "step": 4306 + }, + { + "epoch": 1.382886498635415, + "grad_norm": 0.951714277267456, + "learning_rate": 0.00014048224272693424, + "loss": 0.6772, + "step": 4307 + }, + { + "epoch": 1.3832075774602666, + "grad_norm": 0.9855425357818604, + "learning_rate": 0.00014045032184324373, + "loss": 0.6, + "step": 4308 + }, + { + "epoch": 1.383528656285118, + "grad_norm": 0.9370409250259399, + "learning_rate": 0.00014041839603080422, + "loss": 0.5603, + "step": 4309 + }, + { + "epoch": 1.3838497351099694, + "grad_norm": 0.9768514633178711, + "learning_rate": 0.00014038646529350579, + "loss": 0.5308, + "step": 4310 + }, + { + "epoch": 1.384170813934821, + "grad_norm": 1.384358525276184, + "learning_rate": 0.00014035452963523902, + "loss": 0.6596, + "step": 4311 + }, + { + "epoch": 1.3844918927596725, + "grad_norm": 1.2001529932022095, + "learning_rate": 0.00014032258905989522, + "loss": 0.4874, + "step": 4312 + }, + { + "epoch": 1.384812971584524, + "grad_norm": 0.5518187284469604, + "learning_rate": 0.00014029064357136628, + "loss": 0.3581, + "step": 4313 + }, + { + "epoch": 1.3851340504093754, + "grad_norm": 0.7382297515869141, + "learning_rate": 0.00014025869317354462, + "loss": 0.4, + "step": 4314 + }, + { + "epoch": 1.385455129234227, + "grad_norm": 0.8723937273025513, + "learning_rate": 0.00014022673787032332, + "loss": 0.6065, + "step": 4315 + }, + { + "epoch": 1.3857762080590785, + "grad_norm": 0.8456537127494812, + "learning_rate": 0.00014019477766559604, + "loss": 0.5173, + "step": 4316 + }, + { + "epoch": 1.38609728688393, + "grad_norm": 0.8314757347106934, + "learning_rate": 0.000140162812563257, + "loss": 0.5405, + "step": 4317 + }, + { + "epoch": 1.3864183657087814, + "grad_norm": 0.9915850162506104, + "learning_rate": 0.00014013084256720107, + "loss": 0.4016, + "step": 4318 + }, + { + "epoch": 1.386739444533633, + "grad_norm": 0.6893324851989746, + "learning_rate": 0.00014009886768132375, + "loss": 0.2416, + "step": 4319 + }, + { + "epoch": 1.3870605233584845, + "grad_norm": 0.7434089183807373, + "learning_rate": 0.000140066887909521, + "loss": 0.2607, + "step": 4320 + }, + { + "epoch": 1.387381602183336, + "grad_norm": 0.7426462769508362, + "learning_rate": 0.00014003490325568954, + "loss": 0.6526, + "step": 4321 + }, + { + "epoch": 1.3877026810081876, + "grad_norm": 1.0482592582702637, + "learning_rate": 0.00014000291372372647, + "loss": 0.6866, + "step": 4322 + }, + { + "epoch": 1.388023759833039, + "grad_norm": 0.7865654826164246, + "learning_rate": 0.00013997091931752977, + "loss": 0.7286, + "step": 4323 + }, + { + "epoch": 1.3883448386578905, + "grad_norm": 0.7435041666030884, + "learning_rate": 0.00013993892004099777, + "loss": 0.6065, + "step": 4324 + }, + { + "epoch": 1.388665917482742, + "grad_norm": 1.0697741508483887, + "learning_rate": 0.00013990691589802954, + "loss": 0.7525, + "step": 4325 + }, + { + "epoch": 1.3889869963075936, + "grad_norm": 0.6864974498748779, + "learning_rate": 0.00013987490689252463, + "loss": 0.5124, + "step": 4326 + }, + { + "epoch": 1.389308075132445, + "grad_norm": 1.0872437953948975, + "learning_rate": 0.00013984289302838328, + "loss": 0.7208, + "step": 4327 + }, + { + "epoch": 1.3896291539572965, + "grad_norm": 0.8134937286376953, + "learning_rate": 0.00013981087430950628, + "loss": 0.5676, + "step": 4328 + }, + { + "epoch": 1.389950232782148, + "grad_norm": 0.8556815981864929, + "learning_rate": 0.000139778850739795, + "loss": 0.6281, + "step": 4329 + }, + { + "epoch": 1.3902713116069996, + "grad_norm": 0.8196367025375366, + "learning_rate": 0.0001397468223231514, + "loss": 0.5989, + "step": 4330 + }, + { + "epoch": 1.3905923904318511, + "grad_norm": 0.7392389178276062, + "learning_rate": 0.00013971478906347806, + "loss": 0.5696, + "step": 4331 + }, + { + "epoch": 1.3909134692567025, + "grad_norm": 0.8934568166732788, + "learning_rate": 0.0001396827509646782, + "loss": 0.649, + "step": 4332 + }, + { + "epoch": 1.391234548081554, + "grad_norm": 0.861387312412262, + "learning_rate": 0.00013965070803065543, + "loss": 0.5132, + "step": 4333 + }, + { + "epoch": 1.3915556269064056, + "grad_norm": 0.8012207746505737, + "learning_rate": 0.00013961866026531417, + "loss": 0.4985, + "step": 4334 + }, + { + "epoch": 1.391876705731257, + "grad_norm": 1.5408804416656494, + "learning_rate": 0.00013958660767255938, + "loss": 0.8035, + "step": 4335 + }, + { + "epoch": 1.3921977845561084, + "grad_norm": 1.0151411294937134, + "learning_rate": 0.00013955455025629651, + "loss": 0.6018, + "step": 4336 + }, + { + "epoch": 1.39251886338096, + "grad_norm": 0.9835482835769653, + "learning_rate": 0.00013952248802043165, + "loss": 0.625, + "step": 4337 + }, + { + "epoch": 1.3928399422058115, + "grad_norm": 1.0157198905944824, + "learning_rate": 0.00013949042096887153, + "loss": 0.6432, + "step": 4338 + }, + { + "epoch": 1.393161021030663, + "grad_norm": 1.2168697118759155, + "learning_rate": 0.0001394583491055234, + "loss": 0.4917, + "step": 4339 + }, + { + "epoch": 1.3934820998555146, + "grad_norm": 1.4631038904190063, + "learning_rate": 0.00013942627243429512, + "loss": 0.6929, + "step": 4340 + }, + { + "epoch": 1.393803178680366, + "grad_norm": 1.6047730445861816, + "learning_rate": 0.00013939419095909512, + "loss": 0.6978, + "step": 4341 + }, + { + "epoch": 1.3941242575052175, + "grad_norm": 1.1495589017868042, + "learning_rate": 0.00013936210468383246, + "loss": 0.8062, + "step": 4342 + }, + { + "epoch": 1.394445336330069, + "grad_norm": 1.0219823122024536, + "learning_rate": 0.00013933001361241673, + "loss": 0.6822, + "step": 4343 + }, + { + "epoch": 1.3947664151549206, + "grad_norm": 0.9300022125244141, + "learning_rate": 0.00013929791774875815, + "loss": 0.6795, + "step": 4344 + }, + { + "epoch": 1.395087493979772, + "grad_norm": 0.8773917555809021, + "learning_rate": 0.00013926581709676751, + "loss": 0.4946, + "step": 4345 + }, + { + "epoch": 1.3954085728046235, + "grad_norm": 0.8553141951560974, + "learning_rate": 0.00013923371166035616, + "loss": 0.6083, + "step": 4346 + }, + { + "epoch": 1.395729651629475, + "grad_norm": 0.9813185930252075, + "learning_rate": 0.00013920160144343603, + "loss": 0.8011, + "step": 4347 + }, + { + "epoch": 1.3960507304543266, + "grad_norm": 1.4478272199630737, + "learning_rate": 0.0001391694864499197, + "loss": 0.5279, + "step": 4348 + }, + { + "epoch": 1.3963718092791781, + "grad_norm": 0.9650620818138123, + "learning_rate": 0.00013913736668372026, + "loss": 0.6811, + "step": 4349 + }, + { + "epoch": 1.3966928881040295, + "grad_norm": 1.1189510822296143, + "learning_rate": 0.00013910524214875137, + "loss": 0.7923, + "step": 4350 + }, + { + "epoch": 1.397013966928881, + "grad_norm": 1.1408734321594238, + "learning_rate": 0.00013907311284892736, + "loss": 0.6301, + "step": 4351 + }, + { + "epoch": 1.3973350457537326, + "grad_norm": 1.3011447191238403, + "learning_rate": 0.00013904097878816312, + "loss": 0.4263, + "step": 4352 + }, + { + "epoch": 1.3976561245785841, + "grad_norm": 0.7979041337966919, + "learning_rate": 0.00013900883997037397, + "loss": 0.534, + "step": 4353 + }, + { + "epoch": 1.3979772034034355, + "grad_norm": 0.9429032206535339, + "learning_rate": 0.00013897669639947606, + "loss": 0.7349, + "step": 4354 + }, + { + "epoch": 1.398298282228287, + "grad_norm": 1.0224515199661255, + "learning_rate": 0.00013894454807938586, + "loss": 0.5381, + "step": 4355 + }, + { + "epoch": 1.3986193610531386, + "grad_norm": 0.8756483197212219, + "learning_rate": 0.0001389123950140206, + "loss": 0.5863, + "step": 4356 + }, + { + "epoch": 1.39894043987799, + "grad_norm": 1.352267861366272, + "learning_rate": 0.0001388802372072981, + "loss": 0.6053, + "step": 4357 + }, + { + "epoch": 1.3992615187028417, + "grad_norm": 0.9977678060531616, + "learning_rate": 0.00013884807466313663, + "loss": 0.6707, + "step": 4358 + }, + { + "epoch": 1.399582597527693, + "grad_norm": 0.9317201972007751, + "learning_rate": 0.00013881590738545508, + "loss": 0.6246, + "step": 4359 + }, + { + "epoch": 1.3999036763525445, + "grad_norm": 0.8313597440719604, + "learning_rate": 0.00013878373537817292, + "loss": 0.4458, + "step": 4360 + }, + { + "epoch": 1.400224755177396, + "grad_norm": 0.9241980314254761, + "learning_rate": 0.0001387515586452103, + "loss": 0.566, + "step": 4361 + }, + { + "epoch": 1.4005458340022474, + "grad_norm": 0.5518043041229248, + "learning_rate": 0.00013871937719048779, + "loss": 0.3723, + "step": 4362 + }, + { + "epoch": 1.400866912827099, + "grad_norm": 0.598588764667511, + "learning_rate": 0.00013868719101792665, + "loss": 0.354, + "step": 4363 + }, + { + "epoch": 1.4011879916519505, + "grad_norm": 1.0155926942825317, + "learning_rate": 0.00013865500013144857, + "loss": 0.614, + "step": 4364 + }, + { + "epoch": 1.401509070476802, + "grad_norm": 0.5616649389266968, + "learning_rate": 0.000138622804534976, + "loss": 0.4872, + "step": 4365 + }, + { + "epoch": 1.4018301493016536, + "grad_norm": 0.6463456749916077, + "learning_rate": 0.00013859060423243187, + "loss": 1.0319, + "step": 4366 + }, + { + "epoch": 1.4021512281265052, + "grad_norm": 0.708520770072937, + "learning_rate": 0.00013855839922773968, + "loss": 1.1378, + "step": 4367 + }, + { + "epoch": 1.4024723069513565, + "grad_norm": 0.6274012327194214, + "learning_rate": 0.00013852618952482347, + "loss": 0.4126, + "step": 4368 + }, + { + "epoch": 1.402793385776208, + "grad_norm": 0.7933291792869568, + "learning_rate": 0.00013849397512760795, + "loss": 0.4245, + "step": 4369 + }, + { + "epoch": 1.4031144646010596, + "grad_norm": 0.7001267671585083, + "learning_rate": 0.0001384617560400183, + "loss": 0.3049, + "step": 4370 + }, + { + "epoch": 1.403435543425911, + "grad_norm": 0.7495763897895813, + "learning_rate": 0.00013842953226598037, + "loss": 0.3643, + "step": 4371 + }, + { + "epoch": 1.4037566222507625, + "grad_norm": 0.7259708046913147, + "learning_rate": 0.00013839730380942053, + "loss": 0.3979, + "step": 4372 + }, + { + "epoch": 1.404077701075614, + "grad_norm": 1.0623929500579834, + "learning_rate": 0.00013836507067426564, + "loss": 0.9446, + "step": 4373 + }, + { + "epoch": 1.4043987799004656, + "grad_norm": 0.9666998386383057, + "learning_rate": 0.00013833283286444328, + "loss": 0.7658, + "step": 4374 + }, + { + "epoch": 1.4047198587253171, + "grad_norm": 0.7665534019470215, + "learning_rate": 0.0001383005903838815, + "loss": 0.5662, + "step": 4375 + }, + { + "epoch": 1.4050409375501687, + "grad_norm": 1.0555462837219238, + "learning_rate": 0.000138268343236509, + "loss": 0.7391, + "step": 4376 + }, + { + "epoch": 1.40536201637502, + "grad_norm": 0.9639325141906738, + "learning_rate": 0.0001382360914262549, + "loss": 0.6964, + "step": 4377 + }, + { + "epoch": 1.4056830951998716, + "grad_norm": 0.8347415924072266, + "learning_rate": 0.0001382038349570491, + "loss": 0.5785, + "step": 4378 + }, + { + "epoch": 1.406004174024723, + "grad_norm": 0.8627128005027771, + "learning_rate": 0.00013817157383282184, + "loss": 0.6525, + "step": 4379 + }, + { + "epoch": 1.4063252528495744, + "grad_norm": 1.1546865701675415, + "learning_rate": 0.00013813930805750413, + "loss": 0.7064, + "step": 4380 + }, + { + "epoch": 1.406646331674426, + "grad_norm": 0.9595782160758972, + "learning_rate": 0.00013810703763502744, + "loss": 0.593, + "step": 4381 + }, + { + "epoch": 1.4069674104992775, + "grad_norm": 0.7686512470245361, + "learning_rate": 0.00013807476256932376, + "loss": 0.4359, + "step": 4382 + }, + { + "epoch": 1.407288489324129, + "grad_norm": 1.0802874565124512, + "learning_rate": 0.00013804248286432578, + "loss": 0.8544, + "step": 4383 + }, + { + "epoch": 1.4076095681489806, + "grad_norm": 0.8571449518203735, + "learning_rate": 0.00013801019852396665, + "loss": 0.6631, + "step": 4384 + }, + { + "epoch": 1.4079306469738322, + "grad_norm": 0.9189439415931702, + "learning_rate": 0.00013797790955218014, + "loss": 0.6817, + "step": 4385 + }, + { + "epoch": 1.4082517257986835, + "grad_norm": 0.814200222492218, + "learning_rate": 0.00013794561595290052, + "loss": 0.5863, + "step": 4386 + }, + { + "epoch": 1.408572804623535, + "grad_norm": 1.049676537513733, + "learning_rate": 0.0001379133177300627, + "loss": 0.6476, + "step": 4387 + }, + { + "epoch": 1.4088938834483866, + "grad_norm": 0.9915246367454529, + "learning_rate": 0.00013788101488760215, + "loss": 0.6418, + "step": 4388 + }, + { + "epoch": 1.409214962273238, + "grad_norm": 1.644821047782898, + "learning_rate": 0.00013784870742945482, + "loss": 0.776, + "step": 4389 + }, + { + "epoch": 1.4095360410980895, + "grad_norm": 0.856365978717804, + "learning_rate": 0.00013781639535955732, + "loss": 0.5186, + "step": 4390 + }, + { + "epoch": 1.409857119922941, + "grad_norm": 0.9367002844810486, + "learning_rate": 0.00013778407868184672, + "loss": 0.7263, + "step": 4391 + }, + { + "epoch": 1.4101781987477926, + "grad_norm": 0.8069283962249756, + "learning_rate": 0.00013775175740026078, + "loss": 0.6471, + "step": 4392 + }, + { + "epoch": 1.4104992775726442, + "grad_norm": 0.9524908065795898, + "learning_rate": 0.00013771943151873767, + "loss": 0.762, + "step": 4393 + }, + { + "epoch": 1.4108203563974957, + "grad_norm": 1.1511192321777344, + "learning_rate": 0.00013768710104121627, + "loss": 0.5543, + "step": 4394 + }, + { + "epoch": 1.411141435222347, + "grad_norm": 0.8177773952484131, + "learning_rate": 0.00013765476597163594, + "loss": 0.6148, + "step": 4395 + }, + { + "epoch": 1.4114625140471986, + "grad_norm": 1.0919883251190186, + "learning_rate": 0.00013762242631393655, + "loss": 0.7347, + "step": 4396 + }, + { + "epoch": 1.4117835928720501, + "grad_norm": 0.9575279951095581, + "learning_rate": 0.0001375900820720587, + "loss": 0.6493, + "step": 4397 + }, + { + "epoch": 1.4121046716969015, + "grad_norm": 0.8994300365447998, + "learning_rate": 0.0001375577332499433, + "loss": 0.6778, + "step": 4398 + }, + { + "epoch": 1.412425750521753, + "grad_norm": 1.2716469764709473, + "learning_rate": 0.0001375253798515321, + "loss": 0.794, + "step": 4399 + }, + { + "epoch": 1.4127468293466046, + "grad_norm": 0.9935747385025024, + "learning_rate": 0.00013749302188076717, + "loss": 0.688, + "step": 4400 + }, + { + "epoch": 1.413067908171456, + "grad_norm": 1.3132479190826416, + "learning_rate": 0.00013746065934159123, + "loss": 0.6833, + "step": 4401 + }, + { + "epoch": 1.4133889869963077, + "grad_norm": 0.9004419445991516, + "learning_rate": 0.00013742829223794759, + "loss": 0.5584, + "step": 4402 + }, + { + "epoch": 1.4137100658211592, + "grad_norm": 0.8987523317337036, + "learning_rate": 0.00013739592057378003, + "loss": 0.5162, + "step": 4403 + }, + { + "epoch": 1.4140311446460105, + "grad_norm": 1.0713285207748413, + "learning_rate": 0.00013736354435303305, + "loss": 0.6563, + "step": 4404 + }, + { + "epoch": 1.414352223470862, + "grad_norm": 0.9994504451751709, + "learning_rate": 0.0001373311635796515, + "loss": 0.6936, + "step": 4405 + }, + { + "epoch": 1.4146733022957136, + "grad_norm": 1.2554911375045776, + "learning_rate": 0.0001372987782575809, + "loss": 0.6968, + "step": 4406 + }, + { + "epoch": 1.414994381120565, + "grad_norm": 0.9798457622528076, + "learning_rate": 0.0001372663883907673, + "loss": 0.648, + "step": 4407 + }, + { + "epoch": 1.4153154599454165, + "grad_norm": 1.1374400854110718, + "learning_rate": 0.00013723399398315734, + "loss": 0.6739, + "step": 4408 + }, + { + "epoch": 1.415636538770268, + "grad_norm": 0.9649138450622559, + "learning_rate": 0.00013720159503869815, + "loss": 0.5802, + "step": 4409 + }, + { + "epoch": 1.4159576175951196, + "grad_norm": 1.5144495964050293, + "learning_rate": 0.00013716919156133746, + "loss": 0.6543, + "step": 4410 + }, + { + "epoch": 1.4162786964199712, + "grad_norm": 1.2621392011642456, + "learning_rate": 0.00013713678355502351, + "loss": 0.6814, + "step": 4411 + }, + { + "epoch": 1.4165997752448227, + "grad_norm": 0.90981125831604, + "learning_rate": 0.0001371043710237051, + "loss": 0.5224, + "step": 4412 + }, + { + "epoch": 1.416920854069674, + "grad_norm": 0.5423502922058105, + "learning_rate": 0.00013707195397133165, + "loss": 0.369, + "step": 4413 + }, + { + "epoch": 1.4172419328945256, + "grad_norm": 0.6597849130630493, + "learning_rate": 0.0001370395324018531, + "loss": 0.4365, + "step": 4414 + }, + { + "epoch": 1.4175630117193772, + "grad_norm": 0.6346552968025208, + "learning_rate": 0.00013700710631921984, + "loss": 0.5433, + "step": 4415 + }, + { + "epoch": 1.4178840905442285, + "grad_norm": 0.8271416425704956, + "learning_rate": 0.00013697467572738295, + "loss": 0.9643, + "step": 4416 + }, + { + "epoch": 1.41820516936908, + "grad_norm": 0.77828449010849, + "learning_rate": 0.00013694224063029396, + "loss": 0.8363, + "step": 4417 + }, + { + "epoch": 1.4185262481939316, + "grad_norm": 0.9805882573127747, + "learning_rate": 0.00013690980103190503, + "loss": 0.4089, + "step": 4418 + }, + { + "epoch": 1.4188473270187831, + "grad_norm": 0.9755864143371582, + "learning_rate": 0.00013687735693616876, + "loss": 0.3488, + "step": 4419 + }, + { + "epoch": 1.4191684058436347, + "grad_norm": 0.9460292458534241, + "learning_rate": 0.00013684490834703843, + "loss": 0.3581, + "step": 4420 + }, + { + "epoch": 1.4194894846684862, + "grad_norm": 0.8120269179344177, + "learning_rate": 0.00013681245526846783, + "loss": 0.4829, + "step": 4421 + }, + { + "epoch": 1.4198105634933376, + "grad_norm": 0.925631046295166, + "learning_rate": 0.00013677999770441115, + "loss": 0.7187, + "step": 4422 + }, + { + "epoch": 1.4201316423181891, + "grad_norm": 0.8358470797538757, + "learning_rate": 0.00013674753565882334, + "loss": 0.6196, + "step": 4423 + }, + { + "epoch": 1.4204527211430407, + "grad_norm": 1.1134681701660156, + "learning_rate": 0.0001367150691356598, + "loss": 0.7756, + "step": 4424 + }, + { + "epoch": 1.420773799967892, + "grad_norm": 0.7973681688308716, + "learning_rate": 0.00013668259813887643, + "loss": 0.5704, + "step": 4425 + }, + { + "epoch": 1.4210948787927435, + "grad_norm": 0.8833571076393127, + "learning_rate": 0.00013665012267242974, + "loss": 0.7326, + "step": 4426 + }, + { + "epoch": 1.421415957617595, + "grad_norm": 0.85068678855896, + "learning_rate": 0.00013661764274027678, + "loss": 0.5295, + "step": 4427 + }, + { + "epoch": 1.4217370364424466, + "grad_norm": 0.5901229977607727, + "learning_rate": 0.00013658515834637512, + "loss": 0.4465, + "step": 4428 + }, + { + "epoch": 1.4220581152672982, + "grad_norm": 0.7249242067337036, + "learning_rate": 0.0001365526694946829, + "loss": 0.6021, + "step": 4429 + }, + { + "epoch": 1.4223791940921497, + "grad_norm": 0.8554542660713196, + "learning_rate": 0.0001365201761891588, + "loss": 0.5952, + "step": 4430 + }, + { + "epoch": 1.422700272917001, + "grad_norm": 1.1563721895217896, + "learning_rate": 0.00013648767843376196, + "loss": 0.6391, + "step": 4431 + }, + { + "epoch": 1.4230213517418526, + "grad_norm": 0.7757640480995178, + "learning_rate": 0.0001364551762324522, + "loss": 0.5914, + "step": 4432 + }, + { + "epoch": 1.4233424305667042, + "grad_norm": 0.9488760828971863, + "learning_rate": 0.00013642266958918984, + "loss": 0.678, + "step": 4433 + }, + { + "epoch": 1.4236635093915555, + "grad_norm": 0.8043296337127686, + "learning_rate": 0.00013639015850793563, + "loss": 0.6684, + "step": 4434 + }, + { + "epoch": 1.423984588216407, + "grad_norm": 1.0018153190612793, + "learning_rate": 0.000136357642992651, + "loss": 0.8602, + "step": 4435 + }, + { + "epoch": 1.4243056670412586, + "grad_norm": 0.8006235361099243, + "learning_rate": 0.00013632512304729785, + "loss": 0.6058, + "step": 4436 + }, + { + "epoch": 1.4246267458661102, + "grad_norm": 0.9268879890441895, + "learning_rate": 0.00013629259867583863, + "loss": 0.6974, + "step": 4437 + }, + { + "epoch": 1.4249478246909617, + "grad_norm": 0.8567279577255249, + "learning_rate": 0.00013626006988223636, + "loss": 0.5062, + "step": 4438 + }, + { + "epoch": 1.4252689035158133, + "grad_norm": 1.0375492572784424, + "learning_rate": 0.00013622753667045457, + "loss": 0.7928, + "step": 4439 + }, + { + "epoch": 1.4255899823406646, + "grad_norm": 0.6748252511024475, + "learning_rate": 0.00013619499904445734, + "loss": 0.4775, + "step": 4440 + }, + { + "epoch": 1.4259110611655161, + "grad_norm": 0.8363870978355408, + "learning_rate": 0.00013616245700820922, + "loss": 0.6386, + "step": 4441 + }, + { + "epoch": 1.4262321399903677, + "grad_norm": 1.0793986320495605, + "learning_rate": 0.00013612991056567543, + "loss": 0.5909, + "step": 4442 + }, + { + "epoch": 1.426553218815219, + "grad_norm": 0.9321547150611877, + "learning_rate": 0.00013609735972082166, + "loss": 0.7883, + "step": 4443 + }, + { + "epoch": 1.4268742976400706, + "grad_norm": 1.0707225799560547, + "learning_rate": 0.00013606480447761409, + "loss": 0.6685, + "step": 4444 + }, + { + "epoch": 1.4271953764649221, + "grad_norm": 0.9186208248138428, + "learning_rate": 0.00013603224484001948, + "loss": 0.6659, + "step": 4445 + }, + { + "epoch": 1.4275164552897737, + "grad_norm": 1.3622729778289795, + "learning_rate": 0.00013599968081200514, + "loss": 0.6381, + "step": 4446 + }, + { + "epoch": 1.4278375341146252, + "grad_norm": 0.9809760451316833, + "learning_rate": 0.0001359671123975389, + "loss": 0.5467, + "step": 4447 + }, + { + "epoch": 1.4281586129394768, + "grad_norm": 1.0845415592193604, + "learning_rate": 0.00013593453960058908, + "loss": 0.6937, + "step": 4448 + }, + { + "epoch": 1.428479691764328, + "grad_norm": 1.316145896911621, + "learning_rate": 0.00013590196242512463, + "loss": 0.7624, + "step": 4449 + }, + { + "epoch": 1.4288007705891796, + "grad_norm": 0.8910375833511353, + "learning_rate": 0.00013586938087511494, + "loss": 0.532, + "step": 4450 + }, + { + "epoch": 1.4291218494140312, + "grad_norm": 1.1741594076156616, + "learning_rate": 0.00013583679495453, + "loss": 0.5843, + "step": 4451 + }, + { + "epoch": 1.4294429282388825, + "grad_norm": 0.8071622252464294, + "learning_rate": 0.00013580420466734037, + "loss": 0.5575, + "step": 4452 + }, + { + "epoch": 1.429764007063734, + "grad_norm": 1.1299529075622559, + "learning_rate": 0.00013577161001751694, + "loss": 0.4591, + "step": 4453 + }, + { + "epoch": 1.4300850858885856, + "grad_norm": 1.1855186223983765, + "learning_rate": 0.00013573901100903134, + "loss": 0.6654, + "step": 4454 + }, + { + "epoch": 1.4304061647134372, + "grad_norm": 1.0821056365966797, + "learning_rate": 0.00013570640764585566, + "loss": 0.6234, + "step": 4455 + }, + { + "epoch": 1.4307272435382887, + "grad_norm": 1.0219788551330566, + "learning_rate": 0.00013567379993196252, + "loss": 0.5468, + "step": 4456 + }, + { + "epoch": 1.4310483223631403, + "grad_norm": 1.1291626691818237, + "learning_rate": 0.00013564118787132506, + "loss": 0.782, + "step": 4457 + }, + { + "epoch": 1.4313694011879916, + "grad_norm": 0.9877511262893677, + "learning_rate": 0.000135608571467917, + "loss": 0.5307, + "step": 4458 + }, + { + "epoch": 1.4316904800128432, + "grad_norm": 0.9503813982009888, + "learning_rate": 0.0001355759507257125, + "loss": 0.5951, + "step": 4459 + }, + { + "epoch": 1.4320115588376947, + "grad_norm": 0.9377137422561646, + "learning_rate": 0.0001355433256486863, + "loss": 0.4201, + "step": 4460 + }, + { + "epoch": 1.432332637662546, + "grad_norm": 0.9886158108711243, + "learning_rate": 0.0001355106962408137, + "loss": 0.5017, + "step": 4461 + }, + { + "epoch": 1.4326537164873976, + "grad_norm": 1.0197458267211914, + "learning_rate": 0.0001354780625060705, + "loss": 0.4774, + "step": 4462 + }, + { + "epoch": 1.4329747953122491, + "grad_norm": 0.9758638143539429, + "learning_rate": 0.00013544542444843299, + "loss": 0.4103, + "step": 4463 + }, + { + "epoch": 1.4332958741371007, + "grad_norm": 0.7870468497276306, + "learning_rate": 0.000135412782071878, + "loss": 0.4326, + "step": 4464 + }, + { + "epoch": 1.4336169529619522, + "grad_norm": 0.738239049911499, + "learning_rate": 0.00013538013538038295, + "loss": 0.8734, + "step": 4465 + }, + { + "epoch": 1.4339380317868038, + "grad_norm": 0.6055427193641663, + "learning_rate": 0.00013534748437792573, + "loss": 0.7507, + "step": 4466 + }, + { + "epoch": 1.4342591106116551, + "grad_norm": 0.7751237154006958, + "learning_rate": 0.00013531482906848475, + "loss": 0.3607, + "step": 4467 + }, + { + "epoch": 1.4345801894365067, + "grad_norm": 0.7698683738708496, + "learning_rate": 0.000135282169456039, + "loss": 0.3953, + "step": 4468 + }, + { + "epoch": 1.4349012682613582, + "grad_norm": 0.8559454679489136, + "learning_rate": 0.00013524950554456784, + "loss": 0.4011, + "step": 4469 + }, + { + "epoch": 1.4352223470862095, + "grad_norm": 1.0439728498458862, + "learning_rate": 0.00013521683733805145, + "loss": 0.775, + "step": 4470 + }, + { + "epoch": 1.435543425911061, + "grad_norm": 0.8648383021354675, + "learning_rate": 0.00013518416484047018, + "loss": 0.6064, + "step": 4471 + }, + { + "epoch": 1.4358645047359127, + "grad_norm": 1.1003457307815552, + "learning_rate": 0.0001351514880558052, + "loss": 0.5959, + "step": 4472 + }, + { + "epoch": 1.4361855835607642, + "grad_norm": 0.8835905194282532, + "learning_rate": 0.00013511880698803801, + "loss": 0.579, + "step": 4473 + }, + { + "epoch": 1.4365066623856158, + "grad_norm": 0.654793381690979, + "learning_rate": 0.00013508612164115068, + "loss": 0.4533, + "step": 4474 + }, + { + "epoch": 1.4368277412104673, + "grad_norm": 0.832150399684906, + "learning_rate": 0.0001350534320191259, + "loss": 0.59, + "step": 4475 + }, + { + "epoch": 1.4371488200353186, + "grad_norm": 0.805191695690155, + "learning_rate": 0.00013502073812594675, + "loss": 0.5468, + "step": 4476 + }, + { + "epoch": 1.4374698988601702, + "grad_norm": 1.0974748134613037, + "learning_rate": 0.00013498803996559692, + "loss": 0.8118, + "step": 4477 + }, + { + "epoch": 1.4377909776850217, + "grad_norm": 1.0176666975021362, + "learning_rate": 0.0001349553375420605, + "loss": 0.6775, + "step": 4478 + }, + { + "epoch": 1.438112056509873, + "grad_norm": 1.417904019355774, + "learning_rate": 0.00013492263085932224, + "loss": 0.7306, + "step": 4479 + }, + { + "epoch": 1.4384331353347246, + "grad_norm": 1.5575790405273438, + "learning_rate": 0.00013488991992136734, + "loss": 0.7895, + "step": 4480 + }, + { + "epoch": 1.4387542141595762, + "grad_norm": 0.9798082113265991, + "learning_rate": 0.00013485720473218154, + "loss": 0.6748, + "step": 4481 + }, + { + "epoch": 1.4390752929844277, + "grad_norm": 1.1362134218215942, + "learning_rate": 0.00013482448529575106, + "loss": 0.7912, + "step": 4482 + }, + { + "epoch": 1.4393963718092793, + "grad_norm": 0.7490417957305908, + "learning_rate": 0.0001347917616160627, + "loss": 0.549, + "step": 4483 + }, + { + "epoch": 1.4397174506341308, + "grad_norm": 1.027808427810669, + "learning_rate": 0.0001347590336971037, + "loss": 0.7783, + "step": 4484 + }, + { + "epoch": 1.4400385294589821, + "grad_norm": 0.8055115938186646, + "learning_rate": 0.0001347263015428619, + "loss": 0.5538, + "step": 4485 + }, + { + "epoch": 1.4403596082838337, + "grad_norm": 1.0068027973175049, + "learning_rate": 0.00013469356515732558, + "loss": 0.8646, + "step": 4486 + }, + { + "epoch": 1.4406806871086852, + "grad_norm": 1.301796555519104, + "learning_rate": 0.00013466082454448362, + "loss": 0.7354, + "step": 4487 + }, + { + "epoch": 1.4410017659335366, + "grad_norm": 0.9036690592765808, + "learning_rate": 0.0001346280797083253, + "loss": 0.5607, + "step": 4488 + }, + { + "epoch": 1.4413228447583881, + "grad_norm": 0.928037703037262, + "learning_rate": 0.0001345953306528405, + "loss": 0.7001, + "step": 4489 + }, + { + "epoch": 1.4416439235832397, + "grad_norm": 0.9121316075325012, + "learning_rate": 0.00013456257738201957, + "loss": 0.561, + "step": 4490 + }, + { + "epoch": 1.4419650024080912, + "grad_norm": 1.3250795602798462, + "learning_rate": 0.00013452981989985348, + "loss": 0.81, + "step": 4491 + }, + { + "epoch": 1.4422860812329428, + "grad_norm": 0.8794850707054138, + "learning_rate": 0.00013449705821033355, + "loss": 0.7304, + "step": 4492 + }, + { + "epoch": 1.4426071600577943, + "grad_norm": 1.1400325298309326, + "learning_rate": 0.0001344642923174517, + "loss": 0.6693, + "step": 4493 + }, + { + "epoch": 1.4429282388826457, + "grad_norm": 1.1574450731277466, + "learning_rate": 0.00013443152222520038, + "loss": 0.5701, + "step": 4494 + }, + { + "epoch": 1.4432493177074972, + "grad_norm": 1.0430783033370972, + "learning_rate": 0.00013439874793757254, + "loss": 0.7403, + "step": 4495 + }, + { + "epoch": 1.4435703965323488, + "grad_norm": 0.968147337436676, + "learning_rate": 0.00013436596945856164, + "loss": 0.7173, + "step": 4496 + }, + { + "epoch": 1.4438914753572, + "grad_norm": 0.7893198132514954, + "learning_rate": 0.00013433318679216153, + "loss": 0.5111, + "step": 4497 + }, + { + "epoch": 1.4442125541820516, + "grad_norm": 0.9785538911819458, + "learning_rate": 0.00013430039994236678, + "loss": 0.5591, + "step": 4498 + }, + { + "epoch": 1.4445336330069032, + "grad_norm": 1.5484408140182495, + "learning_rate": 0.00013426760891317236, + "loss": 0.7088, + "step": 4499 + }, + { + "epoch": 1.4448547118317547, + "grad_norm": 0.9762724041938782, + "learning_rate": 0.00013423481370857375, + "loss": 0.5886, + "step": 4500 + }, + { + "epoch": 1.4451757906566063, + "grad_norm": 1.3003888130187988, + "learning_rate": 0.00013420201433256689, + "loss": 0.7262, + "step": 4501 + }, + { + "epoch": 1.4454968694814578, + "grad_norm": 0.7951440811157227, + "learning_rate": 0.00013416921078914835, + "loss": 0.4705, + "step": 4502 + }, + { + "epoch": 1.4458179483063092, + "grad_norm": 0.8316115140914917, + "learning_rate": 0.0001341364030823151, + "loss": 0.5546, + "step": 4503 + }, + { + "epoch": 1.4461390271311607, + "grad_norm": 1.0351200103759766, + "learning_rate": 0.0001341035912160647, + "loss": 0.5174, + "step": 4504 + }, + { + "epoch": 1.4464601059560123, + "grad_norm": 1.0522757768630981, + "learning_rate": 0.0001340707751943952, + "loss": 0.6695, + "step": 4505 + }, + { + "epoch": 1.4467811847808636, + "grad_norm": 0.8904996514320374, + "learning_rate": 0.00013403795502130503, + "loss": 0.5893, + "step": 4506 + }, + { + "epoch": 1.4471022636057151, + "grad_norm": 0.866426408290863, + "learning_rate": 0.0001340051307007933, + "loss": 0.6581, + "step": 4507 + }, + { + "epoch": 1.4474233424305667, + "grad_norm": 0.922524094581604, + "learning_rate": 0.00013397230223685956, + "loss": 0.5626, + "step": 4508 + }, + { + "epoch": 1.4477444212554182, + "grad_norm": 1.0396236181259155, + "learning_rate": 0.00013393946963350382, + "loss": 0.524, + "step": 4509 + }, + { + "epoch": 1.4480655000802698, + "grad_norm": 1.3418883085250854, + "learning_rate": 0.00013390663289472666, + "loss": 0.6468, + "step": 4510 + }, + { + "epoch": 1.4483865789051211, + "grad_norm": 0.7169436812400818, + "learning_rate": 0.00013387379202452917, + "loss": 0.4527, + "step": 4511 + }, + { + "epoch": 1.4487076577299727, + "grad_norm": 1.3441863059997559, + "learning_rate": 0.00013384094702691281, + "loss": 0.5522, + "step": 4512 + }, + { + "epoch": 1.4490287365548242, + "grad_norm": 0.7752569913864136, + "learning_rate": 0.00013380809790587974, + "loss": 0.441, + "step": 4513 + }, + { + "epoch": 1.4493498153796758, + "grad_norm": 0.8399336934089661, + "learning_rate": 0.00013377524466543248, + "loss": 0.4828, + "step": 4514 + }, + { + "epoch": 1.449670894204527, + "grad_norm": 0.9468197822570801, + "learning_rate": 0.00013374238730957412, + "loss": 0.6818, + "step": 4515 + }, + { + "epoch": 1.4499919730293787, + "grad_norm": 0.5419335961341858, + "learning_rate": 0.00013370952584230823, + "loss": 0.7663, + "step": 4516 + }, + { + "epoch": 1.4503130518542302, + "grad_norm": 0.7655202150344849, + "learning_rate": 0.00013367666026763882, + "loss": 0.6944, + "step": 4517 + }, + { + "epoch": 1.4506341306790818, + "grad_norm": 0.7268679141998291, + "learning_rate": 0.00013364379058957056, + "loss": 0.3399, + "step": 4518 + }, + { + "epoch": 1.4509552095039333, + "grad_norm": 0.787480354309082, + "learning_rate": 0.00013361091681210845, + "loss": 0.4652, + "step": 4519 + }, + { + "epoch": 1.4512762883287846, + "grad_norm": 0.9196494221687317, + "learning_rate": 0.00013357803893925807, + "loss": 0.5813, + "step": 4520 + }, + { + "epoch": 1.4515973671536362, + "grad_norm": 0.7516376376152039, + "learning_rate": 0.00013354515697502553, + "loss": 0.3046, + "step": 4521 + }, + { + "epoch": 1.4519184459784877, + "grad_norm": 0.7327350974082947, + "learning_rate": 0.00013351227092341732, + "loss": 0.6606, + "step": 4522 + }, + { + "epoch": 1.4522395248033393, + "grad_norm": 0.9102757573127747, + "learning_rate": 0.0001334793807884406, + "loss": 0.7293, + "step": 4523 + }, + { + "epoch": 1.4525606036281906, + "grad_norm": 0.8285297155380249, + "learning_rate": 0.00013344648657410282, + "loss": 0.5445, + "step": 4524 + }, + { + "epoch": 1.4528816824530422, + "grad_norm": 1.0268747806549072, + "learning_rate": 0.00013341358828441218, + "loss": 0.7724, + "step": 4525 + }, + { + "epoch": 1.4532027612778937, + "grad_norm": 0.7675826549530029, + "learning_rate": 0.0001333806859233771, + "loss": 0.5759, + "step": 4526 + }, + { + "epoch": 1.4535238401027453, + "grad_norm": 0.8815962672233582, + "learning_rate": 0.00013334777949500673, + "loss": 0.7797, + "step": 4527 + }, + { + "epoch": 1.4538449189275968, + "grad_norm": 0.7846258282661438, + "learning_rate": 0.00013331486900331057, + "loss": 0.5603, + "step": 4528 + }, + { + "epoch": 1.4541659977524481, + "grad_norm": 1.0539195537567139, + "learning_rate": 0.00013328195445229868, + "loss": 0.7525, + "step": 4529 + }, + { + "epoch": 1.4544870765772997, + "grad_norm": 0.9988424777984619, + "learning_rate": 0.0001332490358459816, + "loss": 0.7323, + "step": 4530 + }, + { + "epoch": 1.4548081554021512, + "grad_norm": 0.7499678730964661, + "learning_rate": 0.00013321611318837032, + "loss": 0.6056, + "step": 4531 + }, + { + "epoch": 1.4551292342270028, + "grad_norm": 0.7443246841430664, + "learning_rate": 0.00013318318648347646, + "loss": 0.4723, + "step": 4532 + }, + { + "epoch": 1.4554503130518541, + "grad_norm": 0.7882360219955444, + "learning_rate": 0.00013315025573531198, + "loss": 0.5804, + "step": 4533 + }, + { + "epoch": 1.4557713918767057, + "grad_norm": 0.9871779680252075, + "learning_rate": 0.00013311732094788937, + "loss": 0.715, + "step": 4534 + }, + { + "epoch": 1.4560924707015572, + "grad_norm": 0.8856052756309509, + "learning_rate": 0.00013308438212522163, + "loss": 0.6614, + "step": 4535 + }, + { + "epoch": 1.4564135495264088, + "grad_norm": 0.9039063453674316, + "learning_rate": 0.0001330514392713223, + "loss": 0.7465, + "step": 4536 + }, + { + "epoch": 1.4567346283512603, + "grad_norm": 0.7212230563163757, + "learning_rate": 0.00013301849239020537, + "loss": 0.6114, + "step": 4537 + }, + { + "epoch": 1.4570557071761117, + "grad_norm": 0.8474130630493164, + "learning_rate": 0.00013298554148588528, + "loss": 0.5634, + "step": 4538 + }, + { + "epoch": 1.4573767860009632, + "grad_norm": 1.1321617364883423, + "learning_rate": 0.00013295258656237702, + "loss": 0.6322, + "step": 4539 + }, + { + "epoch": 1.4576978648258148, + "grad_norm": 1.0911418199539185, + "learning_rate": 0.00013291962762369608, + "loss": 0.6877, + "step": 4540 + }, + { + "epoch": 1.4580189436506663, + "grad_norm": 0.9263336658477783, + "learning_rate": 0.00013288666467385833, + "loss": 0.5769, + "step": 4541 + }, + { + "epoch": 1.4583400224755176, + "grad_norm": 0.677189826965332, + "learning_rate": 0.00013285369771688025, + "loss": 0.5574, + "step": 4542 + }, + { + "epoch": 1.4586611013003692, + "grad_norm": 0.9409946203231812, + "learning_rate": 0.0001328207267567788, + "loss": 0.6771, + "step": 4543 + }, + { + "epoch": 1.4589821801252207, + "grad_norm": 0.848575234413147, + "learning_rate": 0.00013278775179757131, + "loss": 0.5214, + "step": 4544 + }, + { + "epoch": 1.4593032589500723, + "grad_norm": 0.8765393495559692, + "learning_rate": 0.0001327547728432757, + "loss": 0.5847, + "step": 4545 + }, + { + "epoch": 1.4596243377749238, + "grad_norm": 0.8433587551116943, + "learning_rate": 0.0001327217898979104, + "loss": 0.4884, + "step": 4546 + }, + { + "epoch": 1.4599454165997752, + "grad_norm": 1.2517951726913452, + "learning_rate": 0.00013268880296549425, + "loss": 0.5431, + "step": 4547 + }, + { + "epoch": 1.4602664954246267, + "grad_norm": 0.8801559209823608, + "learning_rate": 0.00013265581205004661, + "loss": 0.5063, + "step": 4548 + }, + { + "epoch": 1.4605875742494783, + "grad_norm": 1.0915852785110474, + "learning_rate": 0.00013262281715558736, + "loss": 0.6934, + "step": 4549 + }, + { + "epoch": 1.4609086530743298, + "grad_norm": 1.139147400856018, + "learning_rate": 0.00013258981828613678, + "loss": 0.6021, + "step": 4550 + }, + { + "epoch": 1.4612297318991812, + "grad_norm": 1.0436866283416748, + "learning_rate": 0.00013255681544571568, + "loss": 0.8493, + "step": 4551 + }, + { + "epoch": 1.4615508107240327, + "grad_norm": 0.8446107506752014, + "learning_rate": 0.0001325238086383454, + "loss": 0.4369, + "step": 4552 + }, + { + "epoch": 1.4618718895488843, + "grad_norm": 2.1884477138519287, + "learning_rate": 0.00013249079786804765, + "loss": 0.6191, + "step": 4553 + }, + { + "epoch": 1.4621929683737358, + "grad_norm": 0.8910212516784668, + "learning_rate": 0.00013245778313884478, + "loss": 0.4856, + "step": 4554 + }, + { + "epoch": 1.4625140471985874, + "grad_norm": 0.6236334443092346, + "learning_rate": 0.00013242476445475944, + "loss": 0.4026, + "step": 4555 + }, + { + "epoch": 1.4628351260234387, + "grad_norm": 1.155137538909912, + "learning_rate": 0.00013239174181981495, + "loss": 0.7141, + "step": 4556 + }, + { + "epoch": 1.4631562048482902, + "grad_norm": 1.5037484169006348, + "learning_rate": 0.00013235871523803496, + "loss": 0.7128, + "step": 4557 + }, + { + "epoch": 1.4634772836731418, + "grad_norm": 0.9598552584648132, + "learning_rate": 0.0001323256847134437, + "loss": 0.6375, + "step": 4558 + }, + { + "epoch": 1.4637983624979933, + "grad_norm": 0.9860318303108215, + "learning_rate": 0.00013229265025006576, + "loss": 0.572, + "step": 4559 + }, + { + "epoch": 1.4641194413228447, + "grad_norm": 0.9279815554618835, + "learning_rate": 0.00013225961185192638, + "loss": 0.4483, + "step": 4560 + }, + { + "epoch": 1.4644405201476962, + "grad_norm": 0.8800573945045471, + "learning_rate": 0.00013222656952305113, + "loss": 0.5297, + "step": 4561 + }, + { + "epoch": 1.4647615989725478, + "grad_norm": 1.1138170957565308, + "learning_rate": 0.00013219352326746613, + "loss": 0.5575, + "step": 4562 + }, + { + "epoch": 1.4650826777973993, + "grad_norm": 0.760184645652771, + "learning_rate": 0.00013216047308919798, + "loss": 0.4005, + "step": 4563 + }, + { + "epoch": 1.4654037566222509, + "grad_norm": 0.6601788997650146, + "learning_rate": 0.00013212741899227374, + "loss": 0.3951, + "step": 4564 + }, + { + "epoch": 1.4657248354471022, + "grad_norm": 0.825016438961029, + "learning_rate": 0.00013209436098072095, + "loss": 0.7009, + "step": 4565 + }, + { + "epoch": 1.4660459142719537, + "grad_norm": 0.652347981929779, + "learning_rate": 0.00013206129905856765, + "loss": 0.6756, + "step": 4566 + }, + { + "epoch": 1.4663669930968053, + "grad_norm": 0.5992451906204224, + "learning_rate": 0.00013202823322984228, + "loss": 0.6338, + "step": 4567 + }, + { + "epoch": 1.4666880719216568, + "grad_norm": 0.8534404039382935, + "learning_rate": 0.00013199516349857385, + "loss": 0.5359, + "step": 4568 + }, + { + "epoch": 1.4670091507465082, + "grad_norm": 0.7051871418952942, + "learning_rate": 0.0001319620898687918, + "loss": 0.3355, + "step": 4569 + }, + { + "epoch": 1.4673302295713597, + "grad_norm": 0.9312176704406738, + "learning_rate": 0.00013192901234452607, + "loss": 0.4381, + "step": 4570 + }, + { + "epoch": 1.4676513083962113, + "grad_norm": 1.0084874629974365, + "learning_rate": 0.00013189593092980702, + "loss": 0.3896, + "step": 4571 + }, + { + "epoch": 1.4679723872210628, + "grad_norm": 0.9059427380561829, + "learning_rate": 0.00013186284562866554, + "loss": 0.3917, + "step": 4572 + }, + { + "epoch": 1.4682934660459144, + "grad_norm": 0.9761437773704529, + "learning_rate": 0.00013182975644513296, + "loss": 0.6641, + "step": 4573 + }, + { + "epoch": 1.4686145448707657, + "grad_norm": 0.8613228797912598, + "learning_rate": 0.00013179666338324108, + "loss": 0.5554, + "step": 4574 + }, + { + "epoch": 1.4689356236956173, + "grad_norm": 0.9337467551231384, + "learning_rate": 0.00013176356644702225, + "loss": 0.82, + "step": 4575 + }, + { + "epoch": 1.4692567025204688, + "grad_norm": 0.6009548306465149, + "learning_rate": 0.00013173046564050924, + "loss": 0.4591, + "step": 4576 + }, + { + "epoch": 1.4695777813453204, + "grad_norm": 0.8322634100914001, + "learning_rate": 0.0001316973609677352, + "loss": 0.6572, + "step": 4577 + }, + { + "epoch": 1.4698988601701717, + "grad_norm": 0.9558845162391663, + "learning_rate": 0.00013166425243273385, + "loss": 0.7128, + "step": 4578 + }, + { + "epoch": 1.4702199389950232, + "grad_norm": 0.9433383345603943, + "learning_rate": 0.0001316311400395394, + "loss": 0.6818, + "step": 4579 + }, + { + "epoch": 1.4705410178198748, + "grad_norm": 0.7871822118759155, + "learning_rate": 0.0001315980237921865, + "loss": 0.6238, + "step": 4580 + }, + { + "epoch": 1.4708620966447263, + "grad_norm": 0.8970574140548706, + "learning_rate": 0.00013156490369471027, + "loss": 0.6878, + "step": 4581 + }, + { + "epoch": 1.4711831754695779, + "grad_norm": 1.0396785736083984, + "learning_rate": 0.00013153177975114625, + "loss": 0.7369, + "step": 4582 + }, + { + "epoch": 1.4715042542944292, + "grad_norm": 0.8116987943649292, + "learning_rate": 0.0001314986519655305, + "loss": 0.6017, + "step": 4583 + }, + { + "epoch": 1.4718253331192808, + "grad_norm": 1.061795711517334, + "learning_rate": 0.00013146552034189955, + "loss": 0.5786, + "step": 4584 + }, + { + "epoch": 1.4721464119441323, + "grad_norm": 0.9677996039390564, + "learning_rate": 0.0001314323848842904, + "loss": 0.4793, + "step": 4585 + }, + { + "epoch": 1.4724674907689839, + "grad_norm": 0.8710011839866638, + "learning_rate": 0.00013139924559674052, + "loss": 0.6054, + "step": 4586 + }, + { + "epoch": 1.4727885695938352, + "grad_norm": 0.9556978344917297, + "learning_rate": 0.00013136610248328778, + "loss": 0.6016, + "step": 4587 + }, + { + "epoch": 1.4731096484186867, + "grad_norm": 1.0169155597686768, + "learning_rate": 0.0001313329555479706, + "loss": 0.5464, + "step": 4588 + }, + { + "epoch": 1.4734307272435383, + "grad_norm": 0.9183655977249146, + "learning_rate": 0.00013129980479482782, + "loss": 0.7172, + "step": 4589 + }, + { + "epoch": 1.4737518060683898, + "grad_norm": 0.7591384053230286, + "learning_rate": 0.00013126665022789879, + "loss": 0.6191, + "step": 4590 + }, + { + "epoch": 1.4740728848932414, + "grad_norm": 0.8953664898872375, + "learning_rate": 0.00013123349185122327, + "loss": 0.532, + "step": 4591 + }, + { + "epoch": 1.4743939637180927, + "grad_norm": 1.7143136262893677, + "learning_rate": 0.0001312003296688415, + "loss": 0.581, + "step": 4592 + }, + { + "epoch": 1.4747150425429443, + "grad_norm": 0.8613825440406799, + "learning_rate": 0.0001311671636847942, + "loss": 0.5599, + "step": 4593 + }, + { + "epoch": 1.4750361213677958, + "grad_norm": 0.901800274848938, + "learning_rate": 0.00013113399390312256, + "loss": 0.5636, + "step": 4594 + }, + { + "epoch": 1.4753572001926474, + "grad_norm": 0.9919823408126831, + "learning_rate": 0.0001311008203278682, + "loss": 0.6591, + "step": 4595 + }, + { + "epoch": 1.4756782790174987, + "grad_norm": 1.3764430284500122, + "learning_rate": 0.0001310676429630732, + "loss": 0.6994, + "step": 4596 + }, + { + "epoch": 1.4759993578423503, + "grad_norm": 1.025925874710083, + "learning_rate": 0.00013103446181278013, + "loss": 0.571, + "step": 4597 + }, + { + "epoch": 1.4763204366672018, + "grad_norm": 0.9428420662879944, + "learning_rate": 0.00013100127688103205, + "loss": 0.6081, + "step": 4598 + }, + { + "epoch": 1.4766415154920534, + "grad_norm": 0.8529512882232666, + "learning_rate": 0.00013096808817187242, + "loss": 0.4775, + "step": 4599 + }, + { + "epoch": 1.476962594316905, + "grad_norm": 1.0675090551376343, + "learning_rate": 0.0001309348956893452, + "loss": 0.7771, + "step": 4600 + }, + { + "epoch": 1.4772836731417562, + "grad_norm": 1.3736308813095093, + "learning_rate": 0.00013090169943749476, + "loss": 0.6906, + "step": 4601 + }, + { + "epoch": 1.4776047519666078, + "grad_norm": 1.8220711946487427, + "learning_rate": 0.00013086849942036595, + "loss": 0.704, + "step": 4602 + }, + { + "epoch": 1.4779258307914593, + "grad_norm": 0.9620269536972046, + "learning_rate": 0.00013083529564200417, + "loss": 0.526, + "step": 4603 + }, + { + "epoch": 1.4782469096163107, + "grad_norm": 0.6493226289749146, + "learning_rate": 0.00013080208810645514, + "loss": 0.3707, + "step": 4604 + }, + { + "epoch": 1.4785679884411622, + "grad_norm": 1.0979892015457153, + "learning_rate": 0.0001307688768177651, + "loss": 0.6589, + "step": 4605 + }, + { + "epoch": 1.4788890672660138, + "grad_norm": 0.8274226784706116, + "learning_rate": 0.00013073566177998074, + "loss": 0.5467, + "step": 4606 + }, + { + "epoch": 1.4792101460908653, + "grad_norm": 1.2158207893371582, + "learning_rate": 0.0001307024429971492, + "loss": 0.5996, + "step": 4607 + }, + { + "epoch": 1.4795312249157169, + "grad_norm": 1.105692982673645, + "learning_rate": 0.00013066922047331813, + "loss": 0.5918, + "step": 4608 + }, + { + "epoch": 1.4798523037405684, + "grad_norm": 0.8877955675125122, + "learning_rate": 0.00013063599421253558, + "loss": 0.5614, + "step": 4609 + }, + { + "epoch": 1.4801733825654197, + "grad_norm": 0.8180930018424988, + "learning_rate": 0.0001306027642188501, + "loss": 0.5291, + "step": 4610 + }, + { + "epoch": 1.4804944613902713, + "grad_norm": 1.009455680847168, + "learning_rate": 0.00013056953049631057, + "loss": 0.4543, + "step": 4611 + }, + { + "epoch": 1.4808155402151228, + "grad_norm": 0.779489278793335, + "learning_rate": 0.0001305362930489665, + "loss": 0.5272, + "step": 4612 + }, + { + "epoch": 1.4811366190399742, + "grad_norm": 0.9974098801612854, + "learning_rate": 0.0001305030518808678, + "loss": 0.4729, + "step": 4613 + }, + { + "epoch": 1.4814576978648257, + "grad_norm": 0.7673447132110596, + "learning_rate": 0.00013046980699606467, + "loss": 0.345, + "step": 4614 + }, + { + "epoch": 1.4817787766896773, + "grad_norm": 0.6975226402282715, + "learning_rate": 0.00013043655839860803, + "loss": 0.6528, + "step": 4615 + }, + { + "epoch": 1.4820998555145288, + "grad_norm": 0.652308464050293, + "learning_rate": 0.00013040330609254903, + "loss": 0.5996, + "step": 4616 + }, + { + "epoch": 1.4824209343393804, + "grad_norm": 0.7014802694320679, + "learning_rate": 0.00013037005008193942, + "loss": 0.6009, + "step": 4617 + }, + { + "epoch": 1.482742013164232, + "grad_norm": 0.8486183881759644, + "learning_rate": 0.00013033679037083138, + "loss": 0.5923, + "step": 4618 + }, + { + "epoch": 1.4830630919890833, + "grad_norm": 0.7879297137260437, + "learning_rate": 0.00013030352696327742, + "loss": 0.3314, + "step": 4619 + }, + { + "epoch": 1.4833841708139348, + "grad_norm": 0.6520053148269653, + "learning_rate": 0.0001302702598633306, + "loss": 0.3041, + "step": 4620 + }, + { + "epoch": 1.4837052496387864, + "grad_norm": 0.8596171736717224, + "learning_rate": 0.00013023698907504446, + "loss": 0.3313, + "step": 4621 + }, + { + "epoch": 1.4840263284636377, + "grad_norm": 0.5723639130592346, + "learning_rate": 0.0001302037146024729, + "loss": 0.188, + "step": 4622 + }, + { + "epoch": 1.4843474072884892, + "grad_norm": 0.7225738167762756, + "learning_rate": 0.00013017043644967036, + "loss": 0.3561, + "step": 4623 + }, + { + "epoch": 1.4846684861133408, + "grad_norm": 0.9386597871780396, + "learning_rate": 0.00013013715462069166, + "loss": 0.7271, + "step": 4624 + }, + { + "epoch": 1.4849895649381923, + "grad_norm": 1.0117478370666504, + "learning_rate": 0.00013010386911959206, + "loss": 0.806, + "step": 4625 + }, + { + "epoch": 1.485310643763044, + "grad_norm": 0.9228258728981018, + "learning_rate": 0.00013007057995042732, + "loss": 0.5942, + "step": 4626 + }, + { + "epoch": 1.4856317225878954, + "grad_norm": 0.9322078227996826, + "learning_rate": 0.0001300372871172536, + "loss": 0.6389, + "step": 4627 + }, + { + "epoch": 1.4859528014127468, + "grad_norm": 1.0570144653320312, + "learning_rate": 0.00013000399062412763, + "loss": 0.6589, + "step": 4628 + }, + { + "epoch": 1.4862738802375983, + "grad_norm": 0.8913359642028809, + "learning_rate": 0.0001299706904751064, + "loss": 0.6874, + "step": 4629 + }, + { + "epoch": 1.4865949590624499, + "grad_norm": 0.8339388966560364, + "learning_rate": 0.0001299373866742474, + "loss": 0.6398, + "step": 4630 + }, + { + "epoch": 1.4869160378873012, + "grad_norm": 0.6980022192001343, + "learning_rate": 0.00012990407922560868, + "loss": 0.4593, + "step": 4631 + }, + { + "epoch": 1.4872371167121528, + "grad_norm": 0.9570392370223999, + "learning_rate": 0.00012987076813324858, + "loss": 0.5951, + "step": 4632 + }, + { + "epoch": 1.4875581955370043, + "grad_norm": 0.8993356227874756, + "learning_rate": 0.00012983745340122604, + "loss": 0.7096, + "step": 4633 + }, + { + "epoch": 1.4878792743618559, + "grad_norm": 0.9646190404891968, + "learning_rate": 0.0001298041350336003, + "loss": 0.662, + "step": 4634 + }, + { + "epoch": 1.4882003531867074, + "grad_norm": 0.8036277294158936, + "learning_rate": 0.00012977081303443107, + "loss": 0.5762, + "step": 4635 + }, + { + "epoch": 1.488521432011559, + "grad_norm": 1.3247047662734985, + "learning_rate": 0.0001297374874077786, + "loss": 0.6607, + "step": 4636 + }, + { + "epoch": 1.4888425108364103, + "grad_norm": 0.9229692816734314, + "learning_rate": 0.0001297041581577035, + "loss": 0.6021, + "step": 4637 + }, + { + "epoch": 1.4891635896612618, + "grad_norm": 0.8703567981719971, + "learning_rate": 0.00012967082528826684, + "loss": 0.5731, + "step": 4638 + }, + { + "epoch": 1.4894846684861134, + "grad_norm": 1.0840383768081665, + "learning_rate": 0.00012963748880353011, + "loss": 0.6943, + "step": 4639 + }, + { + "epoch": 1.4898057473109647, + "grad_norm": 0.9261624217033386, + "learning_rate": 0.00012960414870755524, + "loss": 0.5193, + "step": 4640 + }, + { + "epoch": 1.4901268261358163, + "grad_norm": 1.1405514478683472, + "learning_rate": 0.00012957080500440468, + "loss": 0.709, + "step": 4641 + }, + { + "epoch": 1.4904479049606678, + "grad_norm": 1.0057867765426636, + "learning_rate": 0.00012953745769814123, + "loss": 0.6075, + "step": 4642 + }, + { + "epoch": 1.4907689837855194, + "grad_norm": 0.86097252368927, + "learning_rate": 0.00012950410679282815, + "loss": 0.4511, + "step": 4643 + }, + { + "epoch": 1.491090062610371, + "grad_norm": 1.2975316047668457, + "learning_rate": 0.00012947075229252915, + "loss": 0.7537, + "step": 4644 + }, + { + "epoch": 1.4914111414352225, + "grad_norm": 1.0226067304611206, + "learning_rate": 0.00012943739420130836, + "loss": 0.497, + "step": 4645 + }, + { + "epoch": 1.4917322202600738, + "grad_norm": 1.0293515920639038, + "learning_rate": 0.0001294040325232304, + "loss": 0.5741, + "step": 4646 + }, + { + "epoch": 1.4920532990849253, + "grad_norm": 0.7942186594009399, + "learning_rate": 0.0001293706672623603, + "loss": 0.5992, + "step": 4647 + }, + { + "epoch": 1.492374377909777, + "grad_norm": 0.8032208681106567, + "learning_rate": 0.00012933729842276343, + "loss": 0.4878, + "step": 4648 + }, + { + "epoch": 1.4926954567346282, + "grad_norm": 0.9675914645195007, + "learning_rate": 0.00012930392600850573, + "loss": 0.5973, + "step": 4649 + }, + { + "epoch": 1.4930165355594798, + "grad_norm": 1.0577799081802368, + "learning_rate": 0.0001292705500236536, + "loss": 0.7173, + "step": 4650 + }, + { + "epoch": 1.4933376143843313, + "grad_norm": 0.9797313213348389, + "learning_rate": 0.00012923717047227368, + "loss": 0.5796, + "step": 4651 + }, + { + "epoch": 1.4936586932091829, + "grad_norm": 1.041603446006775, + "learning_rate": 0.00012920378735843327, + "loss": 0.4257, + "step": 4652 + }, + { + "epoch": 1.4939797720340344, + "grad_norm": 0.7669867873191833, + "learning_rate": 0.0001291704006861999, + "loss": 0.5239, + "step": 4653 + }, + { + "epoch": 1.494300850858886, + "grad_norm": 0.8458186388015747, + "learning_rate": 0.00012913701045964173, + "loss": 0.4882, + "step": 4654 + }, + { + "epoch": 1.4946219296837373, + "grad_norm": 0.9558455944061279, + "learning_rate": 0.00012910361668282719, + "loss": 0.53, + "step": 4655 + }, + { + "epoch": 1.4949430085085889, + "grad_norm": 0.9773117899894714, + "learning_rate": 0.00012907021935982526, + "loss": 0.5155, + "step": 4656 + }, + { + "epoch": 1.4952640873334404, + "grad_norm": 1.0766710042953491, + "learning_rate": 0.00012903681849470527, + "loss": 0.5791, + "step": 4657 + }, + { + "epoch": 1.4955851661582917, + "grad_norm": 0.9035495519638062, + "learning_rate": 0.00012900341409153703, + "loss": 0.5649, + "step": 4658 + }, + { + "epoch": 1.4959062449831433, + "grad_norm": 1.3426756858825684, + "learning_rate": 0.00012897000615439075, + "loss": 0.4312, + "step": 4659 + }, + { + "epoch": 1.4962273238079948, + "grad_norm": 1.0395222902297974, + "learning_rate": 0.0001289365946873371, + "loss": 0.5155, + "step": 4660 + }, + { + "epoch": 1.4965484026328464, + "grad_norm": 0.9974014163017273, + "learning_rate": 0.00012890317969444716, + "loss": 0.6004, + "step": 4661 + }, + { + "epoch": 1.496869481457698, + "grad_norm": 0.9123116135597229, + "learning_rate": 0.00012886976117979247, + "loss": 0.6294, + "step": 4662 + }, + { + "epoch": 1.4971905602825495, + "grad_norm": 0.7099243402481079, + "learning_rate": 0.00012883633914744492, + "loss": 0.3992, + "step": 4663 + }, + { + "epoch": 1.4975116391074008, + "grad_norm": 0.47554677724838257, + "learning_rate": 0.00012880291360147693, + "loss": 0.3285, + "step": 4664 + }, + { + "epoch": 1.4978327179322524, + "grad_norm": 0.7324989438056946, + "learning_rate": 0.00012876948454596128, + "loss": 0.6776, + "step": 4665 + }, + { + "epoch": 1.498153796757104, + "grad_norm": 0.677893877029419, + "learning_rate": 0.00012873605198497124, + "loss": 0.5352, + "step": 4666 + }, + { + "epoch": 1.4984748755819552, + "grad_norm": 0.9297817945480347, + "learning_rate": 0.00012870261592258037, + "loss": 0.451, + "step": 4667 + }, + { + "epoch": 1.4987959544068068, + "grad_norm": 0.8155097961425781, + "learning_rate": 0.00012866917636286286, + "loss": 0.5778, + "step": 4668 + }, + { + "epoch": 1.4991170332316583, + "grad_norm": 0.7590045928955078, + "learning_rate": 0.00012863573330989313, + "loss": 0.3355, + "step": 4669 + }, + { + "epoch": 1.49943811205651, + "grad_norm": 0.9300277233123779, + "learning_rate": 0.0001286022867677462, + "loss": 0.3444, + "step": 4670 + }, + { + "epoch": 1.4997591908813614, + "grad_norm": 0.7806175947189331, + "learning_rate": 0.00012856883674049736, + "loss": 0.4089, + "step": 4671 + }, + { + "epoch": 1.500080269706213, + "grad_norm": 0.7708941102027893, + "learning_rate": 0.00012853538323222242, + "loss": 0.5435, + "step": 4672 + }, + { + "epoch": 1.5004013485310643, + "grad_norm": 1.0800435543060303, + "learning_rate": 0.0001285019262469976, + "loss": 0.6181, + "step": 4673 + }, + { + "epoch": 1.5007224273559159, + "grad_norm": 1.2180079221725464, + "learning_rate": 0.00012846846578889955, + "loss": 0.705, + "step": 4674 + }, + { + "epoch": 1.5010435061807674, + "grad_norm": 1.081411600112915, + "learning_rate": 0.00012843500186200527, + "loss": 0.6284, + "step": 4675 + }, + { + "epoch": 1.5013645850056188, + "grad_norm": 0.7021897435188293, + "learning_rate": 0.00012840153447039228, + "loss": 0.5129, + "step": 4676 + }, + { + "epoch": 1.5016856638304703, + "grad_norm": 0.9453624486923218, + "learning_rate": 0.00012836806361813844, + "loss": 0.5492, + "step": 4677 + }, + { + "epoch": 1.5020067426553219, + "grad_norm": 0.9042197465896606, + "learning_rate": 0.00012833458930932212, + "loss": 0.7457, + "step": 4678 + }, + { + "epoch": 1.5023278214801734, + "grad_norm": 1.1903046369552612, + "learning_rate": 0.00012830111154802203, + "loss": 0.5891, + "step": 4679 + }, + { + "epoch": 1.502648900305025, + "grad_norm": 0.779662013053894, + "learning_rate": 0.00012826763033831735, + "loss": 0.665, + "step": 4680 + }, + { + "epoch": 1.5029699791298765, + "grad_norm": 0.9202207326889038, + "learning_rate": 0.00012823414568428768, + "loss": 0.7502, + "step": 4681 + }, + { + "epoch": 1.5032910579547278, + "grad_norm": 0.9345004558563232, + "learning_rate": 0.00012820065759001293, + "loss": 0.6421, + "step": 4682 + }, + { + "epoch": 1.5036121367795794, + "grad_norm": 0.7800152897834778, + "learning_rate": 0.00012816716605957367, + "loss": 0.5856, + "step": 4683 + }, + { + "epoch": 1.503933215604431, + "grad_norm": 0.8760406374931335, + "learning_rate": 0.00012813367109705063, + "loss": 0.6065, + "step": 4684 + }, + { + "epoch": 1.5042542944292823, + "grad_norm": 0.9938950538635254, + "learning_rate": 0.00012810017270652513, + "loss": 0.7377, + "step": 4685 + }, + { + "epoch": 1.5045753732541338, + "grad_norm": 0.9110274910926819, + "learning_rate": 0.0001280666708920788, + "loss": 0.6981, + "step": 4686 + }, + { + "epoch": 1.5048964520789854, + "grad_norm": 0.902837336063385, + "learning_rate": 0.00012803316565779377, + "loss": 0.6673, + "step": 4687 + }, + { + "epoch": 1.505217530903837, + "grad_norm": 0.8513844013214111, + "learning_rate": 0.0001279996570077525, + "loss": 0.623, + "step": 4688 + }, + { + "epoch": 1.5055386097286885, + "grad_norm": 0.8973796963691711, + "learning_rate": 0.000127966144946038, + "loss": 0.5205, + "step": 4689 + }, + { + "epoch": 1.50585968855354, + "grad_norm": 0.9134697318077087, + "learning_rate": 0.00012793262947673355, + "loss": 0.6839, + "step": 4690 + }, + { + "epoch": 1.5061807673783913, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.00012789911060392294, + "loss": 0.8083, + "step": 4691 + }, + { + "epoch": 1.506501846203243, + "grad_norm": 0.8523952960968018, + "learning_rate": 0.00012786558833169031, + "loss": 0.4945, + "step": 4692 + }, + { + "epoch": 1.5068229250280945, + "grad_norm": 1.019515037536621, + "learning_rate": 0.0001278320626641203, + "loss": 0.5435, + "step": 4693 + }, + { + "epoch": 1.5071440038529458, + "grad_norm": 0.8791556358337402, + "learning_rate": 0.00012779853360529785, + "loss": 0.6037, + "step": 4694 + }, + { + "epoch": 1.5074650826777973, + "grad_norm": 0.9916840195655823, + "learning_rate": 0.00012776500115930842, + "loss": 0.581, + "step": 4695 + }, + { + "epoch": 1.5077861615026489, + "grad_norm": 0.7988699674606323, + "learning_rate": 0.00012773146533023782, + "loss": 0.5452, + "step": 4696 + }, + { + "epoch": 1.5081072403275004, + "grad_norm": 0.7620382308959961, + "learning_rate": 0.00012769792612217224, + "loss": 0.5199, + "step": 4697 + }, + { + "epoch": 1.508428319152352, + "grad_norm": 1.141939401626587, + "learning_rate": 0.0001276643835391984, + "loss": 0.7435, + "step": 4698 + }, + { + "epoch": 1.5087493979772035, + "grad_norm": 0.9198951125144958, + "learning_rate": 0.00012763083758540337, + "loss": 0.6203, + "step": 4699 + }, + { + "epoch": 1.5090704768020549, + "grad_norm": 0.7245876789093018, + "learning_rate": 0.0001275972882648746, + "loss": 0.5134, + "step": 4700 + }, + { + "epoch": 1.5093915556269064, + "grad_norm": 0.7252978086471558, + "learning_rate": 0.0001275637355816999, + "loss": 0.4891, + "step": 4701 + }, + { + "epoch": 1.5097126344517577, + "grad_norm": 0.7837473750114441, + "learning_rate": 0.0001275301795399677, + "loss": 0.4528, + "step": 4702 + }, + { + "epoch": 1.5100337132766093, + "grad_norm": 0.8873403668403625, + "learning_rate": 0.00012749662014376663, + "loss": 0.5449, + "step": 4703 + }, + { + "epoch": 1.5103547921014608, + "grad_norm": 0.9590548872947693, + "learning_rate": 0.00012746305739718577, + "loss": 0.5991, + "step": 4704 + }, + { + "epoch": 1.5106758709263124, + "grad_norm": 1.017732858657837, + "learning_rate": 0.00012742949130431467, + "loss": 0.5145, + "step": 4705 + }, + { + "epoch": 1.510996949751164, + "grad_norm": 0.720311164855957, + "learning_rate": 0.00012739592186924328, + "loss": 0.4315, + "step": 4706 + }, + { + "epoch": 1.5113180285760155, + "grad_norm": 0.7741522192955017, + "learning_rate": 0.00012736234909606185, + "loss": 0.529, + "step": 4707 + }, + { + "epoch": 1.511639107400867, + "grad_norm": 1.115466594696045, + "learning_rate": 0.00012732877298886124, + "loss": 0.6952, + "step": 4708 + }, + { + "epoch": 1.5119601862257184, + "grad_norm": 0.8059918880462646, + "learning_rate": 0.00012729519355173254, + "loss": 0.5197, + "step": 4709 + }, + { + "epoch": 1.51228126505057, + "grad_norm": 1.1203827857971191, + "learning_rate": 0.0001272616107887673, + "loss": 0.5233, + "step": 4710 + }, + { + "epoch": 1.5126023438754213, + "grad_norm": 1.13717520236969, + "learning_rate": 0.00012722802470405744, + "loss": 0.5846, + "step": 4711 + }, + { + "epoch": 1.5129234227002728, + "grad_norm": 0.5460472106933594, + "learning_rate": 0.0001271944353016954, + "loss": 0.3628, + "step": 4712 + }, + { + "epoch": 1.5132445015251244, + "grad_norm": 0.9577854871749878, + "learning_rate": 0.00012716084258577388, + "loss": 0.5651, + "step": 4713 + }, + { + "epoch": 1.513565580349976, + "grad_norm": 0.5291711091995239, + "learning_rate": 0.0001271272465603861, + "loss": 0.3436, + "step": 4714 + }, + { + "epoch": 1.5138866591748275, + "grad_norm": 0.6409555673599243, + "learning_rate": 0.0001270936472296256, + "loss": 0.7887, + "step": 4715 + }, + { + "epoch": 1.514207737999679, + "grad_norm": 0.6114294528961182, + "learning_rate": 0.00012706004459758636, + "loss": 0.7388, + "step": 4716 + }, + { + "epoch": 1.5145288168245306, + "grad_norm": 0.6325629353523254, + "learning_rate": 0.00012702643866836278, + "loss": 0.416, + "step": 4717 + }, + { + "epoch": 1.5148498956493819, + "grad_norm": 0.9128963351249695, + "learning_rate": 0.00012699282944604967, + "loss": 0.4181, + "step": 4718 + }, + { + "epoch": 1.5151709744742334, + "grad_norm": 0.7952489256858826, + "learning_rate": 0.00012695921693474212, + "loss": 0.2609, + "step": 4719 + }, + { + "epoch": 1.5154920532990848, + "grad_norm": 0.715871274471283, + "learning_rate": 0.00012692560113853574, + "loss": 0.269, + "step": 4720 + }, + { + "epoch": 1.5158131321239363, + "grad_norm": 0.9184644222259521, + "learning_rate": 0.00012689198206152657, + "loss": 0.5186, + "step": 4721 + }, + { + "epoch": 1.5161342109487879, + "grad_norm": 1.1602627038955688, + "learning_rate": 0.00012685835970781097, + "loss": 0.736, + "step": 4722 + }, + { + "epoch": 1.5164552897736394, + "grad_norm": 1.0593267679214478, + "learning_rate": 0.0001268247340814857, + "loss": 0.6185, + "step": 4723 + }, + { + "epoch": 1.516776368598491, + "grad_norm": 0.8425109386444092, + "learning_rate": 0.00012679110518664795, + "loss": 0.555, + "step": 4724 + }, + { + "epoch": 1.5170974474233425, + "grad_norm": 1.03610360622406, + "learning_rate": 0.00012675747302739527, + "loss": 0.8006, + "step": 4725 + }, + { + "epoch": 1.517418526248194, + "grad_norm": 0.6738163828849792, + "learning_rate": 0.00012672383760782568, + "loss": 0.431, + "step": 4726 + }, + { + "epoch": 1.5177396050730454, + "grad_norm": 1.9449659585952759, + "learning_rate": 0.00012669019893203759, + "loss": 0.658, + "step": 4727 + }, + { + "epoch": 1.518060683897897, + "grad_norm": 0.9231836795806885, + "learning_rate": 0.00012665655700412967, + "loss": 0.6246, + "step": 4728 + }, + { + "epoch": 1.5183817627227483, + "grad_norm": 0.884365975856781, + "learning_rate": 0.00012662291182820114, + "loss": 0.6044, + "step": 4729 + }, + { + "epoch": 1.5187028415475998, + "grad_norm": 0.9164744019508362, + "learning_rate": 0.00012658926340835156, + "loss": 0.6805, + "step": 4730 + }, + { + "epoch": 1.5190239203724514, + "grad_norm": 1.071161150932312, + "learning_rate": 0.00012655561174868088, + "loss": 0.6354, + "step": 4731 + }, + { + "epoch": 1.519344999197303, + "grad_norm": 1.1153299808502197, + "learning_rate": 0.00012652195685328946, + "loss": 0.8903, + "step": 4732 + }, + { + "epoch": 1.5196660780221545, + "grad_norm": 1.0313321352005005, + "learning_rate": 0.00012648829872627807, + "loss": 0.5328, + "step": 4733 + }, + { + "epoch": 1.519987156847006, + "grad_norm": 1.025863528251648, + "learning_rate": 0.00012645463737174782, + "loss": 0.6995, + "step": 4734 + }, + { + "epoch": 1.5203082356718576, + "grad_norm": 0.8044242858886719, + "learning_rate": 0.00012642097279380027, + "loss": 0.6276, + "step": 4735 + }, + { + "epoch": 1.520629314496709, + "grad_norm": 1.842185139656067, + "learning_rate": 0.0001263873049965373, + "loss": 0.6605, + "step": 4736 + }, + { + "epoch": 1.5209503933215605, + "grad_norm": 1.1269233226776123, + "learning_rate": 0.00012635363398406127, + "loss": 0.6219, + "step": 4737 + }, + { + "epoch": 1.5212714721464118, + "grad_norm": 1.2168982028961182, + "learning_rate": 0.00012631995976047488, + "loss": 0.9384, + "step": 4738 + }, + { + "epoch": 1.5215925509712633, + "grad_norm": 0.9263548254966736, + "learning_rate": 0.0001262862823298812, + "loss": 0.6786, + "step": 4739 + }, + { + "epoch": 1.5219136297961149, + "grad_norm": 0.9486871361732483, + "learning_rate": 0.00012625260169638378, + "loss": 0.7042, + "step": 4740 + }, + { + "epoch": 1.5222347086209664, + "grad_norm": 1.2256191968917847, + "learning_rate": 0.00012621891786408648, + "loss": 0.8557, + "step": 4741 + }, + { + "epoch": 1.522555787445818, + "grad_norm": 0.9984161853790283, + "learning_rate": 0.00012618523083709357, + "loss": 0.6621, + "step": 4742 + }, + { + "epoch": 1.5228768662706695, + "grad_norm": 1.0978655815124512, + "learning_rate": 0.00012615154061950968, + "loss": 0.6981, + "step": 4743 + }, + { + "epoch": 1.523197945095521, + "grad_norm": 0.8551576137542725, + "learning_rate": 0.00012611784721543995, + "loss": 0.6271, + "step": 4744 + }, + { + "epoch": 1.5235190239203724, + "grad_norm": 0.907645583152771, + "learning_rate": 0.00012608415062898972, + "loss": 0.6353, + "step": 4745 + }, + { + "epoch": 1.523840102745224, + "grad_norm": 1.3650954961776733, + "learning_rate": 0.00012605045086426487, + "loss": 0.5611, + "step": 4746 + }, + { + "epoch": 1.5241611815700753, + "grad_norm": 0.9021798372268677, + "learning_rate": 0.00012601674792537157, + "loss": 0.6619, + "step": 4747 + }, + { + "epoch": 1.5244822603949268, + "grad_norm": 1.0378648042678833, + "learning_rate": 0.00012598304181641646, + "loss": 0.6465, + "step": 4748 + }, + { + "epoch": 1.5248033392197784, + "grad_norm": 1.0701501369476318, + "learning_rate": 0.00012594933254150655, + "loss": 0.6018, + "step": 4749 + }, + { + "epoch": 1.52512441804463, + "grad_norm": 0.9513067603111267, + "learning_rate": 0.00012591562010474914, + "loss": 0.5637, + "step": 4750 + }, + { + "epoch": 1.5254454968694815, + "grad_norm": 0.8903157114982605, + "learning_rate": 0.00012588190451025207, + "loss": 0.6278, + "step": 4751 + }, + { + "epoch": 1.525766575694333, + "grad_norm": 1.711920142173767, + "learning_rate": 0.00012584818576212345, + "loss": 0.6439, + "step": 4752 + }, + { + "epoch": 1.5260876545191846, + "grad_norm": 1.3381930589675903, + "learning_rate": 0.0001258144638644718, + "loss": 0.6182, + "step": 4753 + }, + { + "epoch": 1.526408733344036, + "grad_norm": 0.8519673347473145, + "learning_rate": 0.000125780738821406, + "loss": 0.5031, + "step": 4754 + }, + { + "epoch": 1.5267298121688875, + "grad_norm": 1.123969554901123, + "learning_rate": 0.0001257470106370354, + "loss": 0.702, + "step": 4755 + }, + { + "epoch": 1.5270508909937388, + "grad_norm": 0.7065092921257019, + "learning_rate": 0.00012571327931546963, + "loss": 0.438, + "step": 4756 + }, + { + "epoch": 1.5273719698185904, + "grad_norm": 1.5530750751495361, + "learning_rate": 0.00012567954486081878, + "loss": 0.5497, + "step": 4757 + }, + { + "epoch": 1.527693048643442, + "grad_norm": 1.869837760925293, + "learning_rate": 0.0001256458072771933, + "loss": 0.5909, + "step": 4758 + }, + { + "epoch": 1.5280141274682935, + "grad_norm": 0.8413248658180237, + "learning_rate": 0.00012561206656870396, + "loss": 0.5934, + "step": 4759 + }, + { + "epoch": 1.528335206293145, + "grad_norm": 0.7256144285202026, + "learning_rate": 0.00012557832273946205, + "loss": 0.4381, + "step": 4760 + }, + { + "epoch": 1.5286562851179966, + "grad_norm": 1.1708571910858154, + "learning_rate": 0.00012554457579357905, + "loss": 0.5908, + "step": 4761 + }, + { + "epoch": 1.5289773639428481, + "grad_norm": 0.7625494003295898, + "learning_rate": 0.00012551082573516705, + "loss": 0.4367, + "step": 4762 + }, + { + "epoch": 1.5292984427676994, + "grad_norm": 0.81993567943573, + "learning_rate": 0.00012547707256833823, + "loss": 0.5166, + "step": 4763 + }, + { + "epoch": 1.529619521592551, + "grad_norm": 0.6082926392555237, + "learning_rate": 0.00012544331629720543, + "loss": 0.4116, + "step": 4764 + }, + { + "epoch": 1.5299406004174023, + "grad_norm": 0.818605899810791, + "learning_rate": 0.00012540955692588173, + "loss": 0.6703, + "step": 4765 + }, + { + "epoch": 1.5302616792422539, + "grad_norm": 0.6201203465461731, + "learning_rate": 0.00012537579445848058, + "loss": 1.0005, + "step": 4766 + }, + { + "epoch": 1.5305827580671054, + "grad_norm": 0.7237659692764282, + "learning_rate": 0.00012534202889911584, + "loss": 0.6653, + "step": 4767 + }, + { + "epoch": 1.530903836891957, + "grad_norm": 0.7227284908294678, + "learning_rate": 0.00012530826025190174, + "loss": 0.5275, + "step": 4768 + }, + { + "epoch": 1.5312249157168085, + "grad_norm": 0.7782474160194397, + "learning_rate": 0.00012527448852095295, + "loss": 0.4055, + "step": 4769 + }, + { + "epoch": 1.53154599454166, + "grad_norm": 0.8507752418518066, + "learning_rate": 0.00012524071371038434, + "loss": 0.2621, + "step": 4770 + }, + { + "epoch": 1.5318670733665116, + "grad_norm": 0.8687464594841003, + "learning_rate": 0.0001252069358243114, + "loss": 0.3743, + "step": 4771 + }, + { + "epoch": 1.532188152191363, + "grad_norm": 0.7505998015403748, + "learning_rate": 0.00012517315486684972, + "loss": 0.4383, + "step": 4772 + }, + { + "epoch": 1.5325092310162145, + "grad_norm": 0.9363805055618286, + "learning_rate": 0.0001251393708421155, + "loss": 0.7721, + "step": 4773 + }, + { + "epoch": 1.5328303098410658, + "grad_norm": 0.9391258955001831, + "learning_rate": 0.00012510558375422522, + "loss": 0.7531, + "step": 4774 + }, + { + "epoch": 1.5331513886659174, + "grad_norm": 0.9769285917282104, + "learning_rate": 0.0001250717936072957, + "loss": 0.8322, + "step": 4775 + }, + { + "epoch": 1.533472467490769, + "grad_norm": 0.670604944229126, + "learning_rate": 0.00012503800040544416, + "loss": 0.4766, + "step": 4776 + }, + { + "epoch": 1.5337935463156205, + "grad_norm": 0.7332351803779602, + "learning_rate": 0.00012500420415278822, + "loss": 0.6418, + "step": 4777 + }, + { + "epoch": 1.534114625140472, + "grad_norm": 0.7202760577201843, + "learning_rate": 0.00012497040485344584, + "loss": 0.5964, + "step": 4778 + }, + { + "epoch": 1.5344357039653236, + "grad_norm": 0.9445421695709229, + "learning_rate": 0.0001249366025115354, + "loss": 0.6184, + "step": 4779 + }, + { + "epoch": 1.5347567827901751, + "grad_norm": 0.7218775749206543, + "learning_rate": 0.0001249027971311756, + "loss": 0.5217, + "step": 4780 + }, + { + "epoch": 1.5350778616150265, + "grad_norm": 0.7306594252586365, + "learning_rate": 0.0001248689887164855, + "loss": 0.5849, + "step": 4781 + }, + { + "epoch": 1.535398940439878, + "grad_norm": 0.942954421043396, + "learning_rate": 0.00012483517727158454, + "loss": 0.6694, + "step": 4782 + }, + { + "epoch": 1.5357200192647293, + "grad_norm": 0.7879648208618164, + "learning_rate": 0.00012480136280059256, + "loss": 0.5698, + "step": 4783 + }, + { + "epoch": 1.536041098089581, + "grad_norm": 0.835502028465271, + "learning_rate": 0.00012476754530762977, + "loss": 0.63, + "step": 4784 + }, + { + "epoch": 1.5363621769144324, + "grad_norm": 0.9831227660179138, + "learning_rate": 0.00012473372479681672, + "loss": 0.7113, + "step": 4785 + }, + { + "epoch": 1.536683255739284, + "grad_norm": 0.9741390347480774, + "learning_rate": 0.00012469990127227432, + "loss": 0.6879, + "step": 4786 + }, + { + "epoch": 1.5370043345641355, + "grad_norm": 0.7839673757553101, + "learning_rate": 0.00012466607473812387, + "loss": 0.5085, + "step": 4787 + }, + { + "epoch": 1.537325413388987, + "grad_norm": 0.9261371493339539, + "learning_rate": 0.000124632245198487, + "loss": 0.6465, + "step": 4788 + }, + { + "epoch": 1.5376464922138386, + "grad_norm": 1.3706035614013672, + "learning_rate": 0.0001245984126574858, + "loss": 0.8686, + "step": 4789 + }, + { + "epoch": 1.53796757103869, + "grad_norm": 0.6611718535423279, + "learning_rate": 0.00012456457711924266, + "loss": 0.3897, + "step": 4790 + }, + { + "epoch": 1.5382886498635415, + "grad_norm": 0.9132871031761169, + "learning_rate": 0.00012453073858788026, + "loss": 0.6996, + "step": 4791 + }, + { + "epoch": 1.5386097286883929, + "grad_norm": 0.9300896525382996, + "learning_rate": 0.00012449689706752178, + "loss": 0.7055, + "step": 4792 + }, + { + "epoch": 1.5389308075132444, + "grad_norm": 0.7551592588424683, + "learning_rate": 0.00012446305256229073, + "loss": 0.6199, + "step": 4793 + }, + { + "epoch": 1.539251886338096, + "grad_norm": 0.9444193243980408, + "learning_rate": 0.00012442920507631092, + "loss": 0.6397, + "step": 4794 + }, + { + "epoch": 1.5395729651629475, + "grad_norm": 0.8001585006713867, + "learning_rate": 0.00012439535461370658, + "loss": 0.4635, + "step": 4795 + }, + { + "epoch": 1.539894043987799, + "grad_norm": 0.9087579250335693, + "learning_rate": 0.00012436150117860225, + "loss": 0.5755, + "step": 4796 + }, + { + "epoch": 1.5402151228126506, + "grad_norm": 0.9415332674980164, + "learning_rate": 0.00012432764477512292, + "loss": 0.6303, + "step": 4797 + }, + { + "epoch": 1.5405362016375022, + "grad_norm": 0.8173394203186035, + "learning_rate": 0.00012429378540739386, + "loss": 0.5832, + "step": 4798 + }, + { + "epoch": 1.5408572804623535, + "grad_norm": 1.0621390342712402, + "learning_rate": 0.00012425992307954075, + "loss": 0.6614, + "step": 4799 + }, + { + "epoch": 1.541178359287205, + "grad_norm": 1.3637477159500122, + "learning_rate": 0.0001242260577956896, + "loss": 0.6073, + "step": 4800 + }, + { + "epoch": 1.5414994381120564, + "grad_norm": 1.0318750143051147, + "learning_rate": 0.00012419218955996676, + "loss": 0.6623, + "step": 4801 + }, + { + "epoch": 1.541820516936908, + "grad_norm": 1.1521292924880981, + "learning_rate": 0.00012415831837649905, + "loss": 0.5285, + "step": 4802 + }, + { + "epoch": 1.5421415957617595, + "grad_norm": 1.148503065109253, + "learning_rate": 0.0001241244442494135, + "loss": 0.5444, + "step": 4803 + }, + { + "epoch": 1.542462674586611, + "grad_norm": 0.9957001805305481, + "learning_rate": 0.0001240905671828376, + "loss": 0.5206, + "step": 4804 + }, + { + "epoch": 1.5427837534114626, + "grad_norm": 0.6883019804954529, + "learning_rate": 0.00012405668718089917, + "loss": 0.4488, + "step": 4805 + }, + { + "epoch": 1.5431048322363141, + "grad_norm": 1.0081177949905396, + "learning_rate": 0.00012402280424772639, + "loss": 0.6462, + "step": 4806 + }, + { + "epoch": 1.5434259110611657, + "grad_norm": 0.8516654372215271, + "learning_rate": 0.00012398891838744778, + "loss": 0.5383, + "step": 4807 + }, + { + "epoch": 1.543746989886017, + "grad_norm": 0.8348969221115112, + "learning_rate": 0.0001239550296041922, + "loss": 0.484, + "step": 4808 + }, + { + "epoch": 1.5440680687108685, + "grad_norm": 1.3261665105819702, + "learning_rate": 0.00012392113790208895, + "loss": 0.5407, + "step": 4809 + }, + { + "epoch": 1.5443891475357199, + "grad_norm": 1.114880919456482, + "learning_rate": 0.0001238872432852676, + "loss": 0.7105, + "step": 4810 + }, + { + "epoch": 1.5447102263605714, + "grad_norm": 0.8749669790267944, + "learning_rate": 0.0001238533457578581, + "loss": 0.4329, + "step": 4811 + }, + { + "epoch": 1.545031305185423, + "grad_norm": 0.7413171529769897, + "learning_rate": 0.0001238194453239908, + "loss": 0.3563, + "step": 4812 + }, + { + "epoch": 1.5453523840102745, + "grad_norm": 0.7285036444664001, + "learning_rate": 0.0001237855419877963, + "loss": 0.4244, + "step": 4813 + }, + { + "epoch": 1.545673462835126, + "grad_norm": 0.6822255253791809, + "learning_rate": 0.0001237516357534057, + "loss": 0.3974, + "step": 4814 + }, + { + "epoch": 1.5459945416599776, + "grad_norm": 0.44284263253211975, + "learning_rate": 0.0001237177266249503, + "loss": 0.6239, + "step": 4815 + }, + { + "epoch": 1.5463156204848292, + "grad_norm": 0.6735876798629761, + "learning_rate": 0.0001236838146065619, + "loss": 1.0601, + "step": 4816 + }, + { + "epoch": 1.5466366993096805, + "grad_norm": 0.6792389154434204, + "learning_rate": 0.00012364989970237248, + "loss": 0.6631, + "step": 4817 + }, + { + "epoch": 1.546957778134532, + "grad_norm": 0.8029338717460632, + "learning_rate": 0.00012361598191651453, + "loss": 0.446, + "step": 4818 + }, + { + "epoch": 1.5472788569593834, + "grad_norm": 0.5691446661949158, + "learning_rate": 0.00012358206125312083, + "loss": 0.2494, + "step": 4819 + }, + { + "epoch": 1.547599935784235, + "grad_norm": 0.7446752786636353, + "learning_rate": 0.00012354813771632447, + "loss": 0.3416, + "step": 4820 + }, + { + "epoch": 1.5479210146090865, + "grad_norm": 0.5998992323875427, + "learning_rate": 0.000123514211310259, + "loss": 0.2877, + "step": 4821 + }, + { + "epoch": 1.548242093433938, + "grad_norm": 0.9401177763938904, + "learning_rate": 0.00012348028203905818, + "loss": 0.7846, + "step": 4822 + }, + { + "epoch": 1.5485631722587896, + "grad_norm": 0.9916895627975464, + "learning_rate": 0.00012344634990685624, + "loss": 0.8004, + "step": 4823 + }, + { + "epoch": 1.5488842510836411, + "grad_norm": 0.8018080592155457, + "learning_rate": 0.0001234124149177877, + "loss": 0.6483, + "step": 4824 + }, + { + "epoch": 1.5492053299084927, + "grad_norm": 0.9374871850013733, + "learning_rate": 0.00012337847707598738, + "loss": 0.7748, + "step": 4825 + }, + { + "epoch": 1.549526408733344, + "grad_norm": 0.975395917892456, + "learning_rate": 0.00012334453638559057, + "loss": 0.6303, + "step": 4826 + }, + { + "epoch": 1.5498474875581956, + "grad_norm": 1.0059309005737305, + "learning_rate": 0.0001233105928507328, + "loss": 0.4923, + "step": 4827 + }, + { + "epoch": 1.550168566383047, + "grad_norm": 0.9571434259414673, + "learning_rate": 0.00012327664647554998, + "loss": 0.7088, + "step": 4828 + }, + { + "epoch": 1.5504896452078984, + "grad_norm": 0.9377606511116028, + "learning_rate": 0.0001232426972641784, + "loss": 0.7066, + "step": 4829 + }, + { + "epoch": 1.55081072403275, + "grad_norm": 0.748374342918396, + "learning_rate": 0.00012320874522075468, + "loss": 0.4916, + "step": 4830 + }, + { + "epoch": 1.5511318028576015, + "grad_norm": 0.8802591562271118, + "learning_rate": 0.00012317479034941573, + "loss": 0.5684, + "step": 4831 + }, + { + "epoch": 1.551452881682453, + "grad_norm": 0.7750247120857239, + "learning_rate": 0.0001231408326542989, + "loss": 0.5093, + "step": 4832 + }, + { + "epoch": 1.5517739605073047, + "grad_norm": 0.7303853034973145, + "learning_rate": 0.00012310687213954182, + "loss": 0.582, + "step": 4833 + }, + { + "epoch": 1.5520950393321562, + "grad_norm": 0.9113678932189941, + "learning_rate": 0.0001230729088092824, + "loss": 0.5847, + "step": 4834 + }, + { + "epoch": 1.5524161181570075, + "grad_norm": 0.9570289254188538, + "learning_rate": 0.00012303894266765908, + "loss": 0.622, + "step": 4835 + }, + { + "epoch": 1.552737196981859, + "grad_norm": 0.9025993347167969, + "learning_rate": 0.00012300497371881046, + "loss": 0.5098, + "step": 4836 + }, + { + "epoch": 1.5530582758067104, + "grad_norm": 1.2178800106048584, + "learning_rate": 0.00012297100196687557, + "loss": 0.6402, + "step": 4837 + }, + { + "epoch": 1.553379354631562, + "grad_norm": 0.9967452883720398, + "learning_rate": 0.00012293702741599378, + "loss": 0.6723, + "step": 4838 + }, + { + "epoch": 1.5537004334564135, + "grad_norm": 0.7148991823196411, + "learning_rate": 0.00012290305007030478, + "loss": 0.5875, + "step": 4839 + }, + { + "epoch": 1.554021512281265, + "grad_norm": 1.1900482177734375, + "learning_rate": 0.00012286906993394856, + "loss": 0.6827, + "step": 4840 + }, + { + "epoch": 1.5543425911061166, + "grad_norm": 1.2599159479141235, + "learning_rate": 0.00012283508701106557, + "loss": 0.8436, + "step": 4841 + }, + { + "epoch": 1.5546636699309682, + "grad_norm": 1.0731333494186401, + "learning_rate": 0.0001228011013057965, + "loss": 0.7042, + "step": 4842 + }, + { + "epoch": 1.5549847487558197, + "grad_norm": 0.7968559861183167, + "learning_rate": 0.0001227671128222824, + "loss": 0.5391, + "step": 4843 + }, + { + "epoch": 1.555305827580671, + "grad_norm": 1.2837064266204834, + "learning_rate": 0.00012273312156466464, + "loss": 0.8585, + "step": 4844 + }, + { + "epoch": 1.5556269064055226, + "grad_norm": 0.7288888692855835, + "learning_rate": 0.00012269912753708502, + "loss": 0.5193, + "step": 4845 + }, + { + "epoch": 1.555947985230374, + "grad_norm": 0.8178963661193848, + "learning_rate": 0.0001226651307436855, + "loss": 0.5001, + "step": 4846 + }, + { + "epoch": 1.5562690640552255, + "grad_norm": 1.0722037553787231, + "learning_rate": 0.0001226311311886086, + "loss": 0.5924, + "step": 4847 + }, + { + "epoch": 1.556590142880077, + "grad_norm": 0.9165698885917664, + "learning_rate": 0.00012259712887599698, + "loss": 0.5656, + "step": 4848 + }, + { + "epoch": 1.5569112217049286, + "grad_norm": 1.0873520374298096, + "learning_rate": 0.00012256312380999376, + "loss": 0.7788, + "step": 4849 + }, + { + "epoch": 1.5572323005297801, + "grad_norm": 1.1277052164077759, + "learning_rate": 0.00012252911599474237, + "loss": 0.6183, + "step": 4850 + }, + { + "epoch": 1.5575533793546317, + "grad_norm": 0.8544272780418396, + "learning_rate": 0.0001224951054343865, + "loss": 0.5859, + "step": 4851 + }, + { + "epoch": 1.5578744581794832, + "grad_norm": 1.0256620645523071, + "learning_rate": 0.0001224610921330703, + "loss": 0.663, + "step": 4852 + }, + { + "epoch": 1.5581955370043346, + "grad_norm": 0.9969653487205505, + "learning_rate": 0.00012242707609493814, + "loss": 0.6084, + "step": 4853 + }, + { + "epoch": 1.558516615829186, + "grad_norm": 0.8547719717025757, + "learning_rate": 0.00012239305732413477, + "loss": 0.5886, + "step": 4854 + }, + { + "epoch": 1.5588376946540374, + "grad_norm": 1.0089030265808105, + "learning_rate": 0.0001223590358248053, + "loss": 0.6111, + "step": 4855 + }, + { + "epoch": 1.559158773478889, + "grad_norm": 1.0773385763168335, + "learning_rate": 0.00012232501160109514, + "loss": 0.5771, + "step": 4856 + }, + { + "epoch": 1.5594798523037405, + "grad_norm": 0.8894745707511902, + "learning_rate": 0.00012229098465715006, + "loss": 0.5668, + "step": 4857 + }, + { + "epoch": 1.559800931128592, + "grad_norm": 0.9482760429382324, + "learning_rate": 0.00012225695499711606, + "loss": 0.4387, + "step": 4858 + }, + { + "epoch": 1.5601220099534436, + "grad_norm": 1.192726969718933, + "learning_rate": 0.00012222292262513965, + "loss": 0.5511, + "step": 4859 + }, + { + "epoch": 1.5604430887782952, + "grad_norm": 0.9910078644752502, + "learning_rate": 0.0001221888875453675, + "loss": 0.6603, + "step": 4860 + }, + { + "epoch": 1.5607641676031467, + "grad_norm": 1.0606755018234253, + "learning_rate": 0.00012215484976194676, + "loss": 0.534, + "step": 4861 + }, + { + "epoch": 1.561085246427998, + "grad_norm": 1.2194321155548096, + "learning_rate": 0.00012212080927902474, + "loss": 0.4939, + "step": 4862 + }, + { + "epoch": 1.5614063252528496, + "grad_norm": 0.4965793490409851, + "learning_rate": 0.00012208676610074917, + "loss": 0.3877, + "step": 4863 + }, + { + "epoch": 1.561727404077701, + "grad_norm": 0.7706876993179321, + "learning_rate": 0.00012205272023126821, + "loss": 0.4616, + "step": 4864 + }, + { + "epoch": 1.5620484829025525, + "grad_norm": 0.7944642305374146, + "learning_rate": 0.00012201867167473015, + "loss": 0.9236, + "step": 4865 + }, + { + "epoch": 1.562369561727404, + "grad_norm": 0.7954860329627991, + "learning_rate": 0.00012198462043528376, + "loss": 0.9766, + "step": 4866 + }, + { + "epoch": 1.5626906405522556, + "grad_norm": 0.6089110970497131, + "learning_rate": 0.00012195056651707806, + "loss": 0.7265, + "step": 4867 + }, + { + "epoch": 1.5630117193771071, + "grad_norm": 0.6708793640136719, + "learning_rate": 0.00012191650992426238, + "loss": 0.3664, + "step": 4868 + }, + { + "epoch": 1.5633327982019587, + "grad_norm": 1.029671549797058, + "learning_rate": 0.00012188245066098647, + "loss": 0.4111, + "step": 4869 + }, + { + "epoch": 1.5636538770268102, + "grad_norm": 0.8126581311225891, + "learning_rate": 0.00012184838873140032, + "loss": 0.3168, + "step": 4870 + }, + { + "epoch": 1.5639749558516616, + "grad_norm": 0.7029819488525391, + "learning_rate": 0.00012181432413965428, + "loss": 0.4029, + "step": 4871 + }, + { + "epoch": 1.5642960346765131, + "grad_norm": 0.7372133135795593, + "learning_rate": 0.000121780256889899, + "loss": 0.7746, + "step": 4872 + }, + { + "epoch": 1.5646171135013645, + "grad_norm": 0.8719139099121094, + "learning_rate": 0.00012174618698628549, + "loss": 0.6874, + "step": 4873 + }, + { + "epoch": 1.564938192326216, + "grad_norm": 0.8879600167274475, + "learning_rate": 0.00012171211443296505, + "loss": 0.6931, + "step": 4874 + }, + { + "epoch": 1.5652592711510676, + "grad_norm": 0.8598337769508362, + "learning_rate": 0.00012167803923408934, + "loss": 0.5866, + "step": 4875 + }, + { + "epoch": 1.565580349975919, + "grad_norm": 0.6961860656738281, + "learning_rate": 0.00012164396139381029, + "loss": 0.4646, + "step": 4876 + }, + { + "epoch": 1.5659014288007707, + "grad_norm": 0.7586543560028076, + "learning_rate": 0.00012160988091628022, + "loss": 0.5582, + "step": 4877 + }, + { + "epoch": 1.5662225076256222, + "grad_norm": 0.82035231590271, + "learning_rate": 0.00012157579780565169, + "loss": 0.5765, + "step": 4878 + }, + { + "epoch": 1.5665435864504738, + "grad_norm": 0.6588397026062012, + "learning_rate": 0.00012154171206607764, + "loss": 0.4265, + "step": 4879 + }, + { + "epoch": 1.566864665275325, + "grad_norm": 0.7135642766952515, + "learning_rate": 0.00012150762370171136, + "loss": 0.4743, + "step": 4880 + }, + { + "epoch": 1.5671857441001766, + "grad_norm": 0.7233707308769226, + "learning_rate": 0.00012147353271670634, + "loss": 0.474, + "step": 4881 + }, + { + "epoch": 1.567506822925028, + "grad_norm": 0.773969829082489, + "learning_rate": 0.00012143943911521647, + "loss": 0.5916, + "step": 4882 + }, + { + "epoch": 1.5678279017498795, + "grad_norm": 0.9881671667098999, + "learning_rate": 0.000121405342901396, + "loss": 0.6945, + "step": 4883 + }, + { + "epoch": 1.568148980574731, + "grad_norm": 1.0888925790786743, + "learning_rate": 0.00012137124407939943, + "loss": 0.5791, + "step": 4884 + }, + { + "epoch": 1.5684700593995826, + "grad_norm": 1.0212979316711426, + "learning_rate": 0.00012133714265338161, + "loss": 0.7161, + "step": 4885 + }, + { + "epoch": 1.5687911382244342, + "grad_norm": 0.8693848848342896, + "learning_rate": 0.00012130303862749767, + "loss": 0.6095, + "step": 4886 + }, + { + "epoch": 1.5691122170492857, + "grad_norm": 1.1114261150360107, + "learning_rate": 0.00012126893200590308, + "loss": 0.6328, + "step": 4887 + }, + { + "epoch": 1.569433295874137, + "grad_norm": 0.9835912585258484, + "learning_rate": 0.00012123482279275365, + "loss": 0.6541, + "step": 4888 + }, + { + "epoch": 1.5697543746989886, + "grad_norm": 0.8264690041542053, + "learning_rate": 0.00012120071099220549, + "loss": 0.4623, + "step": 4889 + }, + { + "epoch": 1.5700754535238401, + "grad_norm": 0.8061067461967468, + "learning_rate": 0.00012116659660841499, + "loss": 0.524, + "step": 4890 + }, + { + "epoch": 1.5703965323486915, + "grad_norm": 0.846877932548523, + "learning_rate": 0.00012113247964553888, + "loss": 0.6132, + "step": 4891 + }, + { + "epoch": 1.570717611173543, + "grad_norm": 1.043820858001709, + "learning_rate": 0.00012109836010773424, + "loss": 0.6175, + "step": 4892 + }, + { + "epoch": 1.5710386899983946, + "grad_norm": 1.267200231552124, + "learning_rate": 0.0001210642379991584, + "loss": 0.6943, + "step": 4893 + }, + { + "epoch": 1.5713597688232461, + "grad_norm": 1.0244359970092773, + "learning_rate": 0.00012103011332396908, + "loss": 0.6472, + "step": 4894 + }, + { + "epoch": 1.5716808476480977, + "grad_norm": 1.727343201637268, + "learning_rate": 0.00012099598608632428, + "loss": 0.5938, + "step": 4895 + }, + { + "epoch": 1.5720019264729492, + "grad_norm": 0.8495163917541504, + "learning_rate": 0.0001209618562903822, + "loss": 0.5524, + "step": 4896 + }, + { + "epoch": 1.5723230052978006, + "grad_norm": 1.0026789903640747, + "learning_rate": 0.00012092772394030152, + "loss": 0.5923, + "step": 4897 + }, + { + "epoch": 1.572644084122652, + "grad_norm": 1.362273097038269, + "learning_rate": 0.00012089358904024117, + "loss": 0.7854, + "step": 4898 + }, + { + "epoch": 1.5729651629475037, + "grad_norm": 0.79168701171875, + "learning_rate": 0.00012085945159436038, + "loss": 0.5297, + "step": 4899 + }, + { + "epoch": 1.573286241772355, + "grad_norm": 1.0517579317092896, + "learning_rate": 0.00012082531160681869, + "loss": 0.5276, + "step": 4900 + }, + { + "epoch": 1.5736073205972065, + "grad_norm": 1.1384689807891846, + "learning_rate": 0.00012079116908177593, + "loss": 0.4365, + "step": 4901 + }, + { + "epoch": 1.573928399422058, + "grad_norm": 0.916374683380127, + "learning_rate": 0.00012075702402339231, + "loss": 0.7095, + "step": 4902 + }, + { + "epoch": 1.5742494782469096, + "grad_norm": 0.8095363974571228, + "learning_rate": 0.00012072287643582825, + "loss": 0.5287, + "step": 4903 + }, + { + "epoch": 1.5745705570717612, + "grad_norm": 0.7615143060684204, + "learning_rate": 0.0001206887263232446, + "loss": 0.5034, + "step": 4904 + }, + { + "epoch": 1.5748916358966127, + "grad_norm": 0.9699037671089172, + "learning_rate": 0.00012065457368980236, + "loss": 0.4912, + "step": 4905 + }, + { + "epoch": 1.575212714721464, + "grad_norm": 1.3205819129943848, + "learning_rate": 0.00012062041853966298, + "loss": 0.7739, + "step": 4906 + }, + { + "epoch": 1.5755337935463156, + "grad_norm": 1.0966612100601196, + "learning_rate": 0.00012058626087698814, + "loss": 0.5963, + "step": 4907 + }, + { + "epoch": 1.5758548723711672, + "grad_norm": 1.8393914699554443, + "learning_rate": 0.00012055210070593988, + "loss": 0.5436, + "step": 4908 + }, + { + "epoch": 1.5761759511960185, + "grad_norm": 1.010202407836914, + "learning_rate": 0.00012051793803068046, + "loss": 0.5538, + "step": 4909 + }, + { + "epoch": 1.57649703002087, + "grad_norm": 0.9576059579849243, + "learning_rate": 0.00012048377285537256, + "loss": 0.4986, + "step": 4910 + }, + { + "epoch": 1.5768181088457216, + "grad_norm": 1.2450673580169678, + "learning_rate": 0.00012044960518417903, + "loss": 0.6183, + "step": 4911 + }, + { + "epoch": 1.5771391876705732, + "grad_norm": 0.9183635115623474, + "learning_rate": 0.00012041543502126318, + "loss": 0.5388, + "step": 4912 + }, + { + "epoch": 1.5774602664954247, + "grad_norm": 0.63742595911026, + "learning_rate": 0.0001203812623707885, + "loss": 0.3583, + "step": 4913 + }, + { + "epoch": 1.5777813453202763, + "grad_norm": 0.7811853885650635, + "learning_rate": 0.00012034708723691881, + "loss": 0.4776, + "step": 4914 + }, + { + "epoch": 1.5781024241451276, + "grad_norm": 0.6186936497688293, + "learning_rate": 0.00012031290962381823, + "loss": 0.5344, + "step": 4915 + }, + { + "epoch": 1.5784235029699791, + "grad_norm": 0.5814709663391113, + "learning_rate": 0.00012027872953565125, + "loss": 0.766, + "step": 4916 + }, + { + "epoch": 1.5787445817948307, + "grad_norm": 0.767975926399231, + "learning_rate": 0.00012024454697658261, + "loss": 0.4243, + "step": 4917 + }, + { + "epoch": 1.579065660619682, + "grad_norm": 0.8333182334899902, + "learning_rate": 0.00012021036195077731, + "loss": 0.4496, + "step": 4918 + }, + { + "epoch": 1.5793867394445336, + "grad_norm": 0.9080848693847656, + "learning_rate": 0.0001201761744624007, + "loss": 0.2366, + "step": 4919 + }, + { + "epoch": 1.5797078182693851, + "grad_norm": 0.7227024435997009, + "learning_rate": 0.00012014198451561842, + "loss": 0.2954, + "step": 4920 + }, + { + "epoch": 1.5800288970942367, + "grad_norm": 0.7397527694702148, + "learning_rate": 0.00012010779211459648, + "loss": 0.2761, + "step": 4921 + }, + { + "epoch": 1.5803499759190882, + "grad_norm": 0.753508448600769, + "learning_rate": 0.00012007359726350105, + "loss": 0.4536, + "step": 4922 + }, + { + "epoch": 1.5806710547439398, + "grad_norm": 1.0282949209213257, + "learning_rate": 0.00012003939996649865, + "loss": 0.7643, + "step": 4923 + }, + { + "epoch": 1.580992133568791, + "grad_norm": 0.9250288605690002, + "learning_rate": 0.00012000520022775617, + "loss": 0.7326, + "step": 4924 + }, + { + "epoch": 1.5813132123936426, + "grad_norm": 0.898415207862854, + "learning_rate": 0.00011997099805144069, + "loss": 0.6032, + "step": 4925 + }, + { + "epoch": 1.5816342912184942, + "grad_norm": 0.9478784799575806, + "learning_rate": 0.00011993679344171973, + "loss": 0.6384, + "step": 4926 + }, + { + "epoch": 1.5819553700433455, + "grad_norm": 1.0748721361160278, + "learning_rate": 0.00011990258640276094, + "loss": 0.3729, + "step": 4927 + }, + { + "epoch": 1.582276448868197, + "grad_norm": 0.7065726518630981, + "learning_rate": 0.00011986837693873237, + "loss": 0.5082, + "step": 4928 + }, + { + "epoch": 1.5825975276930486, + "grad_norm": 0.789629340171814, + "learning_rate": 0.00011983416505380234, + "loss": 0.5867, + "step": 4929 + }, + { + "epoch": 1.5829186065179002, + "grad_norm": 0.8611066341400146, + "learning_rate": 0.00011979995075213946, + "loss": 0.6781, + "step": 4930 + }, + { + "epoch": 1.5832396853427517, + "grad_norm": 0.8396785259246826, + "learning_rate": 0.00011976573403791262, + "loss": 0.6383, + "step": 4931 + }, + { + "epoch": 1.5835607641676033, + "grad_norm": 0.779766321182251, + "learning_rate": 0.00011973151491529107, + "loss": 0.6317, + "step": 4932 + }, + { + "epoch": 1.5838818429924546, + "grad_norm": 0.8786318898200989, + "learning_rate": 0.00011969729338844429, + "loss": 0.683, + "step": 4933 + }, + { + "epoch": 1.5842029218173062, + "grad_norm": 1.0004550218582153, + "learning_rate": 0.000119663069461542, + "loss": 0.6376, + "step": 4934 + }, + { + "epoch": 1.5845240006421577, + "grad_norm": 0.8501437306404114, + "learning_rate": 0.0001196288431387544, + "loss": 0.6203, + "step": 4935 + }, + { + "epoch": 1.584845079467009, + "grad_norm": 1.1006617546081543, + "learning_rate": 0.00011959461442425177, + "loss": 0.8655, + "step": 4936 + }, + { + "epoch": 1.5851661582918606, + "grad_norm": 1.571334719657898, + "learning_rate": 0.00011956038332220483, + "loss": 0.5745, + "step": 4937 + }, + { + "epoch": 1.5854872371167121, + "grad_norm": 0.8604090213775635, + "learning_rate": 0.00011952614983678452, + "loss": 0.5597, + "step": 4938 + }, + { + "epoch": 1.5858083159415637, + "grad_norm": 0.9236046671867371, + "learning_rate": 0.00011949191397216206, + "loss": 0.5838, + "step": 4939 + }, + { + "epoch": 1.5861293947664152, + "grad_norm": 0.8020102381706238, + "learning_rate": 0.00011945767573250903, + "loss": 0.5327, + "step": 4940 + }, + { + "epoch": 1.5864504735912668, + "grad_norm": 1.1067618131637573, + "learning_rate": 0.0001194234351219972, + "loss": 0.5908, + "step": 4941 + }, + { + "epoch": 1.5867715524161181, + "grad_norm": 1.1690189838409424, + "learning_rate": 0.00011938919214479876, + "loss": 0.6194, + "step": 4942 + }, + { + "epoch": 1.5870926312409697, + "grad_norm": 0.883562445640564, + "learning_rate": 0.00011935494680508606, + "loss": 0.487, + "step": 4943 + }, + { + "epoch": 1.587413710065821, + "grad_norm": 0.8721025586128235, + "learning_rate": 0.00011932069910703176, + "loss": 0.4942, + "step": 4944 + }, + { + "epoch": 1.5877347888906725, + "grad_norm": 1.2519181966781616, + "learning_rate": 0.0001192864490548089, + "loss": 0.6544, + "step": 4945 + }, + { + "epoch": 1.588055867715524, + "grad_norm": 0.8696319460868835, + "learning_rate": 0.00011925219665259075, + "loss": 0.5955, + "step": 4946 + }, + { + "epoch": 1.5883769465403756, + "grad_norm": 0.7267146706581116, + "learning_rate": 0.00011921794190455082, + "loss": 0.5205, + "step": 4947 + }, + { + "epoch": 1.5886980253652272, + "grad_norm": 0.9043694138526917, + "learning_rate": 0.00011918368481486297, + "loss": 0.6211, + "step": 4948 + }, + { + "epoch": 1.5890191041900787, + "grad_norm": 0.7768150568008423, + "learning_rate": 0.00011914942538770131, + "loss": 0.5542, + "step": 4949 + }, + { + "epoch": 1.5893401830149303, + "grad_norm": 1.070797324180603, + "learning_rate": 0.00011911516362724024, + "loss": 0.5892, + "step": 4950 + }, + { + "epoch": 1.5896612618397816, + "grad_norm": 0.8269063234329224, + "learning_rate": 0.00011908089953765449, + "loss": 0.5293, + "step": 4951 + }, + { + "epoch": 1.5899823406646332, + "grad_norm": 0.8718098402023315, + "learning_rate": 0.00011904663312311901, + "loss": 0.5791, + "step": 4952 + }, + { + "epoch": 1.5903034194894845, + "grad_norm": 0.6878900527954102, + "learning_rate": 0.00011901236438780902, + "loss": 0.3673, + "step": 4953 + }, + { + "epoch": 1.590624498314336, + "grad_norm": 0.8858333230018616, + "learning_rate": 0.00011897809333590014, + "loss": 0.5325, + "step": 4954 + }, + { + "epoch": 1.5909455771391876, + "grad_norm": 0.894816517829895, + "learning_rate": 0.00011894381997156813, + "loss": 0.4611, + "step": 4955 + }, + { + "epoch": 1.5912666559640392, + "grad_norm": 1.1014503240585327, + "learning_rate": 0.00011890954429898912, + "loss": 0.6538, + "step": 4956 + }, + { + "epoch": 1.5915877347888907, + "grad_norm": 0.9481959342956543, + "learning_rate": 0.00011887526632233954, + "loss": 0.5716, + "step": 4957 + }, + { + "epoch": 1.5919088136137423, + "grad_norm": 1.010277271270752, + "learning_rate": 0.00011884098604579597, + "loss": 0.7025, + "step": 4958 + }, + { + "epoch": 1.5922298924385938, + "grad_norm": 0.8406404256820679, + "learning_rate": 0.00011880670347353539, + "loss": 0.4381, + "step": 4959 + }, + { + "epoch": 1.5925509712634451, + "grad_norm": 0.948979914188385, + "learning_rate": 0.00011877241860973507, + "loss": 0.5589, + "step": 4960 + }, + { + "epoch": 1.5928720500882967, + "grad_norm": 0.9533628821372986, + "learning_rate": 0.00011873813145857249, + "loss": 0.5339, + "step": 4961 + }, + { + "epoch": 1.593193128913148, + "grad_norm": 1.116711974143982, + "learning_rate": 0.0001187038420242254, + "loss": 0.4653, + "step": 4962 + }, + { + "epoch": 1.5935142077379996, + "grad_norm": 0.6900465488433838, + "learning_rate": 0.0001186695503108719, + "loss": 0.4061, + "step": 4963 + }, + { + "epoch": 1.5938352865628511, + "grad_norm": 0.548774003982544, + "learning_rate": 0.00011863525632269032, + "loss": 0.3729, + "step": 4964 + }, + { + "epoch": 1.5941563653877027, + "grad_norm": 0.519191324710846, + "learning_rate": 0.0001186009600638593, + "loss": 0.5593, + "step": 4965 + }, + { + "epoch": 1.5944774442125542, + "grad_norm": 0.5707334280014038, + "learning_rate": 0.00011856666153855776, + "loss": 0.4427, + "step": 4966 + }, + { + "epoch": 1.5947985230374058, + "grad_norm": 0.5691962242126465, + "learning_rate": 0.00011853236075096474, + "loss": 0.264, + "step": 4967 + }, + { + "epoch": 1.5951196018622573, + "grad_norm": 0.8893996477127075, + "learning_rate": 0.00011849805770525983, + "loss": 0.4208, + "step": 4968 + }, + { + "epoch": 1.5954406806871086, + "grad_norm": 1.1631325483322144, + "learning_rate": 0.0001184637524056227, + "loss": 0.2257, + "step": 4969 + }, + { + "epoch": 1.5957617595119602, + "grad_norm": 0.7988505363464355, + "learning_rate": 0.00011842944485623335, + "loss": 0.2805, + "step": 4970 + }, + { + "epoch": 1.5960828383368115, + "grad_norm": 0.6999688148498535, + "learning_rate": 0.00011839513506127203, + "loss": 0.3814, + "step": 4971 + }, + { + "epoch": 1.596403917161663, + "grad_norm": 0.7029821276664734, + "learning_rate": 0.0001183608230249193, + "loss": 0.5627, + "step": 4972 + }, + { + "epoch": 1.5967249959865146, + "grad_norm": 0.8678010702133179, + "learning_rate": 0.00011832650875135598, + "loss": 0.7244, + "step": 4973 + }, + { + "epoch": 1.5970460748113662, + "grad_norm": 0.8419655561447144, + "learning_rate": 0.00011829219224476318, + "loss": 0.6425, + "step": 4974 + }, + { + "epoch": 1.5973671536362177, + "grad_norm": 0.9832744598388672, + "learning_rate": 0.00011825787350932222, + "loss": 0.7158, + "step": 4975 + }, + { + "epoch": 1.5976882324610693, + "grad_norm": 0.9112509489059448, + "learning_rate": 0.00011822355254921478, + "loss": 0.5684, + "step": 4976 + }, + { + "epoch": 1.5980093112859208, + "grad_norm": 1.4586632251739502, + "learning_rate": 0.00011818922936862269, + "loss": 0.4982, + "step": 4977 + }, + { + "epoch": 1.5983303901107722, + "grad_norm": 0.8100280165672302, + "learning_rate": 0.00011815490397172821, + "loss": 0.6925, + "step": 4978 + }, + { + "epoch": 1.5986514689356237, + "grad_norm": 0.8247403502464294, + "learning_rate": 0.00011812057636271374, + "loss": 0.6636, + "step": 4979 + }, + { + "epoch": 1.598972547760475, + "grad_norm": 0.8497698903083801, + "learning_rate": 0.00011808624654576202, + "loss": 0.5672, + "step": 4980 + }, + { + "epoch": 1.5992936265853266, + "grad_norm": 0.9448472261428833, + "learning_rate": 0.00011805191452505602, + "loss": 0.6503, + "step": 4981 + }, + { + "epoch": 1.5996147054101781, + "grad_norm": 1.0496573448181152, + "learning_rate": 0.00011801758030477897, + "loss": 0.8055, + "step": 4982 + }, + { + "epoch": 1.5999357842350297, + "grad_norm": 0.7179785966873169, + "learning_rate": 0.00011798324388911444, + "loss": 0.4881, + "step": 4983 + }, + { + "epoch": 1.6002568630598812, + "grad_norm": 0.8595174551010132, + "learning_rate": 0.00011794890528224618, + "loss": 0.615, + "step": 4984 + }, + { + "epoch": 1.6005779418847328, + "grad_norm": 0.8961060047149658, + "learning_rate": 0.00011791456448835825, + "loss": 0.6427, + "step": 4985 + }, + { + "epoch": 1.6008990207095843, + "grad_norm": 0.963125467300415, + "learning_rate": 0.00011788022151163495, + "loss": 0.7784, + "step": 4986 + }, + { + "epoch": 1.6012200995344357, + "grad_norm": 0.7629128694534302, + "learning_rate": 0.00011784587635626094, + "loss": 0.509, + "step": 4987 + }, + { + "epoch": 1.6015411783592872, + "grad_norm": 1.0571444034576416, + "learning_rate": 0.000117811529026421, + "loss": 0.7576, + "step": 4988 + }, + { + "epoch": 1.6018622571841385, + "grad_norm": 1.0005722045898438, + "learning_rate": 0.00011777717952630031, + "loss": 0.6151, + "step": 4989 + }, + { + "epoch": 1.60218333600899, + "grad_norm": 0.7066619396209717, + "learning_rate": 0.00011774282786008422, + "loss": 0.5324, + "step": 4990 + }, + { + "epoch": 1.6025044148338417, + "grad_norm": 0.6396430730819702, + "learning_rate": 0.00011770847403195834, + "loss": 0.5243, + "step": 4991 + }, + { + "epoch": 1.6028254936586932, + "grad_norm": 1.4114651679992676, + "learning_rate": 0.00011767411804610864, + "loss": 0.4948, + "step": 4992 + }, + { + "epoch": 1.6031465724835448, + "grad_norm": 0.9310147166252136, + "learning_rate": 0.00011763975990672125, + "loss": 0.7609, + "step": 4993 + }, + { + "epoch": 1.6034676513083963, + "grad_norm": 0.8665596842765808, + "learning_rate": 0.00011760539961798262, + "loss": 0.6365, + "step": 4994 + }, + { + "epoch": 1.6037887301332479, + "grad_norm": 0.831099808216095, + "learning_rate": 0.00011757103718407947, + "loss": 0.5575, + "step": 4995 + }, + { + "epoch": 1.6041098089580992, + "grad_norm": 0.9257915616035461, + "learning_rate": 0.00011753667260919872, + "loss": 0.5272, + "step": 4996 + }, + { + "epoch": 1.6044308877829507, + "grad_norm": 1.372697353363037, + "learning_rate": 0.00011750230589752762, + "loss": 0.5669, + "step": 4997 + }, + { + "epoch": 1.604751966607802, + "grad_norm": 0.978588342666626, + "learning_rate": 0.00011746793705325363, + "loss": 0.532, + "step": 4998 + }, + { + "epoch": 1.6050730454326536, + "grad_norm": 1.0235109329223633, + "learning_rate": 0.00011743356608056449, + "loss": 0.7292, + "step": 4999 + }, + { + "epoch": 1.6053941242575052, + "grad_norm": 1.004314661026001, + "learning_rate": 0.0001173991929836482, + "loss": 0.5686, + "step": 5000 + }, + { + "epoch": 1.6057152030823567, + "grad_norm": 1.8537969589233398, + "learning_rate": 0.00011736481776669306, + "loss": 0.4302, + "step": 5001 + }, + { + "epoch": 1.6060362819072083, + "grad_norm": 0.9163577556610107, + "learning_rate": 0.00011733044043388752, + "loss": 0.5876, + "step": 5002 + }, + { + "epoch": 1.6063573607320598, + "grad_norm": 1.2079983949661255, + "learning_rate": 0.00011729606098942039, + "loss": 0.6811, + "step": 5003 + }, + { + "epoch": 1.6066784395569114, + "grad_norm": 1.0708050727844238, + "learning_rate": 0.00011726167943748067, + "loss": 0.5863, + "step": 5004 + }, + { + "epoch": 1.6069995183817627, + "grad_norm": 0.7910842895507812, + "learning_rate": 0.00011722729578225769, + "loss": 0.4729, + "step": 5005 + }, + { + "epoch": 1.6073205972066142, + "grad_norm": 0.8176678419113159, + "learning_rate": 0.00011719291002794096, + "loss": 0.5986, + "step": 5006 + }, + { + "epoch": 1.6076416760314656, + "grad_norm": 1.354212760925293, + "learning_rate": 0.0001171585221787203, + "loss": 0.5428, + "step": 5007 + }, + { + "epoch": 1.6079627548563171, + "grad_norm": 1.120445966720581, + "learning_rate": 0.00011712413223878578, + "loss": 0.5019, + "step": 5008 + }, + { + "epoch": 1.6082838336811687, + "grad_norm": 0.7986057996749878, + "learning_rate": 0.00011708974021232769, + "loss": 0.4202, + "step": 5009 + }, + { + "epoch": 1.6086049125060202, + "grad_norm": 0.9796577095985413, + "learning_rate": 0.00011705534610353657, + "loss": 0.5712, + "step": 5010 + }, + { + "epoch": 1.6089259913308718, + "grad_norm": 1.225272297859192, + "learning_rate": 0.00011702094991660326, + "loss": 0.615, + "step": 5011 + }, + { + "epoch": 1.6092470701557233, + "grad_norm": 0.8488008975982666, + "learning_rate": 0.00011698655165571886, + "loss": 0.4476, + "step": 5012 + }, + { + "epoch": 1.6095681489805749, + "grad_norm": 0.8429365158081055, + "learning_rate": 0.00011695215132507464, + "loss": 0.5331, + "step": 5013 + }, + { + "epoch": 1.6098892278054262, + "grad_norm": 0.8646475672721863, + "learning_rate": 0.00011691774892886222, + "loss": 0.3656, + "step": 5014 + }, + { + "epoch": 1.6102103066302778, + "grad_norm": 0.603814423084259, + "learning_rate": 0.00011688334447127338, + "loss": 0.3372, + "step": 5015 + }, + { + "epoch": 1.610531385455129, + "grad_norm": 0.5882185697555542, + "learning_rate": 0.00011684893795650027, + "loss": 0.9782, + "step": 5016 + }, + { + "epoch": 1.6108524642799806, + "grad_norm": 0.5701581239700317, + "learning_rate": 0.00011681452938873516, + "loss": 0.3712, + "step": 5017 + }, + { + "epoch": 1.6111735431048322, + "grad_norm": 0.9942889213562012, + "learning_rate": 0.00011678011877217065, + "loss": 0.4755, + "step": 5018 + }, + { + "epoch": 1.6114946219296837, + "grad_norm": 0.8010011911392212, + "learning_rate": 0.00011674570611099955, + "loss": 0.3993, + "step": 5019 + }, + { + "epoch": 1.6118157007545353, + "grad_norm": 0.7642001509666443, + "learning_rate": 0.00011671129140941499, + "loss": 0.4508, + "step": 5020 + }, + { + "epoch": 1.6121367795793868, + "grad_norm": 0.7442882061004639, + "learning_rate": 0.00011667687467161024, + "loss": 0.5295, + "step": 5021 + }, + { + "epoch": 1.6124578584042384, + "grad_norm": 0.8074431419372559, + "learning_rate": 0.00011664245590177892, + "loss": 0.6415, + "step": 5022 + }, + { + "epoch": 1.6127789372290897, + "grad_norm": 0.8954799175262451, + "learning_rate": 0.0001166080351041148, + "loss": 0.6567, + "step": 5023 + }, + { + "epoch": 1.6131000160539413, + "grad_norm": 0.8098952174186707, + "learning_rate": 0.00011657361228281199, + "loss": 0.6164, + "step": 5024 + }, + { + "epoch": 1.6134210948787926, + "grad_norm": 1.059260368347168, + "learning_rate": 0.00011653918744206478, + "loss": 0.4956, + "step": 5025 + }, + { + "epoch": 1.6137421737036441, + "grad_norm": 0.6929076910018921, + "learning_rate": 0.00011650476058606777, + "loss": 0.5855, + "step": 5026 + }, + { + "epoch": 1.6140632525284957, + "grad_norm": 0.9725719094276428, + "learning_rate": 0.00011647033171901573, + "loss": 0.6258, + "step": 5027 + }, + { + "epoch": 1.6143843313533472, + "grad_norm": 0.8998417258262634, + "learning_rate": 0.00011643590084510379, + "loss": 0.6263, + "step": 5028 + }, + { + "epoch": 1.6147054101781988, + "grad_norm": 1.7190628051757812, + "learning_rate": 0.00011640146796852711, + "loss": 0.6147, + "step": 5029 + }, + { + "epoch": 1.6150264890030503, + "grad_norm": 0.8948490023612976, + "learning_rate": 0.00011636703309348133, + "loss": 0.6867, + "step": 5030 + }, + { + "epoch": 1.615347567827902, + "grad_norm": 0.9756568670272827, + "learning_rate": 0.00011633259622416224, + "loss": 0.7751, + "step": 5031 + }, + { + "epoch": 1.6156686466527532, + "grad_norm": 0.9897345304489136, + "learning_rate": 0.00011629815736476581, + "loss": 0.6944, + "step": 5032 + }, + { + "epoch": 1.6159897254776048, + "grad_norm": 1.4569859504699707, + "learning_rate": 0.00011626371651948838, + "loss": 0.5682, + "step": 5033 + }, + { + "epoch": 1.616310804302456, + "grad_norm": 1.191692590713501, + "learning_rate": 0.00011622927369252638, + "loss": 0.7242, + "step": 5034 + }, + { + "epoch": 1.6166318831273077, + "grad_norm": 0.7807984948158264, + "learning_rate": 0.00011619482888807662, + "loss": 0.5721, + "step": 5035 + }, + { + "epoch": 1.6169529619521592, + "grad_norm": 0.7991989254951477, + "learning_rate": 0.00011616038211033613, + "loss": 0.5288, + "step": 5036 + }, + { + "epoch": 1.6172740407770108, + "grad_norm": 0.8879060745239258, + "learning_rate": 0.00011612593336350208, + "loss": 0.4782, + "step": 5037 + }, + { + "epoch": 1.6175951196018623, + "grad_norm": 0.8001835942268372, + "learning_rate": 0.00011609148265177193, + "loss": 0.504, + "step": 5038 + }, + { + "epoch": 1.6179161984267139, + "grad_norm": 0.9068782925605774, + "learning_rate": 0.00011605702997934345, + "loss": 0.5692, + "step": 5039 + }, + { + "epoch": 1.6182372772515654, + "grad_norm": 0.9409818649291992, + "learning_rate": 0.00011602257535041459, + "loss": 0.5669, + "step": 5040 + }, + { + "epoch": 1.6185583560764167, + "grad_norm": 0.9030987620353699, + "learning_rate": 0.0001159881187691835, + "loss": 0.4359, + "step": 5041 + }, + { + "epoch": 1.6188794349012683, + "grad_norm": 1.0598105192184448, + "learning_rate": 0.00011595366023984864, + "loss": 0.693, + "step": 5042 + }, + { + "epoch": 1.6192005137261196, + "grad_norm": 1.0515077114105225, + "learning_rate": 0.00011591919976660868, + "loss": 0.6296, + "step": 5043 + }, + { + "epoch": 1.6195215925509712, + "grad_norm": 0.8490827083587646, + "learning_rate": 0.00011588473735366249, + "loss": 0.5571, + "step": 5044 + }, + { + "epoch": 1.6198426713758227, + "grad_norm": 0.8692120909690857, + "learning_rate": 0.0001158502730052093, + "loss": 0.5099, + "step": 5045 + }, + { + "epoch": 1.6201637502006743, + "grad_norm": 1.1289715766906738, + "learning_rate": 0.00011581580672544838, + "loss": 0.7012, + "step": 5046 + }, + { + "epoch": 1.6204848290255258, + "grad_norm": 1.281795859336853, + "learning_rate": 0.0001157813385185794, + "loss": 0.7737, + "step": 5047 + }, + { + "epoch": 1.6208059078503774, + "grad_norm": 1.1194076538085938, + "learning_rate": 0.00011574686838880215, + "loss": 0.633, + "step": 5048 + }, + { + "epoch": 1.621126986675229, + "grad_norm": 0.8962092399597168, + "learning_rate": 0.00011571239634031679, + "loss": 0.6357, + "step": 5049 + }, + { + "epoch": 1.6214480655000802, + "grad_norm": 0.9650201797485352, + "learning_rate": 0.00011567792237732358, + "loss": 0.5478, + "step": 5050 + }, + { + "epoch": 1.6217691443249318, + "grad_norm": 0.8813601732254028, + "learning_rate": 0.0001156434465040231, + "loss": 0.4708, + "step": 5051 + }, + { + "epoch": 1.6220902231497831, + "grad_norm": 0.8140305876731873, + "learning_rate": 0.0001156089687246161, + "loss": 0.485, + "step": 5052 + }, + { + "epoch": 1.6224113019746347, + "grad_norm": 1.1410284042358398, + "learning_rate": 0.00011557448904330362, + "loss": 0.6938, + "step": 5053 + }, + { + "epoch": 1.6227323807994862, + "grad_norm": 1.0146127939224243, + "learning_rate": 0.0001155400074642869, + "loss": 0.6013, + "step": 5054 + }, + { + "epoch": 1.6230534596243378, + "grad_norm": 1.0160109996795654, + "learning_rate": 0.00011550552399176739, + "loss": 0.4768, + "step": 5055 + }, + { + "epoch": 1.6233745384491893, + "grad_norm": 1.1787000894546509, + "learning_rate": 0.00011547103862994684, + "loss": 0.5916, + "step": 5056 + }, + { + "epoch": 1.6236956172740409, + "grad_norm": 0.8274866342544556, + "learning_rate": 0.00011543655138302714, + "loss": 0.5369, + "step": 5057 + }, + { + "epoch": 1.6240166960988924, + "grad_norm": 0.7210972905158997, + "learning_rate": 0.00011540206225521046, + "loss": 0.3732, + "step": 5058 + }, + { + "epoch": 1.6243377749237438, + "grad_norm": 1.1187121868133545, + "learning_rate": 0.00011536757125069923, + "loss": 0.658, + "step": 5059 + }, + { + "epoch": 1.6246588537485953, + "grad_norm": 0.6988892555236816, + "learning_rate": 0.00011533307837369607, + "loss": 0.3963, + "step": 5060 + }, + { + "epoch": 1.6249799325734466, + "grad_norm": 0.8323413133621216, + "learning_rate": 0.00011529858362840382, + "loss": 0.3751, + "step": 5061 + }, + { + "epoch": 1.6253010113982982, + "grad_norm": 1.572111964225769, + "learning_rate": 0.00011526408701902556, + "loss": 0.5879, + "step": 5062 + }, + { + "epoch": 1.6256220902231497, + "grad_norm": 0.6193574666976929, + "learning_rate": 0.00011522958854976458, + "loss": 0.3986, + "step": 5063 + }, + { + "epoch": 1.6259431690480013, + "grad_norm": 0.6282438635826111, + "learning_rate": 0.00011519508822482446, + "loss": 0.389, + "step": 5064 + }, + { + "epoch": 1.6262642478728528, + "grad_norm": 0.6472731828689575, + "learning_rate": 0.00011516058604840891, + "loss": 0.7557, + "step": 5065 + }, + { + "epoch": 1.6265853266977044, + "grad_norm": 0.5297544598579407, + "learning_rate": 0.00011512608202472194, + "loss": 0.7176, + "step": 5066 + }, + { + "epoch": 1.626906405522556, + "grad_norm": 0.6865007281303406, + "learning_rate": 0.00011509157615796776, + "loss": 0.3564, + "step": 5067 + }, + { + "epoch": 1.6272274843474073, + "grad_norm": 0.6907351613044739, + "learning_rate": 0.00011505706845235078, + "loss": 0.2377, + "step": 5068 + }, + { + "epoch": 1.6275485631722588, + "grad_norm": 0.8242547512054443, + "learning_rate": 0.00011502255891207572, + "loss": 0.2439, + "step": 5069 + }, + { + "epoch": 1.6278696419971102, + "grad_norm": 0.5880115032196045, + "learning_rate": 0.0001149880475413474, + "loss": 0.2065, + "step": 5070 + }, + { + "epoch": 1.6281907208219617, + "grad_norm": 0.6865981817245483, + "learning_rate": 0.00011495353434437098, + "loss": 0.1763, + "step": 5071 + }, + { + "epoch": 1.6285117996468133, + "grad_norm": 0.7891075015068054, + "learning_rate": 0.00011491901932535172, + "loss": 0.5724, + "step": 5072 + }, + { + "epoch": 1.6288328784716648, + "grad_norm": 1.0175186395645142, + "learning_rate": 0.00011488450248849522, + "loss": 0.884, + "step": 5073 + }, + { + "epoch": 1.6291539572965164, + "grad_norm": 0.808527946472168, + "learning_rate": 0.00011484998383800726, + "loss": 0.5626, + "step": 5074 + }, + { + "epoch": 1.629475036121368, + "grad_norm": 0.8275448679924011, + "learning_rate": 0.00011481546337809381, + "loss": 0.5928, + "step": 5075 + }, + { + "epoch": 1.6297961149462195, + "grad_norm": 0.7020288109779358, + "learning_rate": 0.00011478094111296109, + "loss": 0.4619, + "step": 5076 + }, + { + "epoch": 1.6301171937710708, + "grad_norm": 0.7639602422714233, + "learning_rate": 0.0001147464170468155, + "loss": 0.5046, + "step": 5077 + }, + { + "epoch": 1.6304382725959223, + "grad_norm": 0.7162854671478271, + "learning_rate": 0.00011471189118386375, + "loss": 0.5293, + "step": 5078 + }, + { + "epoch": 1.6307593514207737, + "grad_norm": 0.826367199420929, + "learning_rate": 0.00011467736352831266, + "loss": 0.4409, + "step": 5079 + }, + { + "epoch": 1.6310804302456252, + "grad_norm": 0.9901473522186279, + "learning_rate": 0.00011464283408436938, + "loss": 0.7239, + "step": 5080 + }, + { + "epoch": 1.6314015090704768, + "grad_norm": 1.07185697555542, + "learning_rate": 0.00011460830285624118, + "loss": 0.6114, + "step": 5081 + }, + { + "epoch": 1.6317225878953283, + "grad_norm": 1.0970301628112793, + "learning_rate": 0.00011457376984813557, + "loss": 0.6966, + "step": 5082 + }, + { + "epoch": 1.6320436667201799, + "grad_norm": 0.672584056854248, + "learning_rate": 0.00011453923506426032, + "loss": 0.4641, + "step": 5083 + }, + { + "epoch": 1.6323647455450314, + "grad_norm": 0.9183497428894043, + "learning_rate": 0.00011450469850882337, + "loss": 0.7209, + "step": 5084 + }, + { + "epoch": 1.632685824369883, + "grad_norm": 1.120105266571045, + "learning_rate": 0.00011447016018603292, + "loss": 0.7857, + "step": 5085 + }, + { + "epoch": 1.6330069031947343, + "grad_norm": 1.3121768236160278, + "learning_rate": 0.00011443562010009731, + "loss": 0.6645, + "step": 5086 + }, + { + "epoch": 1.6333279820195858, + "grad_norm": 0.8911232352256775, + "learning_rate": 0.00011440107825522521, + "loss": 0.5584, + "step": 5087 + }, + { + "epoch": 1.6336490608444372, + "grad_norm": 0.9025933146476746, + "learning_rate": 0.00011436653465562542, + "loss": 0.5403, + "step": 5088 + }, + { + "epoch": 1.6339701396692887, + "grad_norm": 0.9944098591804504, + "learning_rate": 0.00011433198930550695, + "loss": 0.5797, + "step": 5089 + }, + { + "epoch": 1.6342912184941403, + "grad_norm": 0.8134103417396545, + "learning_rate": 0.00011429744220907903, + "loss": 0.6454, + "step": 5090 + }, + { + "epoch": 1.6346122973189918, + "grad_norm": 0.9299569129943848, + "learning_rate": 0.00011426289337055119, + "loss": 0.6705, + "step": 5091 + }, + { + "epoch": 1.6349333761438434, + "grad_norm": 1.1119047403335571, + "learning_rate": 0.00011422834279413301, + "loss": 0.5778, + "step": 5092 + }, + { + "epoch": 1.635254454968695, + "grad_norm": 0.860736072063446, + "learning_rate": 0.00011419379048403444, + "loss": 0.687, + "step": 5093 + }, + { + "epoch": 1.6355755337935465, + "grad_norm": 1.328277349472046, + "learning_rate": 0.00011415923644446557, + "loss": 0.8227, + "step": 5094 + }, + { + "epoch": 1.6358966126183978, + "grad_norm": 1.0155832767486572, + "learning_rate": 0.00011412468067963669, + "loss": 0.61, + "step": 5095 + }, + { + "epoch": 1.6362176914432494, + "grad_norm": 0.8249773383140564, + "learning_rate": 0.00011409012319375827, + "loss": 0.454, + "step": 5096 + }, + { + "epoch": 1.6365387702681007, + "grad_norm": 0.8771111369132996, + "learning_rate": 0.00011405556399104109, + "loss": 0.6191, + "step": 5097 + }, + { + "epoch": 1.6368598490929522, + "grad_norm": 1.0984089374542236, + "learning_rate": 0.00011402100307569612, + "loss": 0.8062, + "step": 5098 + }, + { + "epoch": 1.6371809279178038, + "grad_norm": 0.7686770558357239, + "learning_rate": 0.00011398644045193444, + "loss": 0.4661, + "step": 5099 + }, + { + "epoch": 1.6375020067426553, + "grad_norm": 0.8252708911895752, + "learning_rate": 0.00011395187612396738, + "loss": 0.5061, + "step": 5100 + }, + { + "epoch": 1.6378230855675069, + "grad_norm": 1.2015011310577393, + "learning_rate": 0.00011391731009600654, + "loss": 0.6285, + "step": 5101 + }, + { + "epoch": 1.6381441643923584, + "grad_norm": 1.1227675676345825, + "learning_rate": 0.00011388274237226371, + "loss": 0.8034, + "step": 5102 + }, + { + "epoch": 1.63846524321721, + "grad_norm": 0.8477095365524292, + "learning_rate": 0.00011384817295695083, + "loss": 0.5733, + "step": 5103 + }, + { + "epoch": 1.6387863220420613, + "grad_norm": 0.7261363863945007, + "learning_rate": 0.00011381360185428007, + "loss": 0.3891, + "step": 5104 + }, + { + "epoch": 1.6391074008669129, + "grad_norm": 0.9272900223731995, + "learning_rate": 0.0001137790290684638, + "loss": 0.3385, + "step": 5105 + }, + { + "epoch": 1.6394284796917642, + "grad_norm": 1.001617431640625, + "learning_rate": 0.00011374445460371466, + "loss": 0.5374, + "step": 5106 + }, + { + "epoch": 1.6397495585166157, + "grad_norm": 0.8543923497200012, + "learning_rate": 0.00011370987846424546, + "loss": 0.5697, + "step": 5107 + }, + { + "epoch": 1.6400706373414673, + "grad_norm": 0.9451769590377808, + "learning_rate": 0.0001136753006542691, + "loss": 0.4807, + "step": 5108 + }, + { + "epoch": 1.6403917161663188, + "grad_norm": 0.6929908394813538, + "learning_rate": 0.00011364072117799885, + "loss": 0.4472, + "step": 5109 + }, + { + "epoch": 1.6407127949911704, + "grad_norm": 1.1182005405426025, + "learning_rate": 0.00011360614003964809, + "loss": 0.5891, + "step": 5110 + }, + { + "epoch": 1.641033873816022, + "grad_norm": 1.0447207689285278, + "learning_rate": 0.00011357155724343045, + "loss": 0.5113, + "step": 5111 + }, + { + "epoch": 1.6413549526408735, + "grad_norm": 0.7325683832168579, + "learning_rate": 0.00011353697279355973, + "loss": 0.3998, + "step": 5112 + }, + { + "epoch": 1.6416760314657248, + "grad_norm": 0.6994525790214539, + "learning_rate": 0.00011350238669424993, + "loss": 0.441, + "step": 5113 + }, + { + "epoch": 1.6419971102905764, + "grad_norm": 1.6808966398239136, + "learning_rate": 0.00011346779894971527, + "loss": 0.5446, + "step": 5114 + }, + { + "epoch": 1.6423181891154277, + "grad_norm": 0.5908676981925964, + "learning_rate": 0.00011343320956417014, + "loss": 0.8066, + "step": 5115 + }, + { + "epoch": 1.6426392679402793, + "grad_norm": 0.6339381337165833, + "learning_rate": 0.00011339861854182922, + "loss": 0.7703, + "step": 5116 + }, + { + "epoch": 1.6429603467651308, + "grad_norm": 0.7568504810333252, + "learning_rate": 0.00011336402588690726, + "loss": 0.5826, + "step": 5117 + }, + { + "epoch": 1.6432814255899824, + "grad_norm": 0.6824870705604553, + "learning_rate": 0.00011332943160361926, + "loss": 0.2832, + "step": 5118 + }, + { + "epoch": 1.643602504414834, + "grad_norm": 0.7612968683242798, + "learning_rate": 0.00011329483569618045, + "loss": 0.291, + "step": 5119 + }, + { + "epoch": 1.6439235832396855, + "grad_norm": 0.7312465906143188, + "learning_rate": 0.00011326023816880625, + "loss": 0.2184, + "step": 5120 + }, + { + "epoch": 1.6442446620645368, + "grad_norm": 0.7438070178031921, + "learning_rate": 0.00011322563902571226, + "loss": 0.3823, + "step": 5121 + }, + { + "epoch": 1.6445657408893883, + "grad_norm": 1.085114598274231, + "learning_rate": 0.00011319103827111426, + "loss": 0.6073, + "step": 5122 + }, + { + "epoch": 1.64488681971424, + "grad_norm": 0.8549251556396484, + "learning_rate": 0.00011315643590922827, + "loss": 0.6365, + "step": 5123 + }, + { + "epoch": 1.6452078985390912, + "grad_norm": 0.8248443603515625, + "learning_rate": 0.00011312183194427046, + "loss": 0.5795, + "step": 5124 + }, + { + "epoch": 1.6455289773639428, + "grad_norm": 0.9894840121269226, + "learning_rate": 0.00011308722638045724, + "loss": 0.5455, + "step": 5125 + }, + { + "epoch": 1.6458500561887943, + "grad_norm": 0.7669897079467773, + "learning_rate": 0.00011305261922200519, + "loss": 0.537, + "step": 5126 + }, + { + "epoch": 1.6461711350136459, + "grad_norm": 0.9028288722038269, + "learning_rate": 0.00011301801047313105, + "loss": 0.6561, + "step": 5127 + }, + { + "epoch": 1.6464922138384974, + "grad_norm": 0.8793859481811523, + "learning_rate": 0.00011298340013805184, + "loss": 0.6243, + "step": 5128 + }, + { + "epoch": 1.646813292663349, + "grad_norm": 0.7349303364753723, + "learning_rate": 0.00011294878822098469, + "loss": 0.5293, + "step": 5129 + }, + { + "epoch": 1.6471343714882003, + "grad_norm": 0.8124568462371826, + "learning_rate": 0.000112914174726147, + "loss": 0.5847, + "step": 5130 + }, + { + "epoch": 1.6474554503130519, + "grad_norm": 1.0718876123428345, + "learning_rate": 0.0001128795596577563, + "loss": 0.6405, + "step": 5131 + }, + { + "epoch": 1.6477765291379034, + "grad_norm": 0.7873112559318542, + "learning_rate": 0.0001128449430200303, + "loss": 0.5677, + "step": 5132 + }, + { + "epoch": 1.6480976079627547, + "grad_norm": 0.8473823666572571, + "learning_rate": 0.00011281032481718697, + "loss": 0.6249, + "step": 5133 + }, + { + "epoch": 1.6484186867876063, + "grad_norm": 0.8866380453109741, + "learning_rate": 0.0001127757050534444, + "loss": 0.7474, + "step": 5134 + }, + { + "epoch": 1.6487397656124578, + "grad_norm": 0.9700244069099426, + "learning_rate": 0.00011274108373302095, + "loss": 0.7294, + "step": 5135 + }, + { + "epoch": 1.6490608444373094, + "grad_norm": 0.9701023697853088, + "learning_rate": 0.00011270646086013505, + "loss": 0.8279, + "step": 5136 + }, + { + "epoch": 1.649381923262161, + "grad_norm": 0.9571521282196045, + "learning_rate": 0.00011267183643900548, + "loss": 0.4458, + "step": 5137 + }, + { + "epoch": 1.6497030020870125, + "grad_norm": 0.8921281099319458, + "learning_rate": 0.00011263721047385105, + "loss": 0.588, + "step": 5138 + }, + { + "epoch": 1.6500240809118638, + "grad_norm": 1.0599466562271118, + "learning_rate": 0.00011260258296889086, + "loss": 0.7079, + "step": 5139 + }, + { + "epoch": 1.6503451597367154, + "grad_norm": 0.9576237201690674, + "learning_rate": 0.00011256795392834419, + "loss": 0.6869, + "step": 5140 + }, + { + "epoch": 1.650666238561567, + "grad_norm": 1.035122036933899, + "learning_rate": 0.00011253332335643043, + "loss": 0.6376, + "step": 5141 + }, + { + "epoch": 1.6509873173864182, + "grad_norm": 1.0403311252593994, + "learning_rate": 0.00011249869125736925, + "loss": 0.5694, + "step": 5142 + }, + { + "epoch": 1.6513083962112698, + "grad_norm": 0.9405300617218018, + "learning_rate": 0.00011246405763538046, + "loss": 0.5892, + "step": 5143 + }, + { + "epoch": 1.6516294750361213, + "grad_norm": 1.0725064277648926, + "learning_rate": 0.00011242942249468402, + "loss": 0.7886, + "step": 5144 + }, + { + "epoch": 1.651950553860973, + "grad_norm": 1.1589728593826294, + "learning_rate": 0.00011239478583950018, + "loss": 0.6928, + "step": 5145 + }, + { + "epoch": 1.6522716326858244, + "grad_norm": 0.9433485865592957, + "learning_rate": 0.00011236014767404927, + "loss": 0.673, + "step": 5146 + }, + { + "epoch": 1.652592711510676, + "grad_norm": 0.7959662079811096, + "learning_rate": 0.00011232550800255188, + "loss": 0.555, + "step": 5147 + }, + { + "epoch": 1.6529137903355273, + "grad_norm": 0.8811464905738831, + "learning_rate": 0.00011229086682922869, + "loss": 0.5339, + "step": 5148 + }, + { + "epoch": 1.6532348691603789, + "grad_norm": 1.0386700630187988, + "learning_rate": 0.00011225622415830068, + "loss": 0.5246, + "step": 5149 + }, + { + "epoch": 1.6535559479852304, + "grad_norm": 0.907658040523529, + "learning_rate": 0.00011222157999398895, + "loss": 0.6589, + "step": 5150 + }, + { + "epoch": 1.6538770268100818, + "grad_norm": 0.6662940979003906, + "learning_rate": 0.00011218693434051475, + "loss": 0.5202, + "step": 5151 + }, + { + "epoch": 1.6541981056349333, + "grad_norm": 0.8499528765678406, + "learning_rate": 0.00011215228720209958, + "loss": 0.5316, + "step": 5152 + }, + { + "epoch": 1.6545191844597849, + "grad_norm": 0.8833222389221191, + "learning_rate": 0.00011211763858296507, + "loss": 0.534, + "step": 5153 + }, + { + "epoch": 1.6548402632846364, + "grad_norm": 0.8427216410636902, + "learning_rate": 0.00011208298848733305, + "loss": 0.4688, + "step": 5154 + }, + { + "epoch": 1.655161342109488, + "grad_norm": 0.808330774307251, + "learning_rate": 0.00011204833691942553, + "loss": 0.5204, + "step": 5155 + }, + { + "epoch": 1.6554824209343395, + "grad_norm": 0.6491779685020447, + "learning_rate": 0.00011201368388346471, + "loss": 0.383, + "step": 5156 + }, + { + "epoch": 1.6558034997591908, + "grad_norm": 1.3244563341140747, + "learning_rate": 0.00011197902938367298, + "loss": 0.5214, + "step": 5157 + }, + { + "epoch": 1.6561245785840424, + "grad_norm": 1.1561415195465088, + "learning_rate": 0.0001119443734242728, + "loss": 0.5834, + "step": 5158 + }, + { + "epoch": 1.656445657408894, + "grad_norm": 1.0299034118652344, + "learning_rate": 0.00011190971600948699, + "loss": 0.5443, + "step": 5159 + }, + { + "epoch": 1.6567667362337453, + "grad_norm": 0.8660802841186523, + "learning_rate": 0.0001118750571435384, + "loss": 0.4689, + "step": 5160 + }, + { + "epoch": 1.6570878150585968, + "grad_norm": 0.6859344840049744, + "learning_rate": 0.00011184039683065013, + "loss": 0.4929, + "step": 5161 + }, + { + "epoch": 1.6574088938834484, + "grad_norm": 1.2435836791992188, + "learning_rate": 0.00011180573507504537, + "loss": 0.5144, + "step": 5162 + }, + { + "epoch": 1.6577299727083, + "grad_norm": 0.9810152649879456, + "learning_rate": 0.00011177107188094764, + "loss": 0.4428, + "step": 5163 + }, + { + "epoch": 1.6580510515331515, + "grad_norm": 2.1121833324432373, + "learning_rate": 0.00011173640725258052, + "loss": 0.5309, + "step": 5164 + }, + { + "epoch": 1.658372130358003, + "grad_norm": 0.8293464183807373, + "learning_rate": 0.00011170174119416776, + "loss": 0.7781, + "step": 5165 + }, + { + "epoch": 1.6586932091828543, + "grad_norm": 0.5578963160514832, + "learning_rate": 0.00011166707370993333, + "loss": 0.5859, + "step": 5166 + }, + { + "epoch": 1.659014288007706, + "grad_norm": 0.5009366869926453, + "learning_rate": 0.00011163240480410135, + "loss": 0.6605, + "step": 5167 + }, + { + "epoch": 1.6593353668325574, + "grad_norm": 0.7301324605941772, + "learning_rate": 0.00011159773448089614, + "loss": 0.4758, + "step": 5168 + }, + { + "epoch": 1.6596564456574088, + "grad_norm": 0.6743203401565552, + "learning_rate": 0.00011156306274454218, + "loss": 0.3494, + "step": 5169 + }, + { + "epoch": 1.6599775244822603, + "grad_norm": 0.7202017903327942, + "learning_rate": 0.00011152838959926408, + "loss": 0.3711, + "step": 5170 + }, + { + "epoch": 1.6602986033071119, + "grad_norm": 0.6269795298576355, + "learning_rate": 0.00011149371504928668, + "loss": 0.2464, + "step": 5171 + }, + { + "epoch": 1.6606196821319634, + "grad_norm": 0.5528831481933594, + "learning_rate": 0.00011145903909883495, + "loss": 0.2153, + "step": 5172 + }, + { + "epoch": 1.660940760956815, + "grad_norm": 0.695706307888031, + "learning_rate": 0.00011142436175213409, + "loss": 0.5348, + "step": 5173 + }, + { + "epoch": 1.6612618397816665, + "grad_norm": 0.8045232892036438, + "learning_rate": 0.0001113896830134094, + "loss": 0.6501, + "step": 5174 + }, + { + "epoch": 1.6615829186065179, + "grad_norm": 1.1666730642318726, + "learning_rate": 0.00011135500288688636, + "loss": 0.6264, + "step": 5175 + }, + { + "epoch": 1.6619039974313694, + "grad_norm": 0.9477275013923645, + "learning_rate": 0.0001113203213767907, + "loss": 0.5997, + "step": 5176 + }, + { + "epoch": 1.662225076256221, + "grad_norm": 0.7591928839683533, + "learning_rate": 0.00011128563848734816, + "loss": 0.4988, + "step": 5177 + }, + { + "epoch": 1.6625461550810723, + "grad_norm": 1.0086168050765991, + "learning_rate": 0.00011125095422278486, + "loss": 0.5753, + "step": 5178 + }, + { + "epoch": 1.6628672339059238, + "grad_norm": 0.78431636095047, + "learning_rate": 0.0001112162685873269, + "loss": 0.6154, + "step": 5179 + }, + { + "epoch": 1.6631883127307754, + "grad_norm": 0.7329674959182739, + "learning_rate": 0.00011118158158520064, + "loss": 0.4773, + "step": 5180 + }, + { + "epoch": 1.663509391555627, + "grad_norm": 0.6751540303230286, + "learning_rate": 0.00011114689322063255, + "loss": 0.4195, + "step": 5181 + }, + { + "epoch": 1.6638304703804785, + "grad_norm": 0.8054059147834778, + "learning_rate": 0.00011111220349784937, + "loss": 0.5682, + "step": 5182 + }, + { + "epoch": 1.66415154920533, + "grad_norm": 1.0262163877487183, + "learning_rate": 0.00011107751242107787, + "loss": 0.8966, + "step": 5183 + }, + { + "epoch": 1.6644726280301814, + "grad_norm": 1.2697820663452148, + "learning_rate": 0.00011104281999454511, + "loss": 0.6868, + "step": 5184 + }, + { + "epoch": 1.664793706855033, + "grad_norm": 1.0350483655929565, + "learning_rate": 0.00011100812622247822, + "loss": 0.5431, + "step": 5185 + }, + { + "epoch": 1.6651147856798842, + "grad_norm": 0.7967930436134338, + "learning_rate": 0.00011097343110910452, + "loss": 0.3997, + "step": 5186 + }, + { + "epoch": 1.6654358645047358, + "grad_norm": 1.0638127326965332, + "learning_rate": 0.00011093873465865157, + "loss": 0.8347, + "step": 5187 + }, + { + "epoch": 1.6657569433295873, + "grad_norm": 0.8634060621261597, + "learning_rate": 0.00011090403687534697, + "loss": 0.5585, + "step": 5188 + }, + { + "epoch": 1.666078022154439, + "grad_norm": 1.0660980939865112, + "learning_rate": 0.00011086933776341852, + "loss": 0.722, + "step": 5189 + }, + { + "epoch": 1.6663991009792904, + "grad_norm": 1.0140385627746582, + "learning_rate": 0.00011083463732709425, + "loss": 0.5254, + "step": 5190 + }, + { + "epoch": 1.666720179804142, + "grad_norm": 1.2487151622772217, + "learning_rate": 0.0001107999355706023, + "loss": 0.6787, + "step": 5191 + }, + { + "epoch": 1.6670412586289935, + "grad_norm": 1.3040145635604858, + "learning_rate": 0.00011076523249817094, + "loss": 0.8144, + "step": 5192 + }, + { + "epoch": 1.6673623374538449, + "grad_norm": 0.9779611229896545, + "learning_rate": 0.00011073052811402867, + "loss": 0.6663, + "step": 5193 + }, + { + "epoch": 1.6676834162786964, + "grad_norm": 0.7410480976104736, + "learning_rate": 0.0001106958224224041, + "loss": 0.49, + "step": 5194 + }, + { + "epoch": 1.6680044951035478, + "grad_norm": 1.0676357746124268, + "learning_rate": 0.000110661115427526, + "loss": 0.6552, + "step": 5195 + }, + { + "epoch": 1.6683255739283993, + "grad_norm": 0.8795901536941528, + "learning_rate": 0.00011062640713362333, + "loss": 0.6453, + "step": 5196 + }, + { + "epoch": 1.6686466527532509, + "grad_norm": 0.8016363382339478, + "learning_rate": 0.0001105916975449252, + "loss": 0.5491, + "step": 5197 + }, + { + "epoch": 1.6689677315781024, + "grad_norm": 0.7814456820487976, + "learning_rate": 0.00011055698666566084, + "loss": 0.5246, + "step": 5198 + }, + { + "epoch": 1.669288810402954, + "grad_norm": 1.1860097646713257, + "learning_rate": 0.00011052227450005967, + "loss": 0.5291, + "step": 5199 + }, + { + "epoch": 1.6696098892278055, + "grad_norm": 0.7847801446914673, + "learning_rate": 0.00011048756105235125, + "loss": 0.5715, + "step": 5200 + }, + { + "epoch": 1.669930968052657, + "grad_norm": 0.6763100028038025, + "learning_rate": 0.00011045284632676536, + "loss": 0.4152, + "step": 5201 + }, + { + "epoch": 1.6702520468775084, + "grad_norm": 1.2399777173995972, + "learning_rate": 0.00011041813032753183, + "loss": 0.504, + "step": 5202 + }, + { + "epoch": 1.67057312570236, + "grad_norm": 0.9793505668640137, + "learning_rate": 0.00011038341305888074, + "loss": 0.7147, + "step": 5203 + }, + { + "epoch": 1.6708942045272113, + "grad_norm": 0.6837410926818848, + "learning_rate": 0.00011034869452504226, + "loss": 0.381, + "step": 5204 + }, + { + "epoch": 1.6712152833520628, + "grad_norm": 1.278186321258545, + "learning_rate": 0.00011031397473024674, + "loss": 0.5128, + "step": 5205 + }, + { + "epoch": 1.6715363621769144, + "grad_norm": 0.8159211277961731, + "learning_rate": 0.00011027925367872469, + "loss": 0.5699, + "step": 5206 + }, + { + "epoch": 1.671857441001766, + "grad_norm": 0.7557732462882996, + "learning_rate": 0.00011024453137470677, + "loss": 0.5079, + "step": 5207 + }, + { + "epoch": 1.6721785198266175, + "grad_norm": 0.9018636345863342, + "learning_rate": 0.00011020980782242376, + "loss": 0.533, + "step": 5208 + }, + { + "epoch": 1.672499598651469, + "grad_norm": 0.8341492414474487, + "learning_rate": 0.00011017508302610664, + "loss": 0.5423, + "step": 5209 + }, + { + "epoch": 1.6728206774763206, + "grad_norm": 1.1660737991333008, + "learning_rate": 0.00011014035698998651, + "loss": 0.4989, + "step": 5210 + }, + { + "epoch": 1.673141756301172, + "grad_norm": 0.9293453097343445, + "learning_rate": 0.00011010562971829463, + "loss": 0.4339, + "step": 5211 + }, + { + "epoch": 1.6734628351260235, + "grad_norm": 0.6236379146575928, + "learning_rate": 0.00011007090121526245, + "loss": 0.4249, + "step": 5212 + }, + { + "epoch": 1.6737839139508748, + "grad_norm": 0.8735912442207336, + "learning_rate": 0.00011003617148512149, + "loss": 0.5046, + "step": 5213 + }, + { + "epoch": 1.6741049927757263, + "grad_norm": 1.3508676290512085, + "learning_rate": 0.00011000144053210348, + "loss": 0.3542, + "step": 5214 + }, + { + "epoch": 1.6744260716005779, + "grad_norm": 1.2363876104354858, + "learning_rate": 0.0001099667083604403, + "loss": 0.4835, + "step": 5215 + }, + { + "epoch": 1.6747471504254294, + "grad_norm": 0.650037944316864, + "learning_rate": 0.00010993197497436391, + "loss": 0.7668, + "step": 5216 + }, + { + "epoch": 1.675068229250281, + "grad_norm": 0.6067812442779541, + "learning_rate": 0.00010989724037810652, + "loss": 0.5965, + "step": 5217 + }, + { + "epoch": 1.6753893080751325, + "grad_norm": 0.8295800685882568, + "learning_rate": 0.00010986250457590039, + "loss": 0.4706, + "step": 5218 + }, + { + "epoch": 1.675710386899984, + "grad_norm": 0.9519297480583191, + "learning_rate": 0.00010982776757197799, + "loss": 0.3641, + "step": 5219 + }, + { + "epoch": 1.6760314657248354, + "grad_norm": 0.8043132424354553, + "learning_rate": 0.00010979302937057192, + "loss": 0.3151, + "step": 5220 + }, + { + "epoch": 1.676352544549687, + "grad_norm": 0.8412678837776184, + "learning_rate": 0.00010975828997591495, + "loss": 0.4017, + "step": 5221 + }, + { + "epoch": 1.6766736233745383, + "grad_norm": 0.5943979620933533, + "learning_rate": 0.00010972354939223996, + "loss": 0.1791, + "step": 5222 + }, + { + "epoch": 1.6769947021993898, + "grad_norm": 0.7364015579223633, + "learning_rate": 0.00010968880762377993, + "loss": 0.2705, + "step": 5223 + }, + { + "epoch": 1.6773157810242414, + "grad_norm": 0.9227198362350464, + "learning_rate": 0.00010965406467476808, + "loss": 0.6049, + "step": 5224 + }, + { + "epoch": 1.677636859849093, + "grad_norm": 0.7940647602081299, + "learning_rate": 0.00010961932054943778, + "loss": 0.5055, + "step": 5225 + }, + { + "epoch": 1.6779579386739445, + "grad_norm": 0.8924132585525513, + "learning_rate": 0.00010958457525202241, + "loss": 0.6137, + "step": 5226 + }, + { + "epoch": 1.678279017498796, + "grad_norm": 0.7673733830451965, + "learning_rate": 0.00010954982878675563, + "loss": 0.5312, + "step": 5227 + }, + { + "epoch": 1.6786000963236476, + "grad_norm": 0.7759941816329956, + "learning_rate": 0.00010951508115787119, + "loss": 0.4362, + "step": 5228 + }, + { + "epoch": 1.678921175148499, + "grad_norm": 0.9239017963409424, + "learning_rate": 0.00010948033236960294, + "loss": 0.6432, + "step": 5229 + }, + { + "epoch": 1.6792422539733505, + "grad_norm": 0.9103022813796997, + "learning_rate": 0.00010944558242618496, + "loss": 0.6079, + "step": 5230 + }, + { + "epoch": 1.6795633327982018, + "grad_norm": 1.2982368469238281, + "learning_rate": 0.00010941083133185146, + "loss": 0.7558, + "step": 5231 + }, + { + "epoch": 1.6798844116230534, + "grad_norm": 0.9765358567237854, + "learning_rate": 0.00010937607909083667, + "loss": 0.5457, + "step": 5232 + }, + { + "epoch": 1.680205490447905, + "grad_norm": 0.7929934859275818, + "learning_rate": 0.00010934132570737507, + "loss": 0.5489, + "step": 5233 + }, + { + "epoch": 1.6805265692727565, + "grad_norm": 0.8707826733589172, + "learning_rate": 0.00010930657118570126, + "loss": 0.5285, + "step": 5234 + }, + { + "epoch": 1.680847648097608, + "grad_norm": 0.8279105424880981, + "learning_rate": 0.00010927181553005002, + "loss": 0.6373, + "step": 5235 + }, + { + "epoch": 1.6811687269224596, + "grad_norm": 1.1090543270111084, + "learning_rate": 0.00010923705874465618, + "loss": 0.5366, + "step": 5236 + }, + { + "epoch": 1.681489805747311, + "grad_norm": 0.6475349068641663, + "learning_rate": 0.00010920230083375473, + "loss": 0.4386, + "step": 5237 + }, + { + "epoch": 1.6818108845721624, + "grad_norm": 1.031524419784546, + "learning_rate": 0.00010916754180158082, + "loss": 0.6921, + "step": 5238 + }, + { + "epoch": 1.682131963397014, + "grad_norm": 1.1546660661697388, + "learning_rate": 0.00010913278165236978, + "loss": 0.6612, + "step": 5239 + }, + { + "epoch": 1.6824530422218653, + "grad_norm": 1.4143015146255493, + "learning_rate": 0.00010909802039035701, + "loss": 0.7917, + "step": 5240 + }, + { + "epoch": 1.6827741210467169, + "grad_norm": 1.350466012954712, + "learning_rate": 0.00010906325801977804, + "loss": 0.7994, + "step": 5241 + }, + { + "epoch": 1.6830951998715684, + "grad_norm": 0.7425410747528076, + "learning_rate": 0.00010902849454486856, + "loss": 0.5108, + "step": 5242 + }, + { + "epoch": 1.68341627869642, + "grad_norm": 1.0292056798934937, + "learning_rate": 0.00010899372996986439, + "loss": 0.5466, + "step": 5243 + }, + { + "epoch": 1.6837373575212715, + "grad_norm": 0.9850762486457825, + "learning_rate": 0.00010895896429900154, + "loss": 0.5561, + "step": 5244 + }, + { + "epoch": 1.684058436346123, + "grad_norm": 1.2769955396652222, + "learning_rate": 0.00010892419753651606, + "loss": 0.614, + "step": 5245 + }, + { + "epoch": 1.6843795151709746, + "grad_norm": 0.877348005771637, + "learning_rate": 0.00010888942968664417, + "loss": 0.5857, + "step": 5246 + }, + { + "epoch": 1.684700593995826, + "grad_norm": 1.039420485496521, + "learning_rate": 0.00010885466075362223, + "loss": 0.5347, + "step": 5247 + }, + { + "epoch": 1.6850216728206775, + "grad_norm": 1.2962031364440918, + "learning_rate": 0.00010881989074168673, + "loss": 0.86, + "step": 5248 + }, + { + "epoch": 1.6853427516455288, + "grad_norm": 1.009063720703125, + "learning_rate": 0.00010878511965507434, + "loss": 0.6275, + "step": 5249 + }, + { + "epoch": 1.6856638304703804, + "grad_norm": 0.9977805614471436, + "learning_rate": 0.00010875034749802173, + "loss": 0.6225, + "step": 5250 + }, + { + "epoch": 1.685984909295232, + "grad_norm": 0.8478687405586243, + "learning_rate": 0.00010871557427476583, + "loss": 0.5791, + "step": 5251 + }, + { + "epoch": 1.6863059881200835, + "grad_norm": 1.0061284303665161, + "learning_rate": 0.00010868079998954364, + "loss": 0.5899, + "step": 5252 + }, + { + "epoch": 1.686627066944935, + "grad_norm": 0.7541671395301819, + "learning_rate": 0.0001086460246465923, + "loss": 0.5005, + "step": 5253 + }, + { + "epoch": 1.6869481457697866, + "grad_norm": 0.8530805706977844, + "learning_rate": 0.00010861124825014908, + "loss": 0.5079, + "step": 5254 + }, + { + "epoch": 1.6872692245946381, + "grad_norm": 0.8451884984970093, + "learning_rate": 0.00010857647080445139, + "loss": 0.5617, + "step": 5255 + }, + { + "epoch": 1.6875903034194895, + "grad_norm": 1.1053026914596558, + "learning_rate": 0.00010854169231373676, + "loss": 0.6804, + "step": 5256 + }, + { + "epoch": 1.687911382244341, + "grad_norm": 0.7350571751594543, + "learning_rate": 0.00010850691278224281, + "loss": 0.5118, + "step": 5257 + }, + { + "epoch": 1.6882324610691923, + "grad_norm": 0.7682427167892456, + "learning_rate": 0.00010847213221420736, + "loss": 0.4746, + "step": 5258 + }, + { + "epoch": 1.6885535398940439, + "grad_norm": 1.612838625907898, + "learning_rate": 0.00010843735061386828, + "loss": 0.6223, + "step": 5259 + }, + { + "epoch": 1.6888746187188954, + "grad_norm": 0.8637731671333313, + "learning_rate": 0.00010840256798546364, + "loss": 0.4417, + "step": 5260 + }, + { + "epoch": 1.689195697543747, + "grad_norm": 1.2856574058532715, + "learning_rate": 0.00010836778433323158, + "loss": 0.6902, + "step": 5261 + }, + { + "epoch": 1.6895167763685985, + "grad_norm": 1.089842438697815, + "learning_rate": 0.00010833299966141035, + "loss": 0.5839, + "step": 5262 + }, + { + "epoch": 1.68983785519345, + "grad_norm": 0.7094756364822388, + "learning_rate": 0.0001082982139742384, + "loss": 0.418, + "step": 5263 + }, + { + "epoch": 1.6901589340183016, + "grad_norm": 0.7684889435768127, + "learning_rate": 0.00010826342727595426, + "loss": 0.3724, + "step": 5264 + }, + { + "epoch": 1.690480012843153, + "grad_norm": 1.0270161628723145, + "learning_rate": 0.00010822863957079656, + "loss": 1.0122, + "step": 5265 + }, + { + "epoch": 1.6908010916680045, + "grad_norm": 0.5250154137611389, + "learning_rate": 0.0001081938508630041, + "loss": 0.8635, + "step": 5266 + }, + { + "epoch": 1.6911221704928558, + "grad_norm": 0.612006664276123, + "learning_rate": 0.00010815906115681578, + "loss": 0.8014, + "step": 5267 + }, + { + "epoch": 1.6914432493177074, + "grad_norm": 0.5928871631622314, + "learning_rate": 0.00010812427045647058, + "loss": 0.4363, + "step": 5268 + }, + { + "epoch": 1.691764328142559, + "grad_norm": 0.7500852346420288, + "learning_rate": 0.00010808947876620767, + "loss": 0.4729, + "step": 5269 + }, + { + "epoch": 1.6920854069674105, + "grad_norm": 0.7043389081954956, + "learning_rate": 0.00010805468609026632, + "loss": 0.225, + "step": 5270 + }, + { + "epoch": 1.692406485792262, + "grad_norm": 1.1725765466690063, + "learning_rate": 0.00010801989243288589, + "loss": 0.2793, + "step": 5271 + }, + { + "epoch": 1.6927275646171136, + "grad_norm": 0.7991273403167725, + "learning_rate": 0.0001079850977983059, + "loss": 0.4612, + "step": 5272 + }, + { + "epoch": 1.6930486434419652, + "grad_norm": 0.8243421316146851, + "learning_rate": 0.00010795030219076599, + "loss": 0.5367, + "step": 5273 + }, + { + "epoch": 1.6933697222668165, + "grad_norm": 0.7552120685577393, + "learning_rate": 0.00010791550561450586, + "loss": 0.5766, + "step": 5274 + }, + { + "epoch": 1.693690801091668, + "grad_norm": 0.7614461779594421, + "learning_rate": 0.00010788070807376536, + "loss": 0.5462, + "step": 5275 + }, + { + "epoch": 1.6940118799165194, + "grad_norm": 1.0038031339645386, + "learning_rate": 0.0001078459095727845, + "loss": 0.6166, + "step": 5276 + }, + { + "epoch": 1.694332958741371, + "grad_norm": 0.7951825857162476, + "learning_rate": 0.00010781111011580336, + "loss": 0.5819, + "step": 5277 + }, + { + "epoch": 1.6946540375662225, + "grad_norm": 0.9689728021621704, + "learning_rate": 0.00010777630970706217, + "loss": 0.5629, + "step": 5278 + }, + { + "epoch": 1.694975116391074, + "grad_norm": 1.189405918121338, + "learning_rate": 0.00010774150835080119, + "loss": 0.4832, + "step": 5279 + }, + { + "epoch": 1.6952961952159256, + "grad_norm": 0.8709222674369812, + "learning_rate": 0.00010770670605126092, + "loss": 0.5692, + "step": 5280 + }, + { + "epoch": 1.6956172740407771, + "grad_norm": 0.9132195115089417, + "learning_rate": 0.00010767190281268187, + "loss": 0.4311, + "step": 5281 + }, + { + "epoch": 1.6959383528656287, + "grad_norm": 0.7161700129508972, + "learning_rate": 0.00010763709863930476, + "loss": 0.4397, + "step": 5282 + }, + { + "epoch": 1.69625943169048, + "grad_norm": 0.9493613839149475, + "learning_rate": 0.00010760229353537033, + "loss": 0.6239, + "step": 5283 + }, + { + "epoch": 1.6965805105153315, + "grad_norm": 1.3455016613006592, + "learning_rate": 0.00010756748750511953, + "loss": 0.7217, + "step": 5284 + }, + { + "epoch": 1.6969015893401829, + "grad_norm": 0.9821035265922546, + "learning_rate": 0.00010753268055279329, + "loss": 0.5342, + "step": 5285 + }, + { + "epoch": 1.6972226681650344, + "grad_norm": 0.8994725346565247, + "learning_rate": 0.00010749787268263279, + "loss": 0.6187, + "step": 5286 + }, + { + "epoch": 1.697543746989886, + "grad_norm": 1.6420204639434814, + "learning_rate": 0.00010746306389887924, + "loss": 0.7116, + "step": 5287 + }, + { + "epoch": 1.6978648258147375, + "grad_norm": 1.0258111953735352, + "learning_rate": 0.000107428254205774, + "loss": 0.6124, + "step": 5288 + }, + { + "epoch": 1.698185904639589, + "grad_norm": 1.5440269708633423, + "learning_rate": 0.00010739344360755852, + "loss": 0.6184, + "step": 5289 + }, + { + "epoch": 1.6985069834644406, + "grad_norm": 0.8540660738945007, + "learning_rate": 0.00010735863210847433, + "loss": 0.496, + "step": 5290 + }, + { + "epoch": 1.6988280622892922, + "grad_norm": 0.9297248721122742, + "learning_rate": 0.00010732381971276318, + "loss": 0.5762, + "step": 5291 + }, + { + "epoch": 1.6991491411141435, + "grad_norm": 0.9052714705467224, + "learning_rate": 0.0001072890064246668, + "loss": 0.5208, + "step": 5292 + }, + { + "epoch": 1.699470219938995, + "grad_norm": 0.836807370185852, + "learning_rate": 0.0001072541922484271, + "loss": 0.4136, + "step": 5293 + }, + { + "epoch": 1.6997912987638464, + "grad_norm": 1.4985709190368652, + "learning_rate": 0.0001072193771882861, + "loss": 0.9237, + "step": 5294 + }, + { + "epoch": 1.700112377588698, + "grad_norm": 1.0867552757263184, + "learning_rate": 0.00010718456124848583, + "loss": 0.5195, + "step": 5295 + }, + { + "epoch": 1.7004334564135495, + "grad_norm": 1.109985589981079, + "learning_rate": 0.0001071497444332686, + "loss": 0.7572, + "step": 5296 + }, + { + "epoch": 1.700754535238401, + "grad_norm": 1.2597237825393677, + "learning_rate": 0.00010711492674687671, + "loss": 0.603, + "step": 5297 + }, + { + "epoch": 1.7010756140632526, + "grad_norm": 1.0145076513290405, + "learning_rate": 0.00010708010819355256, + "loss": 0.5953, + "step": 5298 + }, + { + "epoch": 1.7013966928881041, + "grad_norm": 1.2275587320327759, + "learning_rate": 0.0001070452887775387, + "loss": 0.6326, + "step": 5299 + }, + { + "epoch": 1.7017177717129557, + "grad_norm": 0.6137974858283997, + "learning_rate": 0.00010701046850307777, + "loss": 0.3397, + "step": 5300 + }, + { + "epoch": 1.702038850537807, + "grad_norm": 0.7091952562332153, + "learning_rate": 0.00010697564737441252, + "loss": 0.3673, + "step": 5301 + }, + { + "epoch": 1.7023599293626586, + "grad_norm": 0.9978876113891602, + "learning_rate": 0.00010694082539578585, + "loss": 0.5951, + "step": 5302 + }, + { + "epoch": 1.70268100818751, + "grad_norm": 1.3031730651855469, + "learning_rate": 0.00010690600257144061, + "loss": 0.72, + "step": 5303 + }, + { + "epoch": 1.7030020870123614, + "grad_norm": 1.1480228900909424, + "learning_rate": 0.00010687117890561988, + "loss": 0.6365, + "step": 5304 + }, + { + "epoch": 1.703323165837213, + "grad_norm": 0.8998190760612488, + "learning_rate": 0.00010683635440256687, + "loss": 0.4303, + "step": 5305 + }, + { + "epoch": 1.7036442446620645, + "grad_norm": 0.8919349312782288, + "learning_rate": 0.00010680152906652483, + "loss": 0.5207, + "step": 5306 + }, + { + "epoch": 1.703965323486916, + "grad_norm": 1.5924605131149292, + "learning_rate": 0.00010676670290173709, + "loss": 0.96, + "step": 5307 + }, + { + "epoch": 1.7042864023117676, + "grad_norm": 1.0313186645507812, + "learning_rate": 0.00010673187591244714, + "loss": 0.5008, + "step": 5308 + }, + { + "epoch": 1.7046074811366192, + "grad_norm": 0.8978110551834106, + "learning_rate": 0.00010669704810289851, + "loss": 0.5782, + "step": 5309 + }, + { + "epoch": 1.7049285599614705, + "grad_norm": 0.9183505177497864, + "learning_rate": 0.00010666221947733486, + "loss": 0.4536, + "step": 5310 + }, + { + "epoch": 1.705249638786322, + "grad_norm": 1.1860748529434204, + "learning_rate": 0.00010662739004000005, + "loss": 0.5646, + "step": 5311 + }, + { + "epoch": 1.7055707176111734, + "grad_norm": 0.6028177738189697, + "learning_rate": 0.0001065925597951378, + "loss": 0.3459, + "step": 5312 + }, + { + "epoch": 1.705891796436025, + "grad_norm": 0.9374269843101501, + "learning_rate": 0.00010655772874699217, + "loss": 0.4143, + "step": 5313 + }, + { + "epoch": 1.7062128752608765, + "grad_norm": 0.8540414571762085, + "learning_rate": 0.00010652289689980714, + "loss": 0.4658, + "step": 5314 + }, + { + "epoch": 1.706533954085728, + "grad_norm": 0.7660170197486877, + "learning_rate": 0.00010648806425782695, + "loss": 0.687, + "step": 5315 + }, + { + "epoch": 1.7068550329105796, + "grad_norm": 0.5973314642906189, + "learning_rate": 0.00010645323082529581, + "loss": 0.7844, + "step": 5316 + }, + { + "epoch": 1.7071761117354312, + "grad_norm": 0.6430346965789795, + "learning_rate": 0.00010641839660645805, + "loss": 0.6883, + "step": 5317 + }, + { + "epoch": 1.7074971905602827, + "grad_norm": 0.7452960014343262, + "learning_rate": 0.00010638356160555816, + "loss": 0.5505, + "step": 5318 + }, + { + "epoch": 1.707818269385134, + "grad_norm": 0.7918518781661987, + "learning_rate": 0.00010634872582684061, + "loss": 0.5305, + "step": 5319 + }, + { + "epoch": 1.7081393482099856, + "grad_norm": 0.7591331005096436, + "learning_rate": 0.00010631388927455013, + "loss": 0.2967, + "step": 5320 + }, + { + "epoch": 1.708460427034837, + "grad_norm": 0.6316062211990356, + "learning_rate": 0.00010627905195293135, + "loss": 0.3833, + "step": 5321 + }, + { + "epoch": 1.7087815058596885, + "grad_norm": 0.77117919921875, + "learning_rate": 0.00010624421386622916, + "loss": 0.5453, + "step": 5322 + }, + { + "epoch": 1.70910258468454, + "grad_norm": 0.8424499034881592, + "learning_rate": 0.00010620937501868841, + "loss": 0.684, + "step": 5323 + }, + { + "epoch": 1.7094236635093916, + "grad_norm": 0.6682990789413452, + "learning_rate": 0.0001061745354145542, + "loss": 0.4461, + "step": 5324 + }, + { + "epoch": 1.7097447423342431, + "grad_norm": 0.8375151753425598, + "learning_rate": 0.00010613969505807156, + "loss": 0.773, + "step": 5325 + }, + { + "epoch": 1.7100658211590947, + "grad_norm": 0.800901472568512, + "learning_rate": 0.00010610485395348571, + "loss": 0.6591, + "step": 5326 + }, + { + "epoch": 1.7103868999839462, + "grad_norm": 0.9238450527191162, + "learning_rate": 0.00010607001210504191, + "loss": 0.5129, + "step": 5327 + }, + { + "epoch": 1.7107079788087975, + "grad_norm": 0.8702148199081421, + "learning_rate": 0.00010603516951698556, + "loss": 0.5476, + "step": 5328 + }, + { + "epoch": 1.711029057633649, + "grad_norm": 0.7507149577140808, + "learning_rate": 0.00010600032619356209, + "loss": 0.6374, + "step": 5329 + }, + { + "epoch": 1.7113501364585004, + "grad_norm": 0.9397144317626953, + "learning_rate": 0.00010596548213901708, + "loss": 0.579, + "step": 5330 + }, + { + "epoch": 1.711671215283352, + "grad_norm": 0.8215104937553406, + "learning_rate": 0.00010593063735759618, + "loss": 0.726, + "step": 5331 + }, + { + "epoch": 1.7119922941082035, + "grad_norm": 1.175023078918457, + "learning_rate": 0.0001058957918535451, + "loss": 0.6882, + "step": 5332 + }, + { + "epoch": 1.712313372933055, + "grad_norm": 0.8529446125030518, + "learning_rate": 0.00010586094563110964, + "loss": 0.6363, + "step": 5333 + }, + { + "epoch": 1.7126344517579066, + "grad_norm": 1.1046276092529297, + "learning_rate": 0.00010582609869453577, + "loss": 0.6866, + "step": 5334 + }, + { + "epoch": 1.7129555305827582, + "grad_norm": 0.8943831920623779, + "learning_rate": 0.00010579125104806944, + "loss": 0.6993, + "step": 5335 + }, + { + "epoch": 1.7132766094076097, + "grad_norm": 0.9862803220748901, + "learning_rate": 0.00010575640269595675, + "loss": 0.6898, + "step": 5336 + }, + { + "epoch": 1.713597688232461, + "grad_norm": 0.8223783373832703, + "learning_rate": 0.00010572155364244382, + "loss": 0.6043, + "step": 5337 + }, + { + "epoch": 1.7139187670573126, + "grad_norm": 0.6628785729408264, + "learning_rate": 0.00010568670389177696, + "loss": 0.3829, + "step": 5338 + }, + { + "epoch": 1.714239845882164, + "grad_norm": 1.1625560522079468, + "learning_rate": 0.00010565185344820247, + "loss": 0.6914, + "step": 5339 + }, + { + "epoch": 1.7145609247070155, + "grad_norm": 1.0547271966934204, + "learning_rate": 0.00010561700231596678, + "loss": 0.7001, + "step": 5340 + }, + { + "epoch": 1.714882003531867, + "grad_norm": 0.8884708881378174, + "learning_rate": 0.00010558215049931638, + "loss": 0.6464, + "step": 5341 + }, + { + "epoch": 1.7152030823567186, + "grad_norm": 0.7549629807472229, + "learning_rate": 0.00010554729800249792, + "loss": 0.4945, + "step": 5342 + }, + { + "epoch": 1.7155241611815701, + "grad_norm": 0.7881496548652649, + "learning_rate": 0.00010551244482975798, + "loss": 0.6036, + "step": 5343 + }, + { + "epoch": 1.7158452400064217, + "grad_norm": 0.919924259185791, + "learning_rate": 0.00010547759098534335, + "loss": 0.5523, + "step": 5344 + }, + { + "epoch": 1.7161663188312732, + "grad_norm": 1.0688406229019165, + "learning_rate": 0.00010544273647350092, + "loss": 0.4713, + "step": 5345 + }, + { + "epoch": 1.7164873976561246, + "grad_norm": 1.295059323310852, + "learning_rate": 0.00010540788129847756, + "loss": 0.6204, + "step": 5346 + }, + { + "epoch": 1.7168084764809761, + "grad_norm": 1.1029874086380005, + "learning_rate": 0.00010537302546452022, + "loss": 0.7223, + "step": 5347 + }, + { + "epoch": 1.7171295553058274, + "grad_norm": 0.7325824499130249, + "learning_rate": 0.00010533816897587606, + "loss": 0.3234, + "step": 5348 + }, + { + "epoch": 1.717450634130679, + "grad_norm": 0.9354049563407898, + "learning_rate": 0.00010530331183679218, + "loss": 0.5902, + "step": 5349 + }, + { + "epoch": 1.7177717129555305, + "grad_norm": 0.8697565793991089, + "learning_rate": 0.00010526845405151586, + "loss": 0.4462, + "step": 5350 + }, + { + "epoch": 1.718092791780382, + "grad_norm": 0.930233895778656, + "learning_rate": 0.0001052335956242944, + "loss": 0.6314, + "step": 5351 + }, + { + "epoch": 1.7184138706052337, + "grad_norm": 0.7648993730545044, + "learning_rate": 0.00010519873655937516, + "loss": 0.471, + "step": 5352 + }, + { + "epoch": 1.7187349494300852, + "grad_norm": 0.9538902640342712, + "learning_rate": 0.00010516387686100566, + "loss": 0.5847, + "step": 5353 + }, + { + "epoch": 1.7190560282549368, + "grad_norm": 0.749451220035553, + "learning_rate": 0.00010512901653343344, + "loss": 0.4562, + "step": 5354 + }, + { + "epoch": 1.719377107079788, + "grad_norm": 0.8436141610145569, + "learning_rate": 0.00010509415558090609, + "loss": 0.5758, + "step": 5355 + }, + { + "epoch": 1.7196981859046396, + "grad_norm": 1.4007991552352905, + "learning_rate": 0.00010505929400767134, + "loss": 0.6263, + "step": 5356 + }, + { + "epoch": 1.720019264729491, + "grad_norm": 0.9559081196784973, + "learning_rate": 0.00010502443181797697, + "loss": 0.5488, + "step": 5357 + }, + { + "epoch": 1.7203403435543425, + "grad_norm": 0.9755504131317139, + "learning_rate": 0.00010498956901607083, + "loss": 0.6127, + "step": 5358 + }, + { + "epoch": 1.720661422379194, + "grad_norm": 0.9046206474304199, + "learning_rate": 0.00010495470560620083, + "loss": 0.4139, + "step": 5359 + }, + { + "epoch": 1.7209825012040456, + "grad_norm": 0.9097113609313965, + "learning_rate": 0.00010491984159261496, + "loss": 0.5977, + "step": 5360 + }, + { + "epoch": 1.7213035800288972, + "grad_norm": 0.5500661730766296, + "learning_rate": 0.00010488497697956135, + "loss": 0.3321, + "step": 5361 + }, + { + "epoch": 1.7216246588537487, + "grad_norm": 0.9392353296279907, + "learning_rate": 0.00010485011177128807, + "loss": 0.4554, + "step": 5362 + }, + { + "epoch": 1.7219457376786, + "grad_norm": 0.7440015077590942, + "learning_rate": 0.00010481524597204342, + "loss": 0.419, + "step": 5363 + }, + { + "epoch": 1.7222668165034516, + "grad_norm": 0.6105871796607971, + "learning_rate": 0.00010478037958607568, + "loss": 0.4671, + "step": 5364 + }, + { + "epoch": 1.7225878953283031, + "grad_norm": 0.8352953195571899, + "learning_rate": 0.00010474551261763314, + "loss": 0.6749, + "step": 5365 + }, + { + "epoch": 1.7229089741531545, + "grad_norm": 0.5296282768249512, + "learning_rate": 0.00010471064507096426, + "loss": 0.7636, + "step": 5366 + }, + { + "epoch": 1.723230052978006, + "grad_norm": 0.717930793762207, + "learning_rate": 0.00010467577695031762, + "loss": 0.3502, + "step": 5367 + }, + { + "epoch": 1.7235511318028576, + "grad_norm": 0.6417704820632935, + "learning_rate": 0.00010464090825994173, + "loss": 0.2754, + "step": 5368 + }, + { + "epoch": 1.7238722106277091, + "grad_norm": 0.6707653999328613, + "learning_rate": 0.00010460603900408523, + "loss": 0.1641, + "step": 5369 + }, + { + "epoch": 1.7241932894525607, + "grad_norm": 2.9203500747680664, + "learning_rate": 0.00010457116918699688, + "loss": 0.4299, + "step": 5370 + }, + { + "epoch": 1.7245143682774122, + "grad_norm": 0.8975229859352112, + "learning_rate": 0.00010453629881292538, + "loss": 0.689, + "step": 5371 + }, + { + "epoch": 1.7248354471022636, + "grad_norm": 0.8682465553283691, + "learning_rate": 0.00010450142788611965, + "loss": 0.6525, + "step": 5372 + }, + { + "epoch": 1.725156525927115, + "grad_norm": 1.0204635858535767, + "learning_rate": 0.00010446655641082862, + "loss": 0.6479, + "step": 5373 + }, + { + "epoch": 1.7254776047519667, + "grad_norm": 0.8298991918563843, + "learning_rate": 0.00010443168439130122, + "loss": 0.4763, + "step": 5374 + }, + { + "epoch": 1.725798683576818, + "grad_norm": 0.9152095913887024, + "learning_rate": 0.0001043968118317865, + "loss": 0.6022, + "step": 5375 + }, + { + "epoch": 1.7261197624016695, + "grad_norm": 0.5959125757217407, + "learning_rate": 0.00010436193873653361, + "loss": 0.414, + "step": 5376 + }, + { + "epoch": 1.726440841226521, + "grad_norm": 1.1426783800125122, + "learning_rate": 0.00010432706510979171, + "loss": 0.6795, + "step": 5377 + }, + { + "epoch": 1.7267619200513726, + "grad_norm": 0.8990039825439453, + "learning_rate": 0.00010429219095581007, + "loss": 0.6957, + "step": 5378 + }, + { + "epoch": 1.7270829988762242, + "grad_norm": 0.9482988119125366, + "learning_rate": 0.00010425731627883797, + "loss": 0.575, + "step": 5379 + }, + { + "epoch": 1.7274040777010757, + "grad_norm": 0.8645418286323547, + "learning_rate": 0.0001042224410831248, + "loss": 0.5164, + "step": 5380 + }, + { + "epoch": 1.727725156525927, + "grad_norm": 0.8730982542037964, + "learning_rate": 0.00010418756537291996, + "loss": 0.5756, + "step": 5381 + }, + { + "epoch": 1.7280462353507786, + "grad_norm": 0.8636887669563293, + "learning_rate": 0.00010415268915247303, + "loss": 0.6095, + "step": 5382 + }, + { + "epoch": 1.7283673141756302, + "grad_norm": 1.0389565229415894, + "learning_rate": 0.00010411781242603352, + "loss": 0.6065, + "step": 5383 + }, + { + "epoch": 1.7286883930004815, + "grad_norm": 1.2318190336227417, + "learning_rate": 0.00010408293519785101, + "loss": 0.6049, + "step": 5384 + }, + { + "epoch": 1.729009471825333, + "grad_norm": 1.0228158235549927, + "learning_rate": 0.00010404805747217526, + "loss": 0.7854, + "step": 5385 + }, + { + "epoch": 1.7293305506501846, + "grad_norm": 0.8208790421485901, + "learning_rate": 0.00010401317925325598, + "loss": 0.5619, + "step": 5386 + }, + { + "epoch": 1.7296516294750361, + "grad_norm": 0.8615589737892151, + "learning_rate": 0.000103978300545343, + "loss": 0.5876, + "step": 5387 + }, + { + "epoch": 1.7299727082998877, + "grad_norm": 0.7015345096588135, + "learning_rate": 0.00010394342135268613, + "loss": 0.4422, + "step": 5388 + }, + { + "epoch": 1.7302937871247392, + "grad_norm": 0.7104688286781311, + "learning_rate": 0.00010390854167953537, + "loss": 0.5317, + "step": 5389 + }, + { + "epoch": 1.7306148659495906, + "grad_norm": 1.4996254444122314, + "learning_rate": 0.00010387366153014062, + "loss": 0.6955, + "step": 5390 + }, + { + "epoch": 1.7309359447744421, + "grad_norm": 1.07838773727417, + "learning_rate": 0.00010383878090875201, + "loss": 0.4943, + "step": 5391 + }, + { + "epoch": 1.7312570235992937, + "grad_norm": 1.10397207736969, + "learning_rate": 0.00010380389981961958, + "loss": 0.8043, + "step": 5392 + }, + { + "epoch": 1.731578102424145, + "grad_norm": 1.2021815776824951, + "learning_rate": 0.00010376901826699348, + "loss": 0.5963, + "step": 5393 + }, + { + "epoch": 1.7318991812489966, + "grad_norm": 0.8675143122673035, + "learning_rate": 0.00010373413625512394, + "loss": 0.5774, + "step": 5394 + }, + { + "epoch": 1.732220260073848, + "grad_norm": 1.3191437721252441, + "learning_rate": 0.0001036992537882612, + "loss": 0.7901, + "step": 5395 + }, + { + "epoch": 1.7325413388986997, + "grad_norm": 0.9630816578865051, + "learning_rate": 0.00010366437087065564, + "loss": 0.6245, + "step": 5396 + }, + { + "epoch": 1.7328624177235512, + "grad_norm": 0.8633376359939575, + "learning_rate": 0.00010362948750655759, + "loss": 0.6033, + "step": 5397 + }, + { + "epoch": 1.7331834965484028, + "grad_norm": 1.0866016149520874, + "learning_rate": 0.0001035946037002175, + "loss": 0.6458, + "step": 5398 + }, + { + "epoch": 1.733504575373254, + "grad_norm": 1.0186740159988403, + "learning_rate": 0.00010355971945588585, + "loss": 0.6645, + "step": 5399 + }, + { + "epoch": 1.7338256541981056, + "grad_norm": 1.092057228088379, + "learning_rate": 0.0001035248347778132, + "loss": 0.5861, + "step": 5400 + }, + { + "epoch": 1.7341467330229572, + "grad_norm": 1.0183131694793701, + "learning_rate": 0.00010348994967025012, + "loss": 0.518, + "step": 5401 + }, + { + "epoch": 1.7344678118478085, + "grad_norm": 0.9326373338699341, + "learning_rate": 0.00010345506413744726, + "loss": 0.5674, + "step": 5402 + }, + { + "epoch": 1.73478889067266, + "grad_norm": 0.947853684425354, + "learning_rate": 0.0001034201781836553, + "loss": 0.5808, + "step": 5403 + }, + { + "epoch": 1.7351099694975116, + "grad_norm": 0.948047935962677, + "learning_rate": 0.00010338529181312497, + "loss": 0.4687, + "step": 5404 + }, + { + "epoch": 1.7354310483223632, + "grad_norm": 1.0557464361190796, + "learning_rate": 0.00010335040503010716, + "loss": 0.5766, + "step": 5405 + }, + { + "epoch": 1.7357521271472147, + "grad_norm": 1.128117322921753, + "learning_rate": 0.00010331551783885263, + "loss": 0.5139, + "step": 5406 + }, + { + "epoch": 1.7360732059720663, + "grad_norm": 0.9319320917129517, + "learning_rate": 0.00010328063024361232, + "loss": 0.5615, + "step": 5407 + }, + { + "epoch": 1.7363942847969176, + "grad_norm": 1.243923544883728, + "learning_rate": 0.00010324574224863717, + "loss": 0.6024, + "step": 5408 + }, + { + "epoch": 1.7367153636217691, + "grad_norm": 1.1786973476409912, + "learning_rate": 0.00010321085385817817, + "loss": 0.5292, + "step": 5409 + }, + { + "epoch": 1.7370364424466207, + "grad_norm": 1.0594767332077026, + "learning_rate": 0.00010317596507648636, + "loss": 0.5848, + "step": 5410 + }, + { + "epoch": 1.737357521271472, + "grad_norm": 0.8402320742607117, + "learning_rate": 0.00010314107590781284, + "loss": 0.4564, + "step": 5411 + }, + { + "epoch": 1.7376786000963236, + "grad_norm": 1.0813262462615967, + "learning_rate": 0.00010310618635640876, + "loss": 0.465, + "step": 5412 + }, + { + "epoch": 1.7379996789211751, + "grad_norm": 0.6268415451049805, + "learning_rate": 0.00010307129642652528, + "loss": 0.4441, + "step": 5413 + }, + { + "epoch": 1.7383207577460267, + "grad_norm": 0.6250267028808594, + "learning_rate": 0.00010303640612241363, + "loss": 0.3773, + "step": 5414 + }, + { + "epoch": 1.7386418365708782, + "grad_norm": 0.6857768893241882, + "learning_rate": 0.00010300151544832512, + "loss": 0.6693, + "step": 5415 + }, + { + "epoch": 1.7389629153957298, + "grad_norm": 0.5000283122062683, + "learning_rate": 0.00010296662440851108, + "loss": 0.6734, + "step": 5416 + }, + { + "epoch": 1.739283994220581, + "grad_norm": 0.5204218626022339, + "learning_rate": 0.00010293173300722285, + "loss": 0.6129, + "step": 5417 + }, + { + "epoch": 1.7396050730454327, + "grad_norm": 0.6385371685028076, + "learning_rate": 0.00010289684124871182, + "loss": 0.6554, + "step": 5418 + }, + { + "epoch": 1.7399261518702842, + "grad_norm": 0.5886783599853516, + "learning_rate": 0.00010286194913722948, + "loss": 0.1828, + "step": 5419 + }, + { + "epoch": 1.7402472306951355, + "grad_norm": 0.9899042844772339, + "learning_rate": 0.00010282705667702734, + "loss": 0.3291, + "step": 5420 + }, + { + "epoch": 1.740568309519987, + "grad_norm": 0.702520489692688, + "learning_rate": 0.0001027921638723569, + "loss": 0.3839, + "step": 5421 + }, + { + "epoch": 1.7408893883448386, + "grad_norm": 0.5995388627052307, + "learning_rate": 0.00010275727072746977, + "loss": 0.2942, + "step": 5422 + }, + { + "epoch": 1.7412104671696902, + "grad_norm": 1.077636957168579, + "learning_rate": 0.00010272237724661753, + "loss": 0.7508, + "step": 5423 + }, + { + "epoch": 1.7415315459945417, + "grad_norm": 0.8297303915023804, + "learning_rate": 0.00010268748343405192, + "loss": 0.6525, + "step": 5424 + }, + { + "epoch": 1.7418526248193933, + "grad_norm": 0.8193151950836182, + "learning_rate": 0.0001026525892940246, + "loss": 0.6898, + "step": 5425 + }, + { + "epoch": 1.7421737036442446, + "grad_norm": 1.1228710412979126, + "learning_rate": 0.00010261769483078733, + "loss": 0.7282, + "step": 5426 + }, + { + "epoch": 1.7424947824690962, + "grad_norm": 0.6626234650611877, + "learning_rate": 0.00010258280004859188, + "loss": 0.4688, + "step": 5427 + }, + { + "epoch": 1.7428158612939475, + "grad_norm": 1.0781296491622925, + "learning_rate": 0.00010254790495169006, + "loss": 0.5331, + "step": 5428 + }, + { + "epoch": 1.743136940118799, + "grad_norm": 0.7310062646865845, + "learning_rate": 0.00010251300954433376, + "loss": 0.5715, + "step": 5429 + }, + { + "epoch": 1.7434580189436506, + "grad_norm": 0.6906360387802124, + "learning_rate": 0.00010247811383077487, + "loss": 0.474, + "step": 5430 + }, + { + "epoch": 1.7437790977685022, + "grad_norm": 0.8053506016731262, + "learning_rate": 0.00010244321781526533, + "loss": 0.5084, + "step": 5431 + }, + { + "epoch": 1.7441001765933537, + "grad_norm": 0.930061399936676, + "learning_rate": 0.0001024083215020571, + "loss": 0.6584, + "step": 5432 + }, + { + "epoch": 1.7444212554182053, + "grad_norm": 1.089030146598816, + "learning_rate": 0.0001023734248954022, + "loss": 0.6163, + "step": 5433 + }, + { + "epoch": 1.7447423342430568, + "grad_norm": 0.9254440665245056, + "learning_rate": 0.00010233852799955268, + "loss": 0.6622, + "step": 5434 + }, + { + "epoch": 1.7450634130679081, + "grad_norm": 0.9075636267662048, + "learning_rate": 0.00010230363081876064, + "loss": 0.6482, + "step": 5435 + }, + { + "epoch": 1.7453844918927597, + "grad_norm": 1.084458589553833, + "learning_rate": 0.00010226873335727816, + "loss": 0.6901, + "step": 5436 + }, + { + "epoch": 1.745705570717611, + "grad_norm": 1.6565884351730347, + "learning_rate": 0.00010223383561935738, + "loss": 0.597, + "step": 5437 + }, + { + "epoch": 1.7460266495424626, + "grad_norm": 0.997448742389679, + "learning_rate": 0.00010219893760925052, + "loss": 0.6115, + "step": 5438 + }, + { + "epoch": 1.7463477283673141, + "grad_norm": 0.9751862287521362, + "learning_rate": 0.00010216403933120979, + "loss": 0.5718, + "step": 5439 + }, + { + "epoch": 1.7466688071921657, + "grad_norm": 0.9030267596244812, + "learning_rate": 0.0001021291407894874, + "loss": 0.6273, + "step": 5440 + }, + { + "epoch": 1.7469898860170172, + "grad_norm": 1.0032966136932373, + "learning_rate": 0.0001020942419883357, + "loss": 0.6734, + "step": 5441 + }, + { + "epoch": 1.7473109648418688, + "grad_norm": 0.7883462309837341, + "learning_rate": 0.00010205934293200696, + "loss": 0.4985, + "step": 5442 + }, + { + "epoch": 1.7476320436667203, + "grad_norm": 1.4333808422088623, + "learning_rate": 0.00010202444362475352, + "loss": 0.5427, + "step": 5443 + }, + { + "epoch": 1.7479531224915716, + "grad_norm": 0.9737802147865295, + "learning_rate": 0.0001019895440708278, + "loss": 0.6445, + "step": 5444 + }, + { + "epoch": 1.7482742013164232, + "grad_norm": 0.8467430472373962, + "learning_rate": 0.00010195464427448213, + "loss": 0.5475, + "step": 5445 + }, + { + "epoch": 1.7485952801412745, + "grad_norm": 0.855104923248291, + "learning_rate": 0.000101919744239969, + "loss": 0.5532, + "step": 5446 + }, + { + "epoch": 1.748916358966126, + "grad_norm": 1.96385657787323, + "learning_rate": 0.00010188484397154084, + "loss": 0.6679, + "step": 5447 + }, + { + "epoch": 1.7492374377909776, + "grad_norm": 0.9854167699813843, + "learning_rate": 0.00010184994347345016, + "loss": 0.5125, + "step": 5448 + }, + { + "epoch": 1.7495585166158292, + "grad_norm": 1.1345828771591187, + "learning_rate": 0.00010181504274994948, + "loss": 0.577, + "step": 5449 + }, + { + "epoch": 1.7498795954406807, + "grad_norm": 0.941738486289978, + "learning_rate": 0.00010178014180529136, + "loss": 0.4594, + "step": 5450 + }, + { + "epoch": 1.7502006742655323, + "grad_norm": 0.9602155685424805, + "learning_rate": 0.00010174524064372837, + "loss": 0.7691, + "step": 5451 + }, + { + "epoch": 1.7505217530903838, + "grad_norm": 1.012502908706665, + "learning_rate": 0.00010171033926951305, + "loss": 0.6743, + "step": 5452 + }, + { + "epoch": 1.7508428319152352, + "grad_norm": 1.0123480558395386, + "learning_rate": 0.00010167543768689815, + "loss": 0.5418, + "step": 5453 + }, + { + "epoch": 1.7511639107400867, + "grad_norm": 0.9171655774116516, + "learning_rate": 0.00010164053590013622, + "loss": 0.4985, + "step": 5454 + }, + { + "epoch": 1.751484989564938, + "grad_norm": 0.9921781420707703, + "learning_rate": 0.00010160563391347996, + "loss": 0.6584, + "step": 5455 + }, + { + "epoch": 1.7518060683897896, + "grad_norm": 1.2568939924240112, + "learning_rate": 0.00010157073173118208, + "loss": 0.7265, + "step": 5456 + }, + { + "epoch": 1.7521271472146411, + "grad_norm": 1.0861806869506836, + "learning_rate": 0.0001015358293574953, + "loss": 0.6727, + "step": 5457 + }, + { + "epoch": 1.7524482260394927, + "grad_norm": 0.9609785676002502, + "learning_rate": 0.00010150092679667238, + "loss": 0.52, + "step": 5458 + }, + { + "epoch": 1.7527693048643442, + "grad_norm": 1.046840786933899, + "learning_rate": 0.00010146602405296607, + "loss": 0.6536, + "step": 5459 + }, + { + "epoch": 1.7530903836891958, + "grad_norm": 1.095132827758789, + "learning_rate": 0.00010143112113062918, + "loss": 0.467, + "step": 5460 + }, + { + "epoch": 1.7534114625140473, + "grad_norm": 0.7272664904594421, + "learning_rate": 0.00010139621803391455, + "loss": 0.4159, + "step": 5461 + }, + { + "epoch": 1.7537325413388987, + "grad_norm": 0.7782832980155945, + "learning_rate": 0.00010136131476707496, + "loss": 0.5137, + "step": 5462 + }, + { + "epoch": 1.7540536201637502, + "grad_norm": 1.0231236219406128, + "learning_rate": 0.0001013264113343633, + "loss": 0.5287, + "step": 5463 + }, + { + "epoch": 1.7543746989886015, + "grad_norm": 0.7744680643081665, + "learning_rate": 0.00010129150774003245, + "loss": 0.4127, + "step": 5464 + }, + { + "epoch": 1.754695777813453, + "grad_norm": 0.7057910561561584, + "learning_rate": 0.00010125660398833528, + "loss": 0.6325, + "step": 5465 + }, + { + "epoch": 1.7550168566383046, + "grad_norm": 0.5882181525230408, + "learning_rate": 0.00010122170008352472, + "loss": 1.0208, + "step": 5466 + }, + { + "epoch": 1.7553379354631562, + "grad_norm": 0.5369687080383301, + "learning_rate": 0.00010118679602985373, + "loss": 0.6156, + "step": 5467 + }, + { + "epoch": 1.7556590142880077, + "grad_norm": 0.8291602730751038, + "learning_rate": 0.00010115189183157523, + "loss": 0.4136, + "step": 5468 + }, + { + "epoch": 1.7559800931128593, + "grad_norm": 0.6269962787628174, + "learning_rate": 0.00010111698749294223, + "loss": 0.2929, + "step": 5469 + }, + { + "epoch": 1.7563011719377108, + "grad_norm": 0.68831467628479, + "learning_rate": 0.00010108208301820767, + "loss": 0.2129, + "step": 5470 + }, + { + "epoch": 1.7566222507625622, + "grad_norm": 0.7378066778182983, + "learning_rate": 0.00010104717841162458, + "loss": 0.1843, + "step": 5471 + }, + { + "epoch": 1.7569433295874137, + "grad_norm": 0.9951292276382446, + "learning_rate": 0.000101012273677446, + "loss": 0.5378, + "step": 5472 + }, + { + "epoch": 1.757264408412265, + "grad_norm": 0.9426655173301697, + "learning_rate": 0.00010097736881992492, + "loss": 0.6999, + "step": 5473 + }, + { + "epoch": 1.7575854872371166, + "grad_norm": 0.7657575607299805, + "learning_rate": 0.00010094246384331442, + "loss": 0.5443, + "step": 5474 + }, + { + "epoch": 1.7579065660619682, + "grad_norm": 1.0222439765930176, + "learning_rate": 0.00010090755875186753, + "loss": 0.8449, + "step": 5475 + }, + { + "epoch": 1.7582276448868197, + "grad_norm": 0.7700451016426086, + "learning_rate": 0.0001008726535498374, + "loss": 0.729, + "step": 5476 + }, + { + "epoch": 1.7585487237116713, + "grad_norm": 0.9244915246963501, + "learning_rate": 0.00010083774824147708, + "loss": 0.595, + "step": 5477 + }, + { + "epoch": 1.7588698025365228, + "grad_norm": 0.7973812222480774, + "learning_rate": 0.00010080284283103965, + "loss": 0.6733, + "step": 5478 + }, + { + "epoch": 1.7591908813613744, + "grad_norm": 0.7147784233093262, + "learning_rate": 0.00010076793732277829, + "loss": 0.5151, + "step": 5479 + }, + { + "epoch": 1.7595119601862257, + "grad_norm": 0.7087793946266174, + "learning_rate": 0.00010073303172094606, + "loss": 0.4801, + "step": 5480 + }, + { + "epoch": 1.7598330390110772, + "grad_norm": 0.9641126394271851, + "learning_rate": 0.00010069812602979615, + "loss": 0.5686, + "step": 5481 + }, + { + "epoch": 1.7601541178359286, + "grad_norm": 0.755591630935669, + "learning_rate": 0.00010066322025358173, + "loss": 0.4766, + "step": 5482 + }, + { + "epoch": 1.7604751966607801, + "grad_norm": 0.9040724635124207, + "learning_rate": 0.00010062831439655591, + "loss": 0.578, + "step": 5483 + }, + { + "epoch": 1.7607962754856317, + "grad_norm": 0.9179210066795349, + "learning_rate": 0.00010059340846297189, + "loss": 0.7127, + "step": 5484 + }, + { + "epoch": 1.7611173543104832, + "grad_norm": 0.9386594891548157, + "learning_rate": 0.00010055850245708283, + "loss": 0.6266, + "step": 5485 + }, + { + "epoch": 1.7614384331353348, + "grad_norm": 0.9094860553741455, + "learning_rate": 0.00010052359638314195, + "loss": 0.5722, + "step": 5486 + }, + { + "epoch": 1.7617595119601863, + "grad_norm": 1.1115118265151978, + "learning_rate": 0.00010048869024540247, + "loss": 0.6238, + "step": 5487 + }, + { + "epoch": 1.7620805907850379, + "grad_norm": 0.83982914686203, + "learning_rate": 0.00010045378404811756, + "loss": 0.6233, + "step": 5488 + }, + { + "epoch": 1.7624016696098892, + "grad_norm": 1.0576977729797363, + "learning_rate": 0.0001004188777955404, + "loss": 0.7304, + "step": 5489 + }, + { + "epoch": 1.7627227484347407, + "grad_norm": 0.9903758764266968, + "learning_rate": 0.00010038397149192426, + "loss": 0.5852, + "step": 5490 + }, + { + "epoch": 1.763043827259592, + "grad_norm": 1.2004951238632202, + "learning_rate": 0.00010034906514152238, + "loss": 0.6996, + "step": 5491 + }, + { + "epoch": 1.7633649060844436, + "grad_norm": 1.251308798789978, + "learning_rate": 0.00010031415874858797, + "loss": 0.7741, + "step": 5492 + }, + { + "epoch": 1.7636859849092952, + "grad_norm": 1.0517253875732422, + "learning_rate": 0.00010027925231737428, + "loss": 0.4877, + "step": 5493 + }, + { + "epoch": 1.7640070637341467, + "grad_norm": 1.2016655206680298, + "learning_rate": 0.00010024434585213451, + "loss": 0.6398, + "step": 5494 + }, + { + "epoch": 1.7643281425589983, + "grad_norm": 1.0655850172042847, + "learning_rate": 0.00010020943935712192, + "loss": 0.5722, + "step": 5495 + }, + { + "epoch": 1.7646492213838498, + "grad_norm": 1.114829182624817, + "learning_rate": 0.00010017453283658984, + "loss": 0.5166, + "step": 5496 + }, + { + "epoch": 1.7649703002087014, + "grad_norm": 0.8498656153678894, + "learning_rate": 0.00010013962629479146, + "loss": 0.5585, + "step": 5497 + }, + { + "epoch": 1.7652913790335527, + "grad_norm": 0.9619441628456116, + "learning_rate": 0.00010010471973598002, + "loss": 0.6372, + "step": 5498 + }, + { + "epoch": 1.7656124578584043, + "grad_norm": 0.7192568778991699, + "learning_rate": 0.00010006981316440875, + "loss": 0.4638, + "step": 5499 + }, + { + "epoch": 1.7659335366832556, + "grad_norm": 0.8864771127700806, + "learning_rate": 0.00010003490658433101, + "loss": 0.6032, + "step": 5500 + }, + { + "epoch": 1.7662546155081071, + "grad_norm": 1.0561635494232178, + "learning_rate": 0.0001, + "loss": 0.5758, + "step": 5501 + }, + { + "epoch": 1.7665756943329587, + "grad_norm": 0.9847275018692017, + "learning_rate": 9.996509341566904e-05, + "loss": 0.5311, + "step": 5502 + }, + { + "epoch": 1.7668967731578102, + "grad_norm": 0.7542006373405457, + "learning_rate": 9.993018683559126e-05, + "loss": 0.4759, + "step": 5503 + }, + { + "epoch": 1.7672178519826618, + "grad_norm": 1.13614821434021, + "learning_rate": 9.989528026402003e-05, + "loss": 0.4959, + "step": 5504 + }, + { + "epoch": 1.7675389308075133, + "grad_norm": 0.8526762127876282, + "learning_rate": 9.986037370520857e-05, + "loss": 0.5481, + "step": 5505 + }, + { + "epoch": 1.767860009632365, + "grad_norm": 0.771710216999054, + "learning_rate": 9.98254671634102e-05, + "loss": 0.5153, + "step": 5506 + }, + { + "epoch": 1.7681810884572162, + "grad_norm": 0.8657694458961487, + "learning_rate": 9.979056064287806e-05, + "loss": 0.5242, + "step": 5507 + }, + { + "epoch": 1.7685021672820678, + "grad_norm": 0.9876973628997803, + "learning_rate": 9.975565414786551e-05, + "loss": 0.5711, + "step": 5508 + }, + { + "epoch": 1.768823246106919, + "grad_norm": 0.7581015229225159, + "learning_rate": 9.972074768262576e-05, + "loss": 0.5151, + "step": 5509 + }, + { + "epoch": 1.7691443249317707, + "grad_norm": 0.7137047052383423, + "learning_rate": 9.968584125141204e-05, + "loss": 0.3444, + "step": 5510 + }, + { + "epoch": 1.7694654037566222, + "grad_norm": 0.4818606376647949, + "learning_rate": 9.965093485847767e-05, + "loss": 0.3025, + "step": 5511 + }, + { + "epoch": 1.7697864825814738, + "grad_norm": 0.567258894443512, + "learning_rate": 9.961602850807576e-05, + "loss": 0.3447, + "step": 5512 + }, + { + "epoch": 1.7701075614063253, + "grad_norm": 0.9824522137641907, + "learning_rate": 9.958112220445963e-05, + "loss": 0.3899, + "step": 5513 + }, + { + "epoch": 1.7704286402311769, + "grad_norm": 0.9963672161102295, + "learning_rate": 9.954621595188247e-05, + "loss": 0.4292, + "step": 5514 + }, + { + "epoch": 1.7707497190560284, + "grad_norm": 0.6725934147834778, + "learning_rate": 9.951130975459757e-05, + "loss": 0.8659, + "step": 5515 + }, + { + "epoch": 1.7710707978808797, + "grad_norm": 0.6370208263397217, + "learning_rate": 9.947640361685804e-05, + "loss": 1.0013, + "step": 5516 + }, + { + "epoch": 1.7713918767057313, + "grad_norm": 0.7263522744178772, + "learning_rate": 9.94414975429172e-05, + "loss": 0.1642, + "step": 5517 + }, + { + "epoch": 1.7717129555305826, + "grad_norm": 0.8269543647766113, + "learning_rate": 9.940659153702813e-05, + "loss": 0.3893, + "step": 5518 + }, + { + "epoch": 1.7720340343554342, + "grad_norm": 0.6357668042182922, + "learning_rate": 9.937168560344412e-05, + "loss": 0.1586, + "step": 5519 + }, + { + "epoch": 1.7723551131802857, + "grad_norm": 0.5424615144729614, + "learning_rate": 9.933677974641831e-05, + "loss": 0.2211, + "step": 5520 + }, + { + "epoch": 1.7726761920051373, + "grad_norm": 0.8307551741600037, + "learning_rate": 9.930187397020386e-05, + "loss": 0.5533, + "step": 5521 + }, + { + "epoch": 1.7729972708299888, + "grad_norm": 0.8896917104721069, + "learning_rate": 9.926696827905395e-05, + "loss": 0.5725, + "step": 5522 + }, + { + "epoch": 1.7733183496548404, + "grad_norm": 1.096503496170044, + "learning_rate": 9.923206267722173e-05, + "loss": 0.8626, + "step": 5523 + }, + { + "epoch": 1.773639428479692, + "grad_norm": 0.8159449696540833, + "learning_rate": 9.919715716896036e-05, + "loss": 0.6556, + "step": 5524 + }, + { + "epoch": 1.7739605073045432, + "grad_norm": 0.8842390775680542, + "learning_rate": 9.916225175852293e-05, + "loss": 0.6443, + "step": 5525 + }, + { + "epoch": 1.7742815861293948, + "grad_norm": 0.7416340708732605, + "learning_rate": 9.912734645016263e-05, + "loss": 0.4824, + "step": 5526 + }, + { + "epoch": 1.7746026649542461, + "grad_norm": 0.804015040397644, + "learning_rate": 9.909244124813247e-05, + "loss": 0.4857, + "step": 5527 + }, + { + "epoch": 1.7749237437790977, + "grad_norm": 0.7028852105140686, + "learning_rate": 9.90575361566856e-05, + "loss": 0.4962, + "step": 5528 + }, + { + "epoch": 1.7752448226039492, + "grad_norm": 0.8853591084480286, + "learning_rate": 9.902263118007513e-05, + "loss": 0.6105, + "step": 5529 + }, + { + "epoch": 1.7755659014288008, + "grad_norm": 0.9714551568031311, + "learning_rate": 9.898772632255403e-05, + "loss": 0.6712, + "step": 5530 + }, + { + "epoch": 1.7758869802536523, + "grad_norm": 0.8881363272666931, + "learning_rate": 9.895282158837545e-05, + "loss": 0.5934, + "step": 5531 + }, + { + "epoch": 1.7762080590785039, + "grad_norm": 1.1298437118530273, + "learning_rate": 9.891791698179235e-05, + "loss": 0.6977, + "step": 5532 + }, + { + "epoch": 1.7765291379033554, + "grad_norm": 0.7341294288635254, + "learning_rate": 9.888301250705779e-05, + "loss": 0.522, + "step": 5533 + }, + { + "epoch": 1.7768502167282068, + "grad_norm": 0.9081159234046936, + "learning_rate": 9.884810816842475e-05, + "loss": 0.6074, + "step": 5534 + }, + { + "epoch": 1.7771712955530583, + "grad_norm": 1.22751784324646, + "learning_rate": 9.881320397014629e-05, + "loss": 0.7584, + "step": 5535 + }, + { + "epoch": 1.7774923743779096, + "grad_norm": 0.7882629632949829, + "learning_rate": 9.877829991647528e-05, + "loss": 0.4237, + "step": 5536 + }, + { + "epoch": 1.7778134532027612, + "grad_norm": 0.7696181535720825, + "learning_rate": 9.874339601166473e-05, + "loss": 0.3827, + "step": 5537 + }, + { + "epoch": 1.7781345320276127, + "grad_norm": 1.0486514568328857, + "learning_rate": 9.87084922599676e-05, + "loss": 0.4434, + "step": 5538 + }, + { + "epoch": 1.7784556108524643, + "grad_norm": 0.6344658732414246, + "learning_rate": 9.867358866563673e-05, + "loss": 0.3858, + "step": 5539 + }, + { + "epoch": 1.7787766896773158, + "grad_norm": 0.9014244079589844, + "learning_rate": 9.863868523292509e-05, + "loss": 0.6374, + "step": 5540 + }, + { + "epoch": 1.7790977685021674, + "grad_norm": 0.9677129983901978, + "learning_rate": 9.860378196608549e-05, + "loss": 0.652, + "step": 5541 + }, + { + "epoch": 1.779418847327019, + "grad_norm": 1.439113736152649, + "learning_rate": 9.856887886937083e-05, + "loss": 0.7972, + "step": 5542 + }, + { + "epoch": 1.7797399261518703, + "grad_norm": 1.0146809816360474, + "learning_rate": 9.853397594703394e-05, + "loss": 0.5427, + "step": 5543 + }, + { + "epoch": 1.7800610049767218, + "grad_norm": 1.0371732711791992, + "learning_rate": 9.849907320332766e-05, + "loss": 0.526, + "step": 5544 + }, + { + "epoch": 1.7803820838015731, + "grad_norm": 0.9912870526313782, + "learning_rate": 9.846417064250471e-05, + "loss": 0.6929, + "step": 5545 + }, + { + "epoch": 1.7807031626264247, + "grad_norm": 0.9019957780838013, + "learning_rate": 9.842926826881796e-05, + "loss": 0.5505, + "step": 5546 + }, + { + "epoch": 1.7810242414512762, + "grad_norm": 0.9842864274978638, + "learning_rate": 9.839436608652007e-05, + "loss": 0.6031, + "step": 5547 + }, + { + "epoch": 1.7813453202761278, + "grad_norm": 1.212700605392456, + "learning_rate": 9.83594640998638e-05, + "loss": 0.5913, + "step": 5548 + }, + { + "epoch": 1.7816663991009793, + "grad_norm": 1.029821753501892, + "learning_rate": 9.832456231310189e-05, + "loss": 0.546, + "step": 5549 + }, + { + "epoch": 1.781987477925831, + "grad_norm": 0.77622389793396, + "learning_rate": 9.828966073048693e-05, + "loss": 0.4759, + "step": 5550 + }, + { + "epoch": 1.7823085567506824, + "grad_norm": 0.9894078373908997, + "learning_rate": 9.825475935627165e-05, + "loss": 0.6129, + "step": 5551 + }, + { + "epoch": 1.7826296355755338, + "grad_norm": 0.7435919642448425, + "learning_rate": 9.821985819470863e-05, + "loss": 0.5363, + "step": 5552 + }, + { + "epoch": 1.7829507144003853, + "grad_norm": 0.7949634790420532, + "learning_rate": 9.818495725005054e-05, + "loss": 0.492, + "step": 5553 + }, + { + "epoch": 1.7832717932252367, + "grad_norm": 0.7472631931304932, + "learning_rate": 9.815005652654985e-05, + "loss": 0.4438, + "step": 5554 + }, + { + "epoch": 1.7835928720500882, + "grad_norm": 1.0186270475387573, + "learning_rate": 9.81151560284592e-05, + "loss": 0.602, + "step": 5555 + }, + { + "epoch": 1.7839139508749398, + "grad_norm": 0.807303786277771, + "learning_rate": 9.808025576003104e-05, + "loss": 0.5087, + "step": 5556 + }, + { + "epoch": 1.7842350296997913, + "grad_norm": 0.872272253036499, + "learning_rate": 9.804535572551789e-05, + "loss": 0.5801, + "step": 5557 + }, + { + "epoch": 1.7845561085246429, + "grad_norm": 0.9321494102478027, + "learning_rate": 9.801045592917226e-05, + "loss": 0.4829, + "step": 5558 + }, + { + "epoch": 1.7848771873494944, + "grad_norm": 1.4904452562332153, + "learning_rate": 9.797555637524649e-05, + "loss": 0.5219, + "step": 5559 + }, + { + "epoch": 1.785198266174346, + "grad_norm": 1.060113787651062, + "learning_rate": 9.794065706799306e-05, + "loss": 0.641, + "step": 5560 + }, + { + "epoch": 1.7855193449991973, + "grad_norm": 0.7635378241539001, + "learning_rate": 9.790575801166432e-05, + "loss": 0.4252, + "step": 5561 + }, + { + "epoch": 1.7858404238240488, + "grad_norm": 0.670326292514801, + "learning_rate": 9.78708592105126e-05, + "loss": 0.3945, + "step": 5562 + }, + { + "epoch": 1.7861615026489002, + "grad_norm": 0.8592783212661743, + "learning_rate": 9.783596066879022e-05, + "loss": 0.3788, + "step": 5563 + }, + { + "epoch": 1.7864825814737517, + "grad_norm": 0.7472242712974548, + "learning_rate": 9.78010623907495e-05, + "loss": 0.353, + "step": 5564 + }, + { + "epoch": 1.7868036602986033, + "grad_norm": 0.5249154567718506, + "learning_rate": 9.776616438064264e-05, + "loss": 0.6485, + "step": 5565 + }, + { + "epoch": 1.7871247391234548, + "grad_norm": 0.6192828416824341, + "learning_rate": 9.773126664272186e-05, + "loss": 0.9866, + "step": 5566 + }, + { + "epoch": 1.7874458179483064, + "grad_norm": 0.5821790099143982, + "learning_rate": 9.76963691812394e-05, + "loss": 0.3801, + "step": 5567 + }, + { + "epoch": 1.787766896773158, + "grad_norm": 0.7041050791740417, + "learning_rate": 9.766147200044732e-05, + "loss": 0.2845, + "step": 5568 + }, + { + "epoch": 1.7880879755980095, + "grad_norm": 0.7531072497367859, + "learning_rate": 9.762657510459783e-05, + "loss": 0.2994, + "step": 5569 + }, + { + "epoch": 1.7884090544228608, + "grad_norm": 0.6308495998382568, + "learning_rate": 9.759167849794292e-05, + "loss": 0.1686, + "step": 5570 + }, + { + "epoch": 1.7887301332477124, + "grad_norm": 0.5816466808319092, + "learning_rate": 9.755678218473469e-05, + "loss": 0.2938, + "step": 5571 + }, + { + "epoch": 1.7890512120725637, + "grad_norm": 0.8111523389816284, + "learning_rate": 9.752188616922518e-05, + "loss": 0.7485, + "step": 5572 + }, + { + "epoch": 1.7893722908974152, + "grad_norm": 0.9488875865936279, + "learning_rate": 9.748699045566626e-05, + "loss": 0.6345, + "step": 5573 + }, + { + "epoch": 1.7896933697222668, + "grad_norm": 0.9654393196105957, + "learning_rate": 9.745209504830996e-05, + "loss": 0.669, + "step": 5574 + }, + { + "epoch": 1.7900144485471183, + "grad_norm": 0.9864271879196167, + "learning_rate": 9.741719995140814e-05, + "loss": 0.6693, + "step": 5575 + }, + { + "epoch": 1.7903355273719699, + "grad_norm": 0.7702199220657349, + "learning_rate": 9.73823051692127e-05, + "loss": 0.6806, + "step": 5576 + }, + { + "epoch": 1.7906566061968214, + "grad_norm": 0.7644592523574829, + "learning_rate": 9.734741070597539e-05, + "loss": 0.6401, + "step": 5577 + }, + { + "epoch": 1.790977685021673, + "grad_norm": 0.8069849610328674, + "learning_rate": 9.73125165659481e-05, + "loss": 0.5884, + "step": 5578 + }, + { + "epoch": 1.7912987638465243, + "grad_norm": 1.0152876377105713, + "learning_rate": 9.727762275338246e-05, + "loss": 0.5491, + "step": 5579 + }, + { + "epoch": 1.7916198426713759, + "grad_norm": 0.7692520618438721, + "learning_rate": 9.724272927253025e-05, + "loss": 0.5617, + "step": 5580 + }, + { + "epoch": 1.7919409214962272, + "grad_norm": 0.7524055242538452, + "learning_rate": 9.720783612764314e-05, + "loss": 0.535, + "step": 5581 + }, + { + "epoch": 1.7922620003210787, + "grad_norm": 0.7909640669822693, + "learning_rate": 9.717294332297268e-05, + "loss": 0.4679, + "step": 5582 + }, + { + "epoch": 1.7925830791459303, + "grad_norm": 1.540922999382019, + "learning_rate": 9.713805086277054e-05, + "loss": 0.6385, + "step": 5583 + }, + { + "epoch": 1.7929041579707818, + "grad_norm": 1.7500687837600708, + "learning_rate": 9.710315875128819e-05, + "loss": 0.6036, + "step": 5584 + }, + { + "epoch": 1.7932252367956334, + "grad_norm": 0.8995754718780518, + "learning_rate": 9.706826699277718e-05, + "loss": 0.7047, + "step": 5585 + }, + { + "epoch": 1.793546315620485, + "grad_norm": 0.9600510001182556, + "learning_rate": 9.703337559148892e-05, + "loss": 0.4654, + "step": 5586 + }, + { + "epoch": 1.7938673944453365, + "grad_norm": 3.3275082111358643, + "learning_rate": 9.699848455167489e-05, + "loss": 0.7396, + "step": 5587 + }, + { + "epoch": 1.7941884732701878, + "grad_norm": 1.0256354808807373, + "learning_rate": 9.696359387758636e-05, + "loss": 0.7023, + "step": 5588 + }, + { + "epoch": 1.7945095520950394, + "grad_norm": 0.8502017855644226, + "learning_rate": 9.692870357347473e-05, + "loss": 0.4102, + "step": 5589 + }, + { + "epoch": 1.7948306309198907, + "grad_norm": 0.9669037461280823, + "learning_rate": 9.689381364359129e-05, + "loss": 0.668, + "step": 5590 + }, + { + "epoch": 1.7951517097447423, + "grad_norm": 1.028659462928772, + "learning_rate": 9.685892409218717e-05, + "loss": 0.493, + "step": 5591 + }, + { + "epoch": 1.7954727885695938, + "grad_norm": 0.9129486680030823, + "learning_rate": 9.682403492351369e-05, + "loss": 0.6085, + "step": 5592 + }, + { + "epoch": 1.7957938673944454, + "grad_norm": 1.1581437587738037, + "learning_rate": 9.678914614182185e-05, + "loss": 0.6057, + "step": 5593 + }, + { + "epoch": 1.796114946219297, + "grad_norm": 1.030382513999939, + "learning_rate": 9.675425775136286e-05, + "loss": 0.6529, + "step": 5594 + }, + { + "epoch": 1.7964360250441485, + "grad_norm": 0.7557877898216248, + "learning_rate": 9.671936975638768e-05, + "loss": 0.417, + "step": 5595 + }, + { + "epoch": 1.796757103869, + "grad_norm": 0.8263580799102783, + "learning_rate": 9.668448216114739e-05, + "loss": 0.4406, + "step": 5596 + }, + { + "epoch": 1.7970781826938513, + "grad_norm": 1.1736435890197754, + "learning_rate": 9.664959496989285e-05, + "loss": 0.7772, + "step": 5597 + }, + { + "epoch": 1.7973992615187029, + "grad_norm": 0.8506044149398804, + "learning_rate": 9.661470818687503e-05, + "loss": 0.541, + "step": 5598 + }, + { + "epoch": 1.7977203403435542, + "grad_norm": 1.1008727550506592, + "learning_rate": 9.657982181634475e-05, + "loss": 0.6642, + "step": 5599 + }, + { + "epoch": 1.7980414191684058, + "grad_norm": 1.1743552684783936, + "learning_rate": 9.654493586255278e-05, + "loss": 0.6096, + "step": 5600 + }, + { + "epoch": 1.7983624979932573, + "grad_norm": 0.5854097604751587, + "learning_rate": 9.651005032974994e-05, + "loss": 0.3086, + "step": 5601 + }, + { + "epoch": 1.7986835768181089, + "grad_norm": 1.1236282587051392, + "learning_rate": 9.647516522218683e-05, + "loss": 0.3826, + "step": 5602 + }, + { + "epoch": 1.7990046556429604, + "grad_norm": 1.1320160627365112, + "learning_rate": 9.644028054411416e-05, + "loss": 0.6647, + "step": 5603 + }, + { + "epoch": 1.799325734467812, + "grad_norm": 0.9424121975898743, + "learning_rate": 9.64053962997825e-05, + "loss": 0.4697, + "step": 5604 + }, + { + "epoch": 1.7996468132926633, + "grad_norm": 0.765177845954895, + "learning_rate": 9.637051249344243e-05, + "loss": 0.401, + "step": 5605 + }, + { + "epoch": 1.7999678921175148, + "grad_norm": 0.8307339549064636, + "learning_rate": 9.633562912934436e-05, + "loss": 0.3938, + "step": 5606 + }, + { + "epoch": 1.8002889709423664, + "grad_norm": 1.0224002599716187, + "learning_rate": 9.630074621173883e-05, + "loss": 0.6176, + "step": 5607 + }, + { + "epoch": 1.8006100497672177, + "grad_norm": 1.1550755500793457, + "learning_rate": 9.62658637448761e-05, + "loss": 0.4616, + "step": 5608 + }, + { + "epoch": 1.8009311285920693, + "grad_norm": 0.8593536615371704, + "learning_rate": 9.623098173300654e-05, + "loss": 0.443, + "step": 5609 + }, + { + "epoch": 1.8012522074169208, + "grad_norm": 1.4475479125976562, + "learning_rate": 9.619610018038048e-05, + "loss": 0.6054, + "step": 5610 + }, + { + "epoch": 1.8015732862417724, + "grad_norm": 1.1326322555541992, + "learning_rate": 9.616121909124801e-05, + "loss": 0.3655, + "step": 5611 + }, + { + "epoch": 1.801894365066624, + "grad_norm": 0.7792235016822815, + "learning_rate": 9.612633846985941e-05, + "loss": 0.4951, + "step": 5612 + }, + { + "epoch": 1.8022154438914755, + "grad_norm": 0.576023280620575, + "learning_rate": 9.609145832046465e-05, + "loss": 0.3493, + "step": 5613 + }, + { + "epoch": 1.8025365227163268, + "grad_norm": 0.7932665944099426, + "learning_rate": 9.605657864731388e-05, + "loss": 0.3994, + "step": 5614 + }, + { + "epoch": 1.8028576015411784, + "grad_norm": 0.6708626747131348, + "learning_rate": 9.602169945465702e-05, + "loss": 0.5825, + "step": 5615 + }, + { + "epoch": 1.80317868036603, + "grad_norm": 0.534807026386261, + "learning_rate": 9.598682074674405e-05, + "loss": 0.8931, + "step": 5616 + }, + { + "epoch": 1.8034997591908812, + "grad_norm": 0.5789874196052551, + "learning_rate": 9.595194252782477e-05, + "loss": 0.3229, + "step": 5617 + }, + { + "epoch": 1.8038208380157328, + "grad_norm": 0.7923842668533325, + "learning_rate": 9.591706480214901e-05, + "loss": 0.4642, + "step": 5618 + }, + { + "epoch": 1.8041419168405843, + "grad_norm": 0.6035703420639038, + "learning_rate": 9.588218757396655e-05, + "loss": 0.36, + "step": 5619 + }, + { + "epoch": 1.8044629956654359, + "grad_norm": 0.6047983765602112, + "learning_rate": 9.584731084752699e-05, + "loss": 0.2533, + "step": 5620 + }, + { + "epoch": 1.8047840744902874, + "grad_norm": 0.8523005843162537, + "learning_rate": 9.581243462708006e-05, + "loss": 0.2647, + "step": 5621 + }, + { + "epoch": 1.805105153315139, + "grad_norm": 0.5535145998001099, + "learning_rate": 9.577755891687523e-05, + "loss": 0.137, + "step": 5622 + }, + { + "epoch": 1.8054262321399903, + "grad_norm": 0.7448154091835022, + "learning_rate": 9.574268372116205e-05, + "loss": 0.4489, + "step": 5623 + }, + { + "epoch": 1.8057473109648419, + "grad_norm": 1.0387903451919556, + "learning_rate": 9.570780904418993e-05, + "loss": 0.7846, + "step": 5624 + }, + { + "epoch": 1.8060683897896934, + "grad_norm": 0.9373582005500793, + "learning_rate": 9.567293489020831e-05, + "loss": 0.7087, + "step": 5625 + }, + { + "epoch": 1.8063894686145447, + "grad_norm": 0.9567878842353821, + "learning_rate": 9.563806126346642e-05, + "loss": 0.5429, + "step": 5626 + }, + { + "epoch": 1.8067105474393963, + "grad_norm": 1.1631771326065063, + "learning_rate": 9.560318816821353e-05, + "loss": 0.778, + "step": 5627 + }, + { + "epoch": 1.8070316262642478, + "grad_norm": 0.7982397675514221, + "learning_rate": 9.556831560869882e-05, + "loss": 0.6363, + "step": 5628 + }, + { + "epoch": 1.8073527050890994, + "grad_norm": 0.7375602126121521, + "learning_rate": 9.55334435891714e-05, + "loss": 0.5203, + "step": 5629 + }, + { + "epoch": 1.807673783913951, + "grad_norm": 0.8686415553092957, + "learning_rate": 9.549857211388037e-05, + "loss": 0.6616, + "step": 5630 + }, + { + "epoch": 1.8079948627388025, + "grad_norm": 0.9008349776268005, + "learning_rate": 9.546370118707463e-05, + "loss": 0.7094, + "step": 5631 + }, + { + "epoch": 1.8083159415636538, + "grad_norm": 0.7942934036254883, + "learning_rate": 9.542883081300316e-05, + "loss": 0.5465, + "step": 5632 + }, + { + "epoch": 1.8086370203885054, + "grad_norm": 0.939593493938446, + "learning_rate": 9.539396099591476e-05, + "loss": 0.6448, + "step": 5633 + }, + { + "epoch": 1.808958099213357, + "grad_norm": 0.8011131286621094, + "learning_rate": 9.53590917400583e-05, + "loss": 0.5164, + "step": 5634 + }, + { + "epoch": 1.8092791780382083, + "grad_norm": 0.973583459854126, + "learning_rate": 9.532422304968243e-05, + "loss": 0.6263, + "step": 5635 + }, + { + "epoch": 1.8096002568630598, + "grad_norm": 0.788236677646637, + "learning_rate": 9.528935492903575e-05, + "loss": 0.4855, + "step": 5636 + }, + { + "epoch": 1.8099213356879114, + "grad_norm": 0.9553484916687012, + "learning_rate": 9.525448738236691e-05, + "loss": 0.7004, + "step": 5637 + }, + { + "epoch": 1.810242414512763, + "grad_norm": 0.9981603622436523, + "learning_rate": 9.521962041392436e-05, + "loss": 0.64, + "step": 5638 + }, + { + "epoch": 1.8105634933376145, + "grad_norm": 0.7514563798904419, + "learning_rate": 9.518475402795661e-05, + "loss": 0.5227, + "step": 5639 + }, + { + "epoch": 1.810884572162466, + "grad_norm": 0.8019845485687256, + "learning_rate": 9.514988822871193e-05, + "loss": 0.4965, + "step": 5640 + }, + { + "epoch": 1.8112056509873173, + "grad_norm": 0.9751089811325073, + "learning_rate": 9.511502302043868e-05, + "loss": 0.4515, + "step": 5641 + }, + { + "epoch": 1.811526729812169, + "grad_norm": 1.0139288902282715, + "learning_rate": 9.508015840738503e-05, + "loss": 0.6349, + "step": 5642 + }, + { + "epoch": 1.8118478086370204, + "grad_norm": 0.7157825231552124, + "learning_rate": 9.504529439379921e-05, + "loss": 0.4664, + "step": 5643 + }, + { + "epoch": 1.8121688874618718, + "grad_norm": 0.942258358001709, + "learning_rate": 9.501043098392924e-05, + "loss": 0.5903, + "step": 5644 + }, + { + "epoch": 1.8124899662867233, + "grad_norm": 0.9379916787147522, + "learning_rate": 9.497556818202306e-05, + "loss": 0.432, + "step": 5645 + }, + { + "epoch": 1.8128110451115749, + "grad_norm": 0.9485198259353638, + "learning_rate": 9.494070599232868e-05, + "loss": 0.5559, + "step": 5646 + }, + { + "epoch": 1.8131321239364264, + "grad_norm": 1.1162331104278564, + "learning_rate": 9.490584441909392e-05, + "loss": 0.613, + "step": 5647 + }, + { + "epoch": 1.813453202761278, + "grad_norm": 1.1678434610366821, + "learning_rate": 9.48709834665666e-05, + "loss": 0.5808, + "step": 5648 + }, + { + "epoch": 1.8137742815861295, + "grad_norm": 0.8702452182769775, + "learning_rate": 9.483612313899435e-05, + "loss": 0.583, + "step": 5649 + }, + { + "epoch": 1.8140953604109809, + "grad_norm": 1.071449637413025, + "learning_rate": 9.480126344062487e-05, + "loss": 0.5968, + "step": 5650 + }, + { + "epoch": 1.8144164392358324, + "grad_norm": 1.1288082599639893, + "learning_rate": 9.476640437570562e-05, + "loss": 0.5764, + "step": 5651 + }, + { + "epoch": 1.814737518060684, + "grad_norm": 1.0096368789672852, + "learning_rate": 9.473154594848415e-05, + "loss": 0.5777, + "step": 5652 + }, + { + "epoch": 1.8150585968855353, + "grad_norm": 0.9093414545059204, + "learning_rate": 9.469668816320784e-05, + "loss": 0.5467, + "step": 5653 + }, + { + "epoch": 1.8153796757103868, + "grad_norm": 0.9198897480964661, + "learning_rate": 9.466183102412395e-05, + "loss": 0.5423, + "step": 5654 + }, + { + "epoch": 1.8157007545352384, + "grad_norm": 0.9140397310256958, + "learning_rate": 9.462697453547979e-05, + "loss": 0.5262, + "step": 5655 + }, + { + "epoch": 1.81602183336009, + "grad_norm": 1.1737873554229736, + "learning_rate": 9.459211870152245e-05, + "loss": 0.572, + "step": 5656 + }, + { + "epoch": 1.8163429121849415, + "grad_norm": 0.6453090906143188, + "learning_rate": 9.455726352649911e-05, + "loss": 0.3562, + "step": 5657 + }, + { + "epoch": 1.816663991009793, + "grad_norm": 0.9809330105781555, + "learning_rate": 9.452240901465663e-05, + "loss": 0.4382, + "step": 5658 + }, + { + "epoch": 1.8169850698346444, + "grad_norm": 1.092984914779663, + "learning_rate": 9.448755517024206e-05, + "loss": 0.5466, + "step": 5659 + }, + { + "epoch": 1.817306148659496, + "grad_norm": 0.7490816712379456, + "learning_rate": 9.445270199750212e-05, + "loss": 0.4304, + "step": 5660 + }, + { + "epoch": 1.8176272274843472, + "grad_norm": 0.6709809303283691, + "learning_rate": 9.441784950068362e-05, + "loss": 0.3968, + "step": 5661 + }, + { + "epoch": 1.8179483063091988, + "grad_norm": 0.8209671378135681, + "learning_rate": 9.438299768403327e-05, + "loss": 0.4619, + "step": 5662 + }, + { + "epoch": 1.8182693851340503, + "grad_norm": 0.5862700939178467, + "learning_rate": 9.434814655179755e-05, + "loss": 0.3755, + "step": 5663 + }, + { + "epoch": 1.818590463958902, + "grad_norm": 0.5392091870307922, + "learning_rate": 9.43132961082231e-05, + "loss": 0.3564, + "step": 5664 + }, + { + "epoch": 1.8189115427837534, + "grad_norm": 0.6820465326309204, + "learning_rate": 9.427844635755619e-05, + "loss": 0.6728, + "step": 5665 + }, + { + "epoch": 1.819232621608605, + "grad_norm": 0.4257480800151825, + "learning_rate": 9.424359730404329e-05, + "loss": 0.4742, + "step": 5666 + }, + { + "epoch": 1.8195537004334565, + "grad_norm": 0.7783001065254211, + "learning_rate": 9.420874895193056e-05, + "loss": 0.3928, + "step": 5667 + }, + { + "epoch": 1.8198747792583079, + "grad_norm": 0.7277984619140625, + "learning_rate": 9.417390130546426e-05, + "loss": 0.4529, + "step": 5668 + }, + { + "epoch": 1.8201958580831594, + "grad_norm": 0.4899481236934662, + "learning_rate": 9.413905436889035e-05, + "loss": 0.2177, + "step": 5669 + }, + { + "epoch": 1.8205169369080108, + "grad_norm": 0.7720094323158264, + "learning_rate": 9.410420814645493e-05, + "loss": 0.2418, + "step": 5670 + }, + { + "epoch": 1.8208380157328623, + "grad_norm": 0.7921152114868164, + "learning_rate": 9.406936264240386e-05, + "loss": 0.3818, + "step": 5671 + }, + { + "epoch": 1.8211590945577139, + "grad_norm": 0.8348581194877625, + "learning_rate": 9.403451786098294e-05, + "loss": 0.5024, + "step": 5672 + }, + { + "epoch": 1.8214801733825654, + "grad_norm": 0.9092313051223755, + "learning_rate": 9.399967380643796e-05, + "loss": 0.6206, + "step": 5673 + }, + { + "epoch": 1.821801252207417, + "grad_norm": 0.9595973491668701, + "learning_rate": 9.396483048301448e-05, + "loss": 0.5598, + "step": 5674 + }, + { + "epoch": 1.8221223310322685, + "grad_norm": 0.7221679091453552, + "learning_rate": 9.392998789495811e-05, + "loss": 0.463, + "step": 5675 + }, + { + "epoch": 1.82244340985712, + "grad_norm": 0.6640863418579102, + "learning_rate": 9.38951460465143e-05, + "loss": 0.4028, + "step": 5676 + }, + { + "epoch": 1.8227644886819714, + "grad_norm": 0.8685281872749329, + "learning_rate": 9.386030494192846e-05, + "loss": 0.4908, + "step": 5677 + }, + { + "epoch": 1.823085567506823, + "grad_norm": 0.8399313688278198, + "learning_rate": 9.382546458544582e-05, + "loss": 0.6339, + "step": 5678 + }, + { + "epoch": 1.8234066463316743, + "grad_norm": 0.8233359456062317, + "learning_rate": 9.37906249813116e-05, + "loss": 0.5707, + "step": 5679 + }, + { + "epoch": 1.8237277251565258, + "grad_norm": 0.9246627688407898, + "learning_rate": 9.375578613377089e-05, + "loss": 0.5633, + "step": 5680 + }, + { + "epoch": 1.8240488039813774, + "grad_norm": 0.9575353860855103, + "learning_rate": 9.372094804706867e-05, + "loss": 0.622, + "step": 5681 + }, + { + "epoch": 1.824369882806229, + "grad_norm": 0.6907000541687012, + "learning_rate": 9.368611072544992e-05, + "loss": 0.4413, + "step": 5682 + }, + { + "epoch": 1.8246909616310805, + "grad_norm": 0.8728267550468445, + "learning_rate": 9.36512741731594e-05, + "loss": 0.4828, + "step": 5683 + }, + { + "epoch": 1.825012040455932, + "grad_norm": 0.8942257165908813, + "learning_rate": 9.361643839444188e-05, + "loss": 0.6054, + "step": 5684 + }, + { + "epoch": 1.8253331192807836, + "grad_norm": 1.4165688753128052, + "learning_rate": 9.358160339354194e-05, + "loss": 0.6781, + "step": 5685 + }, + { + "epoch": 1.825654198105635, + "grad_norm": 1.0946552753448486, + "learning_rate": 9.354676917470422e-05, + "loss": 0.6444, + "step": 5686 + }, + { + "epoch": 1.8259752769304864, + "grad_norm": 0.9636126756668091, + "learning_rate": 9.351193574217306e-05, + "loss": 0.5896, + "step": 5687 + }, + { + "epoch": 1.8262963557553378, + "grad_norm": 0.9439002871513367, + "learning_rate": 9.347710310019288e-05, + "loss": 0.5716, + "step": 5688 + }, + { + "epoch": 1.8266174345801893, + "grad_norm": 1.3570852279663086, + "learning_rate": 9.344227125300788e-05, + "loss": 0.5469, + "step": 5689 + }, + { + "epoch": 1.8269385134050409, + "grad_norm": 1.0195239782333374, + "learning_rate": 9.340744020486222e-05, + "loss": 0.6181, + "step": 5690 + }, + { + "epoch": 1.8272595922298924, + "grad_norm": 0.9529807567596436, + "learning_rate": 9.337260996000002e-05, + "loss": 0.5538, + "step": 5691 + }, + { + "epoch": 1.827580671054744, + "grad_norm": 1.627726674079895, + "learning_rate": 9.333778052266513e-05, + "loss": 0.6368, + "step": 5692 + }, + { + "epoch": 1.8279017498795955, + "grad_norm": 1.326042652130127, + "learning_rate": 9.330295189710152e-05, + "loss": 0.6902, + "step": 5693 + }, + { + "epoch": 1.828222828704447, + "grad_norm": 0.8660145998001099, + "learning_rate": 9.32681240875529e-05, + "loss": 0.5452, + "step": 5694 + }, + { + "epoch": 1.8285439075292984, + "grad_norm": 0.9196065664291382, + "learning_rate": 9.323329709826294e-05, + "loss": 0.5653, + "step": 5695 + }, + { + "epoch": 1.82886498635415, + "grad_norm": 0.7746536135673523, + "learning_rate": 9.319847093347522e-05, + "loss": 0.4199, + "step": 5696 + }, + { + "epoch": 1.8291860651790013, + "grad_norm": 0.7608351707458496, + "learning_rate": 9.316364559743314e-05, + "loss": 0.4165, + "step": 5697 + }, + { + "epoch": 1.8295071440038528, + "grad_norm": 0.7243719696998596, + "learning_rate": 9.312882109438013e-05, + "loss": 0.4451, + "step": 5698 + }, + { + "epoch": 1.8298282228287044, + "grad_norm": 1.2507250308990479, + "learning_rate": 9.309399742855942e-05, + "loss": 0.7069, + "step": 5699 + }, + { + "epoch": 1.830149301653556, + "grad_norm": 0.8393792510032654, + "learning_rate": 9.30591746042142e-05, + "loss": 0.4762, + "step": 5700 + }, + { + "epoch": 1.8304703804784075, + "grad_norm": 0.8377349376678467, + "learning_rate": 9.302435262558747e-05, + "loss": 0.4423, + "step": 5701 + }, + { + "epoch": 1.830791459303259, + "grad_norm": 0.9344905614852905, + "learning_rate": 9.298953149692225e-05, + "loss": 0.5336, + "step": 5702 + }, + { + "epoch": 1.8311125381281106, + "grad_norm": 0.8581468462944031, + "learning_rate": 9.295471122246131e-05, + "loss": 0.5264, + "step": 5703 + }, + { + "epoch": 1.831433616952962, + "grad_norm": 0.9583359956741333, + "learning_rate": 9.291989180644747e-05, + "loss": 0.5861, + "step": 5704 + }, + { + "epoch": 1.8317546957778135, + "grad_norm": 1.2595868110656738, + "learning_rate": 9.288507325312335e-05, + "loss": 0.5907, + "step": 5705 + }, + { + "epoch": 1.8320757746026648, + "grad_norm": 0.8175548315048218, + "learning_rate": 9.285025556673141e-05, + "loss": 0.5518, + "step": 5706 + }, + { + "epoch": 1.8323968534275163, + "grad_norm": 1.2646697759628296, + "learning_rate": 9.281543875151419e-05, + "loss": 0.5245, + "step": 5707 + }, + { + "epoch": 1.832717932252368, + "grad_norm": 0.8525310754776001, + "learning_rate": 9.278062281171393e-05, + "loss": 0.5834, + "step": 5708 + }, + { + "epoch": 1.8330390110772194, + "grad_norm": 1.1546502113342285, + "learning_rate": 9.274580775157294e-05, + "loss": 0.6688, + "step": 5709 + }, + { + "epoch": 1.833360089902071, + "grad_norm": 0.5779723525047302, + "learning_rate": 9.271099357533321e-05, + "loss": 0.3218, + "step": 5710 + }, + { + "epoch": 1.8336811687269226, + "grad_norm": 0.8706488013267517, + "learning_rate": 9.267618028723686e-05, + "loss": 0.4382, + "step": 5711 + }, + { + "epoch": 1.834002247551774, + "grad_norm": 0.9869682788848877, + "learning_rate": 9.264136789152567e-05, + "loss": 0.3801, + "step": 5712 + }, + { + "epoch": 1.8343233263766254, + "grad_norm": 0.890150785446167, + "learning_rate": 9.26065563924415e-05, + "loss": 0.5917, + "step": 5713 + }, + { + "epoch": 1.834644405201477, + "grad_norm": 0.7428485155105591, + "learning_rate": 9.257174579422605e-05, + "loss": 0.4025, + "step": 5714 + }, + { + "epoch": 1.8349654840263283, + "grad_norm": 0.6964389085769653, + "learning_rate": 9.253693610112078e-05, + "loss": 0.5247, + "step": 5715 + }, + { + "epoch": 1.8352865628511799, + "grad_norm": 0.4811036288738251, + "learning_rate": 9.250212731736726e-05, + "loss": 0.7248, + "step": 5716 + }, + { + "epoch": 1.8356076416760314, + "grad_norm": 0.4891749322414398, + "learning_rate": 9.246731944720675e-05, + "loss": 0.5256, + "step": 5717 + }, + { + "epoch": 1.835928720500883, + "grad_norm": 0.6794456839561462, + "learning_rate": 9.243251249488052e-05, + "loss": 0.3973, + "step": 5718 + }, + { + "epoch": 1.8362497993257345, + "grad_norm": 0.5766389966011047, + "learning_rate": 9.239770646462968e-05, + "loss": 0.2345, + "step": 5719 + }, + { + "epoch": 1.836570878150586, + "grad_norm": 0.8062403202056885, + "learning_rate": 9.236290136069528e-05, + "loss": 0.5047, + "step": 5720 + }, + { + "epoch": 1.8368919569754376, + "grad_norm": 0.6303856372833252, + "learning_rate": 9.232809718731814e-05, + "loss": 0.2518, + "step": 5721 + }, + { + "epoch": 1.837213035800289, + "grad_norm": 1.0692548751831055, + "learning_rate": 9.229329394873911e-05, + "loss": 0.8745, + "step": 5722 + }, + { + "epoch": 1.8375341146251405, + "grad_norm": 0.8411319255828857, + "learning_rate": 9.225849164919885e-05, + "loss": 0.5388, + "step": 5723 + }, + { + "epoch": 1.8378551934499918, + "grad_norm": 0.8031153678894043, + "learning_rate": 9.222369029293787e-05, + "loss": 0.6806, + "step": 5724 + }, + { + "epoch": 1.8381762722748434, + "grad_norm": 0.6173913478851318, + "learning_rate": 9.218888988419668e-05, + "loss": 0.4643, + "step": 5725 + }, + { + "epoch": 1.838497351099695, + "grad_norm": 0.7925007939338684, + "learning_rate": 9.215409042721552e-05, + "loss": 0.5676, + "step": 5726 + }, + { + "epoch": 1.8388184299245465, + "grad_norm": 0.9955210089683533, + "learning_rate": 9.211929192623467e-05, + "loss": 0.7278, + "step": 5727 + }, + { + "epoch": 1.839139508749398, + "grad_norm": 0.7641330361366272, + "learning_rate": 9.208449438549415e-05, + "loss": 0.4462, + "step": 5728 + }, + { + "epoch": 1.8394605875742496, + "grad_norm": 0.7429258227348328, + "learning_rate": 9.204969780923403e-05, + "loss": 0.6244, + "step": 5729 + }, + { + "epoch": 1.8397816663991011, + "grad_norm": 0.8412753939628601, + "learning_rate": 9.201490220169408e-05, + "loss": 0.7808, + "step": 5730 + }, + { + "epoch": 1.8401027452239525, + "grad_norm": 0.8708707094192505, + "learning_rate": 9.198010756711412e-05, + "loss": 0.6767, + "step": 5731 + }, + { + "epoch": 1.840423824048804, + "grad_norm": 0.7798470258712769, + "learning_rate": 9.194531390973371e-05, + "loss": 0.5248, + "step": 5732 + }, + { + "epoch": 1.8407449028736553, + "grad_norm": 0.8160192966461182, + "learning_rate": 9.191052123379234e-05, + "loss": 0.5404, + "step": 5733 + }, + { + "epoch": 1.8410659816985069, + "grad_norm": 1.1252083778381348, + "learning_rate": 9.187572954352947e-05, + "loss": 0.7795, + "step": 5734 + }, + { + "epoch": 1.8413870605233584, + "grad_norm": 0.8956120610237122, + "learning_rate": 9.184093884318425e-05, + "loss": 0.6233, + "step": 5735 + }, + { + "epoch": 1.84170813934821, + "grad_norm": 0.9571072459220886, + "learning_rate": 9.180614913699592e-05, + "loss": 0.6217, + "step": 5736 + }, + { + "epoch": 1.8420292181730615, + "grad_norm": 0.7272530794143677, + "learning_rate": 9.177136042920344e-05, + "loss": 0.4461, + "step": 5737 + }, + { + "epoch": 1.842350296997913, + "grad_norm": 1.0026044845581055, + "learning_rate": 9.173657272404576e-05, + "loss": 0.7131, + "step": 5738 + }, + { + "epoch": 1.8426713758227646, + "grad_norm": 0.8981397747993469, + "learning_rate": 9.17017860257616e-05, + "loss": 0.5629, + "step": 5739 + }, + { + "epoch": 1.842992454647616, + "grad_norm": 1.2423765659332275, + "learning_rate": 9.166700033858969e-05, + "loss": 0.7977, + "step": 5740 + }, + { + "epoch": 1.8433135334724675, + "grad_norm": 0.7401012778282166, + "learning_rate": 9.163221566676847e-05, + "loss": 0.4719, + "step": 5741 + }, + { + "epoch": 1.8436346122973188, + "grad_norm": 0.9083794951438904, + "learning_rate": 9.159743201453638e-05, + "loss": 0.5664, + "step": 5742 + }, + { + "epoch": 1.8439556911221704, + "grad_norm": 0.7999395728111267, + "learning_rate": 9.156264938613174e-05, + "loss": 0.5195, + "step": 5743 + }, + { + "epoch": 1.844276769947022, + "grad_norm": 0.8803184628486633, + "learning_rate": 9.152786778579267e-05, + "loss": 0.4888, + "step": 5744 + }, + { + "epoch": 1.8445978487718735, + "grad_norm": 0.923516571521759, + "learning_rate": 9.14930872177572e-05, + "loss": 0.4521, + "step": 5745 + }, + { + "epoch": 1.844918927596725, + "grad_norm": 0.7799448370933533, + "learning_rate": 9.145830768626327e-05, + "loss": 0.5848, + "step": 5746 + }, + { + "epoch": 1.8452400064215766, + "grad_norm": 0.9645082354545593, + "learning_rate": 9.142352919554862e-05, + "loss": 0.5881, + "step": 5747 + }, + { + "epoch": 1.8455610852464281, + "grad_norm": 0.8418881297111511, + "learning_rate": 9.138875174985091e-05, + "loss": 0.4551, + "step": 5748 + }, + { + "epoch": 1.8458821640712795, + "grad_norm": 0.8868610858917236, + "learning_rate": 9.135397535340773e-05, + "loss": 0.3873, + "step": 5749 + }, + { + "epoch": 1.846203242896131, + "grad_norm": 1.052437424659729, + "learning_rate": 9.131920001045638e-05, + "loss": 0.5356, + "step": 5750 + }, + { + "epoch": 1.8465243217209824, + "grad_norm": 0.9833017587661743, + "learning_rate": 9.128442572523417e-05, + "loss": 0.5061, + "step": 5751 + }, + { + "epoch": 1.846845400545834, + "grad_norm": 1.3260668516159058, + "learning_rate": 9.12496525019783e-05, + "loss": 0.5013, + "step": 5752 + }, + { + "epoch": 1.8471664793706855, + "grad_norm": 1.056301474571228, + "learning_rate": 9.121488034492569e-05, + "loss": 0.6949, + "step": 5753 + }, + { + "epoch": 1.847487558195537, + "grad_norm": 0.8964768648147583, + "learning_rate": 9.11801092583133e-05, + "loss": 0.5426, + "step": 5754 + }, + { + "epoch": 1.8478086370203886, + "grad_norm": 0.8216196894645691, + "learning_rate": 9.114533924637778e-05, + "loss": 0.4087, + "step": 5755 + }, + { + "epoch": 1.84812971584524, + "grad_norm": 0.7172057032585144, + "learning_rate": 9.111057031335585e-05, + "loss": 0.417, + "step": 5756 + }, + { + "epoch": 1.8484507946700917, + "grad_norm": 1.1012330055236816, + "learning_rate": 9.107580246348395e-05, + "loss": 0.3882, + "step": 5757 + }, + { + "epoch": 1.848771873494943, + "grad_norm": 0.856521725654602, + "learning_rate": 9.104103570099848e-05, + "loss": 0.4772, + "step": 5758 + }, + { + "epoch": 1.8490929523197945, + "grad_norm": 1.0229532718658447, + "learning_rate": 9.100627003013562e-05, + "loss": 0.4683, + "step": 5759 + }, + { + "epoch": 1.8494140311446459, + "grad_norm": 1.44857919216156, + "learning_rate": 9.097150545513145e-05, + "loss": 0.7295, + "step": 5760 + }, + { + "epoch": 1.8497351099694974, + "grad_norm": 0.8216368556022644, + "learning_rate": 9.093674198022201e-05, + "loss": 0.4156, + "step": 5761 + }, + { + "epoch": 1.850056188794349, + "grad_norm": 0.5136415362358093, + "learning_rate": 9.090197960964301e-05, + "loss": 0.3143, + "step": 5762 + }, + { + "epoch": 1.8503772676192005, + "grad_norm": 0.7286933660507202, + "learning_rate": 9.086721834763024e-05, + "loss": 0.3688, + "step": 5763 + }, + { + "epoch": 1.850698346444052, + "grad_norm": 0.9348329305648804, + "learning_rate": 9.083245819841918e-05, + "loss": 0.5614, + "step": 5764 + }, + { + "epoch": 1.8510194252689036, + "grad_norm": 1.0801033973693848, + "learning_rate": 9.07976991662453e-05, + "loss": 0.7294, + "step": 5765 + }, + { + "epoch": 1.8513405040937552, + "grad_norm": 0.467505544424057, + "learning_rate": 9.076294125534383e-05, + "loss": 0.9349, + "step": 5766 + }, + { + "epoch": 1.8516615829186065, + "grad_norm": 0.6000930070877075, + "learning_rate": 9.072818446994999e-05, + "loss": 0.5034, + "step": 5767 + }, + { + "epoch": 1.851982661743458, + "grad_norm": 0.7102051973342896, + "learning_rate": 9.069342881429876e-05, + "loss": 0.2786, + "step": 5768 + }, + { + "epoch": 1.8523037405683094, + "grad_norm": 0.6671398878097534, + "learning_rate": 9.065867429262496e-05, + "loss": 0.1777, + "step": 5769 + }, + { + "epoch": 1.852624819393161, + "grad_norm": 0.6379199624061584, + "learning_rate": 9.062392090916337e-05, + "loss": 0.3455, + "step": 5770 + }, + { + "epoch": 1.8529458982180125, + "grad_norm": 0.7193529009819031, + "learning_rate": 9.058916866814858e-05, + "loss": 0.2457, + "step": 5771 + }, + { + "epoch": 1.853266977042864, + "grad_norm": 1.0792187452316284, + "learning_rate": 9.055441757381506e-05, + "loss": 0.7677, + "step": 5772 + }, + { + "epoch": 1.8535880558677156, + "grad_norm": 0.8163480758666992, + "learning_rate": 9.051966763039707e-05, + "loss": 0.5803, + "step": 5773 + }, + { + "epoch": 1.8539091346925671, + "grad_norm": 0.7993916869163513, + "learning_rate": 9.048491884212884e-05, + "loss": 0.5471, + "step": 5774 + }, + { + "epoch": 1.8542302135174187, + "grad_norm": 0.9378834962844849, + "learning_rate": 9.045017121324438e-05, + "loss": 0.5756, + "step": 5775 + }, + { + "epoch": 1.85455129234227, + "grad_norm": 0.7326672077178955, + "learning_rate": 9.04154247479776e-05, + "loss": 0.3347, + "step": 5776 + }, + { + "epoch": 1.8548723711671216, + "grad_norm": 0.8225162625312805, + "learning_rate": 9.038067945056227e-05, + "loss": 0.6454, + "step": 5777 + }, + { + "epoch": 1.8551934499919729, + "grad_norm": 1.0159610509872437, + "learning_rate": 9.034593532523193e-05, + "loss": 0.5996, + "step": 5778 + }, + { + "epoch": 1.8555145288168244, + "grad_norm": 0.8151166439056396, + "learning_rate": 9.03111923762201e-05, + "loss": 0.6339, + "step": 5779 + }, + { + "epoch": 1.855835607641676, + "grad_norm": 0.956063449382782, + "learning_rate": 9.027645060776006e-05, + "loss": 0.5437, + "step": 5780 + }, + { + "epoch": 1.8561566864665275, + "grad_norm": 0.8743849396705627, + "learning_rate": 9.024171002408506e-05, + "loss": 0.5587, + "step": 5781 + }, + { + "epoch": 1.856477765291379, + "grad_norm": 0.8803306818008423, + "learning_rate": 9.020697062942807e-05, + "loss": 0.5749, + "step": 5782 + }, + { + "epoch": 1.8567988441162306, + "grad_norm": 1.1139497756958008, + "learning_rate": 9.017223242802204e-05, + "loss": 0.5244, + "step": 5783 + }, + { + "epoch": 1.8571199229410822, + "grad_norm": 0.8290317058563232, + "learning_rate": 9.013749542409963e-05, + "loss": 0.5412, + "step": 5784 + }, + { + "epoch": 1.8574410017659335, + "grad_norm": 0.9109243154525757, + "learning_rate": 9.01027596218935e-05, + "loss": 0.7158, + "step": 5785 + }, + { + "epoch": 1.857762080590785, + "grad_norm": 0.8047156929969788, + "learning_rate": 9.006802502563612e-05, + "loss": 0.4727, + "step": 5786 + }, + { + "epoch": 1.8580831594156364, + "grad_norm": 0.9587835073471069, + "learning_rate": 9.003329163955972e-05, + "loss": 0.6326, + "step": 5787 + }, + { + "epoch": 1.858404238240488, + "grad_norm": 0.8253732919692993, + "learning_rate": 8.999855946789653e-05, + "loss": 0.5527, + "step": 5788 + }, + { + "epoch": 1.8587253170653395, + "grad_norm": 1.1531040668487549, + "learning_rate": 8.99638285148785e-05, + "loss": 0.4517, + "step": 5789 + }, + { + "epoch": 1.859046395890191, + "grad_norm": 0.7936963438987732, + "learning_rate": 8.992909878473758e-05, + "loss": 0.492, + "step": 5790 + }, + { + "epoch": 1.8593674747150426, + "grad_norm": 1.0635939836502075, + "learning_rate": 8.989437028170537e-05, + "loss": 0.5887, + "step": 5791 + }, + { + "epoch": 1.8596885535398942, + "grad_norm": 4.376264572143555, + "learning_rate": 8.985964301001353e-05, + "loss": 0.5299, + "step": 5792 + }, + { + "epoch": 1.8600096323647457, + "grad_norm": 0.9289653897285461, + "learning_rate": 8.982491697389338e-05, + "loss": 0.6141, + "step": 5793 + }, + { + "epoch": 1.860330711189597, + "grad_norm": 1.19081449508667, + "learning_rate": 8.979019217757625e-05, + "loss": 0.9054, + "step": 5794 + }, + { + "epoch": 1.8606517900144486, + "grad_norm": 0.8353030681610107, + "learning_rate": 8.975546862529328e-05, + "loss": 0.5393, + "step": 5795 + }, + { + "epoch": 1.8609728688393, + "grad_norm": 0.9625294804573059, + "learning_rate": 8.972074632127533e-05, + "loss": 0.5229, + "step": 5796 + }, + { + "epoch": 1.8612939476641515, + "grad_norm": 0.9256024956703186, + "learning_rate": 8.96860252697533e-05, + "loss": 0.3256, + "step": 5797 + }, + { + "epoch": 1.861615026489003, + "grad_norm": 0.9383046627044678, + "learning_rate": 8.965130547495776e-05, + "loss": 0.7085, + "step": 5798 + }, + { + "epoch": 1.8619361053138546, + "grad_norm": 0.8655479550361633, + "learning_rate": 8.961658694111929e-05, + "loss": 0.5807, + "step": 5799 + }, + { + "epoch": 1.8622571841387061, + "grad_norm": 1.0678274631500244, + "learning_rate": 8.958186967246816e-05, + "loss": 0.6035, + "step": 5800 + }, + { + "epoch": 1.8625782629635577, + "grad_norm": 0.6425728797912598, + "learning_rate": 8.954715367323468e-05, + "loss": 0.3561, + "step": 5801 + }, + { + "epoch": 1.8628993417884092, + "grad_norm": 0.835169792175293, + "learning_rate": 8.951243894764876e-05, + "loss": 0.5272, + "step": 5802 + }, + { + "epoch": 1.8632204206132605, + "grad_norm": 0.9119851589202881, + "learning_rate": 8.947772549994035e-05, + "loss": 0.5669, + "step": 5803 + }, + { + "epoch": 1.863541499438112, + "grad_norm": 0.8746310472488403, + "learning_rate": 8.944301333433922e-05, + "loss": 0.5494, + "step": 5804 + }, + { + "epoch": 1.8638625782629634, + "grad_norm": 1.2345632314682007, + "learning_rate": 8.940830245507483e-05, + "loss": 0.751, + "step": 5805 + }, + { + "epoch": 1.864183657087815, + "grad_norm": 1.1940757036209106, + "learning_rate": 8.93735928663767e-05, + "loss": 0.6379, + "step": 5806 + }, + { + "epoch": 1.8645047359126665, + "grad_norm": 1.1962798833847046, + "learning_rate": 8.933888457247402e-05, + "loss": 0.5358, + "step": 5807 + }, + { + "epoch": 1.864825814737518, + "grad_norm": 1.165390968322754, + "learning_rate": 8.930417757759592e-05, + "loss": 0.5781, + "step": 5808 + }, + { + "epoch": 1.8651468935623696, + "grad_norm": 0.9763917922973633, + "learning_rate": 8.926947188597134e-05, + "loss": 0.4743, + "step": 5809 + }, + { + "epoch": 1.8654679723872212, + "grad_norm": 1.0017492771148682, + "learning_rate": 8.923476750182908e-05, + "loss": 0.6824, + "step": 5810 + }, + { + "epoch": 1.8657890512120727, + "grad_norm": 1.386108160018921, + "learning_rate": 8.920006442939772e-05, + "loss": 0.595, + "step": 5811 + }, + { + "epoch": 1.866110130036924, + "grad_norm": 0.5562759041786194, + "learning_rate": 8.916536267290578e-05, + "loss": 0.3123, + "step": 5812 + }, + { + "epoch": 1.8664312088617756, + "grad_norm": 0.5364285111427307, + "learning_rate": 8.913066223658151e-05, + "loss": 0.3485, + "step": 5813 + }, + { + "epoch": 1.866752287686627, + "grad_norm": 0.5442401170730591, + "learning_rate": 8.909596312465306e-05, + "loss": 0.3492, + "step": 5814 + }, + { + "epoch": 1.8670733665114785, + "grad_norm": 0.5919818878173828, + "learning_rate": 8.906126534134848e-05, + "loss": 0.8261, + "step": 5815 + }, + { + "epoch": 1.86739444533633, + "grad_norm": 0.5136668682098389, + "learning_rate": 8.902656889089548e-05, + "loss": 0.7693, + "step": 5816 + }, + { + "epoch": 1.8677155241611816, + "grad_norm": 0.7814684510231018, + "learning_rate": 8.89918737775218e-05, + "loss": 0.456, + "step": 5817 + }, + { + "epoch": 1.8680366029860331, + "grad_norm": 0.5206146240234375, + "learning_rate": 8.895718000545489e-05, + "loss": 0.2985, + "step": 5818 + }, + { + "epoch": 1.8683576818108847, + "grad_norm": 0.5971502065658569, + "learning_rate": 8.892248757892214e-05, + "loss": 0.2403, + "step": 5819 + }, + { + "epoch": 1.8686787606357362, + "grad_norm": 0.8546911478042603, + "learning_rate": 8.888779650215068e-05, + "loss": 0.2383, + "step": 5820 + }, + { + "epoch": 1.8689998394605876, + "grad_norm": 0.9832369089126587, + "learning_rate": 8.885310677936746e-05, + "loss": 0.7438, + "step": 5821 + }, + { + "epoch": 1.8693209182854391, + "grad_norm": 1.0020122528076172, + "learning_rate": 8.88184184147994e-05, + "loss": 0.8816, + "step": 5822 + }, + { + "epoch": 1.8696419971102904, + "grad_norm": 0.7644429206848145, + "learning_rate": 8.878373141267311e-05, + "loss": 0.6192, + "step": 5823 + }, + { + "epoch": 1.869963075935142, + "grad_norm": 0.6598174571990967, + "learning_rate": 8.874904577721518e-05, + "loss": 0.4108, + "step": 5824 + }, + { + "epoch": 1.8702841547599935, + "grad_norm": 0.7471329569816589, + "learning_rate": 8.871436151265184e-05, + "loss": 0.5137, + "step": 5825 + }, + { + "epoch": 1.870605233584845, + "grad_norm": 0.910190999507904, + "learning_rate": 8.867967862320934e-05, + "loss": 0.5835, + "step": 5826 + }, + { + "epoch": 1.8709263124096966, + "grad_norm": 0.7468095421791077, + "learning_rate": 8.864499711311362e-05, + "loss": 0.417, + "step": 5827 + }, + { + "epoch": 1.8712473912345482, + "grad_norm": 0.8930068016052246, + "learning_rate": 8.861031698659063e-05, + "loss": 0.6094, + "step": 5828 + }, + { + "epoch": 1.8715684700593997, + "grad_norm": 0.793305516242981, + "learning_rate": 8.857563824786596e-05, + "loss": 0.5365, + "step": 5829 + }, + { + "epoch": 1.871889548884251, + "grad_norm": 0.6843454837799072, + "learning_rate": 8.854096090116508e-05, + "loss": 0.4672, + "step": 5830 + }, + { + "epoch": 1.8722106277091026, + "grad_norm": 0.9117445349693298, + "learning_rate": 8.850628495071336e-05, + "loss": 0.5671, + "step": 5831 + }, + { + "epoch": 1.872531706533954, + "grad_norm": 0.6631526350975037, + "learning_rate": 8.847161040073594e-05, + "loss": 0.3177, + "step": 5832 + }, + { + "epoch": 1.8728527853588055, + "grad_norm": 0.716529130935669, + "learning_rate": 8.843693725545786e-05, + "loss": 0.4629, + "step": 5833 + }, + { + "epoch": 1.873173864183657, + "grad_norm": 1.5731027126312256, + "learning_rate": 8.840226551910387e-05, + "loss": 0.8211, + "step": 5834 + }, + { + "epoch": 1.8734949430085086, + "grad_norm": 1.7772483825683594, + "learning_rate": 8.836759519589867e-05, + "loss": 0.7299, + "step": 5835 + }, + { + "epoch": 1.8738160218333602, + "grad_norm": 1.1696412563323975, + "learning_rate": 8.833292629006668e-05, + "loss": 0.5431, + "step": 5836 + }, + { + "epoch": 1.8741371006582117, + "grad_norm": 1.2018239498138428, + "learning_rate": 8.829825880583226e-05, + "loss": 0.5802, + "step": 5837 + }, + { + "epoch": 1.8744581794830633, + "grad_norm": 1.0494779348373413, + "learning_rate": 8.826359274741953e-05, + "loss": 0.7327, + "step": 5838 + }, + { + "epoch": 1.8747792583079146, + "grad_norm": 1.106572151184082, + "learning_rate": 8.822892811905237e-05, + "loss": 0.6034, + "step": 5839 + }, + { + "epoch": 1.8751003371327661, + "grad_norm": 0.8521847724914551, + "learning_rate": 8.819426492495464e-05, + "loss": 0.504, + "step": 5840 + }, + { + "epoch": 1.8754214159576175, + "grad_norm": 0.9584136605262756, + "learning_rate": 8.81596031693499e-05, + "loss": 0.546, + "step": 5841 + }, + { + "epoch": 1.875742494782469, + "grad_norm": 1.2079963684082031, + "learning_rate": 8.812494285646163e-05, + "loss": 0.491, + "step": 5842 + }, + { + "epoch": 1.8760635736073206, + "grad_norm": 0.8575901389122009, + "learning_rate": 8.809028399051302e-05, + "loss": 0.6095, + "step": 5843 + }, + { + "epoch": 1.8763846524321721, + "grad_norm": 0.8896117806434631, + "learning_rate": 8.805562657572723e-05, + "loss": 0.5603, + "step": 5844 + }, + { + "epoch": 1.8767057312570237, + "grad_norm": 0.928081750869751, + "learning_rate": 8.802097061632705e-05, + "loss": 0.5653, + "step": 5845 + }, + { + "epoch": 1.8770268100818752, + "grad_norm": 1.1048555374145508, + "learning_rate": 8.79863161165353e-05, + "loss": 0.5764, + "step": 5846 + }, + { + "epoch": 1.8773478889067265, + "grad_norm": 0.8389851450920105, + "learning_rate": 8.79516630805745e-05, + "loss": 0.6391, + "step": 5847 + }, + { + "epoch": 1.877668967731578, + "grad_norm": 0.9550890922546387, + "learning_rate": 8.791701151266696e-05, + "loss": 0.6019, + "step": 5848 + }, + { + "epoch": 1.8779900465564296, + "grad_norm": 0.8240407109260559, + "learning_rate": 8.788236141703498e-05, + "loss": 0.4589, + "step": 5849 + }, + { + "epoch": 1.878311125381281, + "grad_norm": 0.9926547408103943, + "learning_rate": 8.784771279790044e-05, + "loss": 0.5949, + "step": 5850 + }, + { + "epoch": 1.8786322042061325, + "grad_norm": 1.015940546989441, + "learning_rate": 8.781306565948528e-05, + "loss": 0.611, + "step": 5851 + }, + { + "epoch": 1.878953283030984, + "grad_norm": 0.912274181842804, + "learning_rate": 8.777842000601105e-05, + "loss": 0.488, + "step": 5852 + }, + { + "epoch": 1.8792743618558356, + "grad_norm": 1.0201482772827148, + "learning_rate": 8.774377584169933e-05, + "loss": 0.6077, + "step": 5853 + }, + { + "epoch": 1.8795954406806872, + "grad_norm": 1.027444839477539, + "learning_rate": 8.77091331707713e-05, + "loss": 0.6085, + "step": 5854 + }, + { + "epoch": 1.8799165195055387, + "grad_norm": 0.821168065071106, + "learning_rate": 8.767449199744814e-05, + "loss": 0.5152, + "step": 5855 + }, + { + "epoch": 1.88023759833039, + "grad_norm": 1.338505506515503, + "learning_rate": 8.763985232595075e-05, + "loss": 0.5792, + "step": 5856 + }, + { + "epoch": 1.8805586771552416, + "grad_norm": 0.9771504402160645, + "learning_rate": 8.760521416049983e-05, + "loss": 0.6102, + "step": 5857 + }, + { + "epoch": 1.8808797559800932, + "grad_norm": 0.9123767018318176, + "learning_rate": 8.7570577505316e-05, + "loss": 0.5544, + "step": 5858 + }, + { + "epoch": 1.8812008348049445, + "grad_norm": 0.6781209707260132, + "learning_rate": 8.753594236461957e-05, + "loss": 0.4612, + "step": 5859 + }, + { + "epoch": 1.881521913629796, + "grad_norm": 0.8213598132133484, + "learning_rate": 8.750130874263077e-05, + "loss": 0.338, + "step": 5860 + }, + { + "epoch": 1.8818429924546476, + "grad_norm": 1.2444860935211182, + "learning_rate": 8.746667664356956e-05, + "loss": 0.6963, + "step": 5861 + }, + { + "epoch": 1.8821640712794991, + "grad_norm": 0.9669928550720215, + "learning_rate": 8.743204607165583e-05, + "loss": 0.3941, + "step": 5862 + }, + { + "epoch": 1.8824851501043507, + "grad_norm": 0.7179502248764038, + "learning_rate": 8.739741703110913e-05, + "loss": 0.3945, + "step": 5863 + }, + { + "epoch": 1.8828062289292022, + "grad_norm": 0.9342414736747742, + "learning_rate": 8.736278952614898e-05, + "loss": 0.4656, + "step": 5864 + }, + { + "epoch": 1.8831273077540536, + "grad_norm": 0.5712354779243469, + "learning_rate": 8.732816356099456e-05, + "loss": 0.8414, + "step": 5865 + }, + { + "epoch": 1.8834483865789051, + "grad_norm": 0.5650877952575684, + "learning_rate": 8.729353913986496e-05, + "loss": 0.77, + "step": 5866 + }, + { + "epoch": 1.8837694654037567, + "grad_norm": 0.6072726845741272, + "learning_rate": 8.72589162669791e-05, + "loss": 0.3555, + "step": 5867 + }, + { + "epoch": 1.884090544228608, + "grad_norm": 0.5450513362884521, + "learning_rate": 8.722429494655561e-05, + "loss": 0.2534, + "step": 5868 + }, + { + "epoch": 1.8844116230534596, + "grad_norm": 0.7012900114059448, + "learning_rate": 8.718967518281307e-05, + "loss": 0.4425, + "step": 5869 + }, + { + "epoch": 1.884732701878311, + "grad_norm": 0.7500693798065186, + "learning_rate": 8.715505697996971e-05, + "loss": 0.221, + "step": 5870 + }, + { + "epoch": 1.8850537807031627, + "grad_norm": 0.9719879627227783, + "learning_rate": 8.712044034224374e-05, + "loss": 0.8356, + "step": 5871 + }, + { + "epoch": 1.8853748595280142, + "grad_norm": 0.8122969269752502, + "learning_rate": 8.708582527385301e-05, + "loss": 0.623, + "step": 5872 + }, + { + "epoch": 1.8856959383528658, + "grad_norm": 1.0290989875793457, + "learning_rate": 8.705121177901532e-05, + "loss": 0.7414, + "step": 5873 + }, + { + "epoch": 1.886017017177717, + "grad_norm": 1.0142052173614502, + "learning_rate": 8.70165998619482e-05, + "loss": 0.6093, + "step": 5874 + }, + { + "epoch": 1.8863380960025686, + "grad_norm": 0.9030125141143799, + "learning_rate": 8.698198952686896e-05, + "loss": 0.6493, + "step": 5875 + }, + { + "epoch": 1.8866591748274202, + "grad_norm": 0.8314472436904907, + "learning_rate": 8.694738077799488e-05, + "loss": 0.6468, + "step": 5876 + }, + { + "epoch": 1.8869802536522715, + "grad_norm": 0.7704334855079651, + "learning_rate": 8.691277361954279e-05, + "loss": 0.4876, + "step": 5877 + }, + { + "epoch": 1.887301332477123, + "grad_norm": 0.8338825702667236, + "learning_rate": 8.687816805572956e-05, + "loss": 0.6363, + "step": 5878 + }, + { + "epoch": 1.8876224113019746, + "grad_norm": 0.8600116968154907, + "learning_rate": 8.684356409077176e-05, + "loss": 0.5554, + "step": 5879 + }, + { + "epoch": 1.8879434901268262, + "grad_norm": 0.8333651423454285, + "learning_rate": 8.680896172888576e-05, + "loss": 0.6546, + "step": 5880 + }, + { + "epoch": 1.8882645689516777, + "grad_norm": 0.9315165877342224, + "learning_rate": 8.677436097428775e-05, + "loss": 0.5263, + "step": 5881 + }, + { + "epoch": 1.8885856477765293, + "grad_norm": 0.9922060370445251, + "learning_rate": 8.673976183119376e-05, + "loss": 0.6409, + "step": 5882 + }, + { + "epoch": 1.8889067266013806, + "grad_norm": 0.9471839070320129, + "learning_rate": 8.670516430381958e-05, + "loss": 0.7208, + "step": 5883 + }, + { + "epoch": 1.8892278054262321, + "grad_norm": 1.207506537437439, + "learning_rate": 8.667056839638075e-05, + "loss": 0.8187, + "step": 5884 + }, + { + "epoch": 1.8895488842510837, + "grad_norm": 1.2479336261749268, + "learning_rate": 8.663597411309279e-05, + "loss": 0.4916, + "step": 5885 + }, + { + "epoch": 1.889869963075935, + "grad_norm": 0.7998619675636292, + "learning_rate": 8.660138145817079e-05, + "loss": 0.5774, + "step": 5886 + }, + { + "epoch": 1.8901910419007866, + "grad_norm": 0.6762514710426331, + "learning_rate": 8.656679043582986e-05, + "loss": 0.4435, + "step": 5887 + }, + { + "epoch": 1.8905121207256381, + "grad_norm": 2.044832944869995, + "learning_rate": 8.653220105028474e-05, + "loss": 0.6974, + "step": 5888 + }, + { + "epoch": 1.8908331995504897, + "grad_norm": 0.8140920400619507, + "learning_rate": 8.64976133057501e-05, + "loss": 0.6755, + "step": 5889 + }, + { + "epoch": 1.8911542783753412, + "grad_norm": 0.837049663066864, + "learning_rate": 8.646302720644027e-05, + "loss": 0.4787, + "step": 5890 + }, + { + "epoch": 1.8914753572001928, + "grad_norm": 0.9118859767913818, + "learning_rate": 8.642844275656957e-05, + "loss": 0.6674, + "step": 5891 + }, + { + "epoch": 1.891796436025044, + "grad_norm": 0.8164528012275696, + "learning_rate": 8.639385996035194e-05, + "loss": 0.5175, + "step": 5892 + }, + { + "epoch": 1.8921175148498957, + "grad_norm": 1.1621068716049194, + "learning_rate": 8.635927882200116e-05, + "loss": 0.5832, + "step": 5893 + }, + { + "epoch": 1.8924385936747472, + "grad_norm": 0.7055744528770447, + "learning_rate": 8.632469934573094e-05, + "loss": 0.5454, + "step": 5894 + }, + { + "epoch": 1.8927596724995985, + "grad_norm": 1.0053250789642334, + "learning_rate": 8.629012153575458e-05, + "loss": 0.7157, + "step": 5895 + }, + { + "epoch": 1.89308075132445, + "grad_norm": 0.8789286613464355, + "learning_rate": 8.625554539628535e-05, + "loss": 0.5651, + "step": 5896 + }, + { + "epoch": 1.8934018301493016, + "grad_norm": 0.8847569227218628, + "learning_rate": 8.62209709315362e-05, + "loss": 0.648, + "step": 5897 + }, + { + "epoch": 1.8937229089741532, + "grad_norm": 0.8232566714286804, + "learning_rate": 8.618639814571996e-05, + "loss": 0.473, + "step": 5898 + }, + { + "epoch": 1.8940439877990047, + "grad_norm": 0.8590362668037415, + "learning_rate": 8.615182704304918e-05, + "loss": 0.6142, + "step": 5899 + }, + { + "epoch": 1.8943650666238563, + "grad_norm": 1.0845537185668945, + "learning_rate": 8.611725762773631e-05, + "loss": 0.5017, + "step": 5900 + }, + { + "epoch": 1.8946861454487076, + "grad_norm": 0.7190443277359009, + "learning_rate": 8.608268990399349e-05, + "loss": 0.4462, + "step": 5901 + }, + { + "epoch": 1.8950072242735592, + "grad_norm": 0.7313873171806335, + "learning_rate": 8.604812387603265e-05, + "loss": 0.4465, + "step": 5902 + }, + { + "epoch": 1.8953283030984105, + "grad_norm": 0.816182553768158, + "learning_rate": 8.601355954806561e-05, + "loss": 0.4299, + "step": 5903 + }, + { + "epoch": 1.895649381923262, + "grad_norm": 1.0076704025268555, + "learning_rate": 8.597899692430389e-05, + "loss": 0.5754, + "step": 5904 + }, + { + "epoch": 1.8959704607481136, + "grad_norm": 0.9564604759216309, + "learning_rate": 8.594443600895892e-05, + "loss": 0.4443, + "step": 5905 + }, + { + "epoch": 1.8962915395729651, + "grad_norm": 0.6225574612617493, + "learning_rate": 8.590987680624174e-05, + "loss": 0.3169, + "step": 5906 + }, + { + "epoch": 1.8966126183978167, + "grad_norm": 1.0120649337768555, + "learning_rate": 8.587531932036335e-05, + "loss": 0.6126, + "step": 5907 + }, + { + "epoch": 1.8969336972226682, + "grad_norm": 1.029392957687378, + "learning_rate": 8.584076355553444e-05, + "loss": 0.5688, + "step": 5908 + }, + { + "epoch": 1.8972547760475198, + "grad_norm": 1.1245523691177368, + "learning_rate": 8.580620951596557e-05, + "loss": 0.5589, + "step": 5909 + }, + { + "epoch": 1.8975758548723711, + "grad_norm": 0.8369296789169312, + "learning_rate": 8.577165720586703e-05, + "loss": 0.3727, + "step": 5910 + }, + { + "epoch": 1.8978969336972227, + "grad_norm": 0.8176161646842957, + "learning_rate": 8.573710662944885e-05, + "loss": 0.3624, + "step": 5911 + }, + { + "epoch": 1.898218012522074, + "grad_norm": 0.9345711469650269, + "learning_rate": 8.570255779092098e-05, + "loss": 0.4462, + "step": 5912 + }, + { + "epoch": 1.8985390913469256, + "grad_norm": 1.294392466545105, + "learning_rate": 8.566801069449305e-05, + "loss": 0.3714, + "step": 5913 + }, + { + "epoch": 1.898860170171777, + "grad_norm": 0.5765014886856079, + "learning_rate": 8.56334653443746e-05, + "loss": 0.3126, + "step": 5914 + }, + { + "epoch": 1.8991812489966287, + "grad_norm": 0.6456901431083679, + "learning_rate": 8.559892174477479e-05, + "loss": 0.3416, + "step": 5915 + }, + { + "epoch": 1.8995023278214802, + "grad_norm": 0.4262344539165497, + "learning_rate": 8.55643798999027e-05, + "loss": 0.4155, + "step": 5916 + }, + { + "epoch": 1.8998234066463318, + "grad_norm": 0.4280303120613098, + "learning_rate": 8.55298398139671e-05, + "loss": 0.4822, + "step": 5917 + }, + { + "epoch": 1.9001444854711833, + "grad_norm": 0.6191977858543396, + "learning_rate": 8.549530149117664e-05, + "loss": 0.5129, + "step": 5918 + }, + { + "epoch": 1.9004655642960346, + "grad_norm": 0.5976957678794861, + "learning_rate": 8.546076493573972e-05, + "loss": 0.2694, + "step": 5919 + }, + { + "epoch": 1.9007866431208862, + "grad_norm": 0.6138757467269897, + "learning_rate": 8.542623015186445e-05, + "loss": 0.1946, + "step": 5920 + }, + { + "epoch": 1.9011077219457375, + "grad_norm": 0.624046802520752, + "learning_rate": 8.539169714375885e-05, + "loss": 0.131, + "step": 5921 + }, + { + "epoch": 1.901428800770589, + "grad_norm": 0.7090097069740295, + "learning_rate": 8.535716591563062e-05, + "loss": 0.3913, + "step": 5922 + }, + { + "epoch": 1.9017498795954406, + "grad_norm": 1.0457631349563599, + "learning_rate": 8.532263647168735e-05, + "loss": 0.6625, + "step": 5923 + }, + { + "epoch": 1.9020709584202922, + "grad_norm": 0.8407219648361206, + "learning_rate": 8.528810881613626e-05, + "loss": 0.6366, + "step": 5924 + }, + { + "epoch": 1.9023920372451437, + "grad_norm": 0.9645373225212097, + "learning_rate": 8.525358295318454e-05, + "loss": 0.524, + "step": 5925 + }, + { + "epoch": 1.9027131160699953, + "grad_norm": 0.8919804096221924, + "learning_rate": 8.521905888703893e-05, + "loss": 0.5851, + "step": 5926 + }, + { + "epoch": 1.9030341948948468, + "grad_norm": 0.740583598613739, + "learning_rate": 8.51845366219062e-05, + "loss": 0.5141, + "step": 5927 + }, + { + "epoch": 1.9033552737196981, + "grad_norm": 0.7482326030731201, + "learning_rate": 8.515001616199279e-05, + "loss": 0.4804, + "step": 5928 + }, + { + "epoch": 1.9036763525445497, + "grad_norm": 0.9648141860961914, + "learning_rate": 8.511549751150479e-05, + "loss": 0.6938, + "step": 5929 + }, + { + "epoch": 1.903997431369401, + "grad_norm": 0.8778332471847534, + "learning_rate": 8.508098067464832e-05, + "loss": 0.6264, + "step": 5930 + }, + { + "epoch": 1.9043185101942526, + "grad_norm": 0.9466199278831482, + "learning_rate": 8.504646565562906e-05, + "loss": 0.6328, + "step": 5931 + }, + { + "epoch": 1.9046395890191041, + "grad_norm": 1.0974069833755493, + "learning_rate": 8.501195245865263e-05, + "loss": 0.8376, + "step": 5932 + }, + { + "epoch": 1.9049606678439557, + "grad_norm": 1.1989012956619263, + "learning_rate": 8.497744108792429e-05, + "loss": 0.7932, + "step": 5933 + }, + { + "epoch": 1.9052817466688072, + "grad_norm": 1.0475575923919678, + "learning_rate": 8.494293154764924e-05, + "loss": 0.6127, + "step": 5934 + }, + { + "epoch": 1.9056028254936588, + "grad_norm": 0.8935070633888245, + "learning_rate": 8.490842384203225e-05, + "loss": 0.4574, + "step": 5935 + }, + { + "epoch": 1.9059239043185103, + "grad_norm": 0.9734428524971008, + "learning_rate": 8.487391797527808e-05, + "loss": 0.6899, + "step": 5936 + }, + { + "epoch": 1.9062449831433617, + "grad_norm": 1.1841446161270142, + "learning_rate": 8.483941395159114e-05, + "loss": 0.5652, + "step": 5937 + }, + { + "epoch": 1.9065660619682132, + "grad_norm": 0.7886440753936768, + "learning_rate": 8.480491177517557e-05, + "loss": 0.5065, + "step": 5938 + }, + { + "epoch": 1.9068871407930645, + "grad_norm": 1.349183440208435, + "learning_rate": 8.477041145023546e-05, + "loss": 0.6979, + "step": 5939 + }, + { + "epoch": 1.907208219617916, + "grad_norm": 0.7376853227615356, + "learning_rate": 8.473591298097448e-05, + "loss": 0.485, + "step": 5940 + }, + { + "epoch": 1.9075292984427676, + "grad_norm": 0.7252330183982849, + "learning_rate": 8.47014163715962e-05, + "loss": 0.4728, + "step": 5941 + }, + { + "epoch": 1.9078503772676192, + "grad_norm": 1.054677128791809, + "learning_rate": 8.466692162630392e-05, + "loss": 0.7022, + "step": 5942 + }, + { + "epoch": 1.9081714560924707, + "grad_norm": 0.9820946455001831, + "learning_rate": 8.463242874930079e-05, + "loss": 0.4489, + "step": 5943 + }, + { + "epoch": 1.9084925349173223, + "grad_norm": 0.9880334734916687, + "learning_rate": 8.459793774478957e-05, + "loss": 0.5668, + "step": 5944 + }, + { + "epoch": 1.9088136137421738, + "grad_norm": 0.8617029190063477, + "learning_rate": 8.456344861697289e-05, + "loss": 0.5731, + "step": 5945 + }, + { + "epoch": 1.9091346925670252, + "grad_norm": 0.9127218127250671, + "learning_rate": 8.452896137005321e-05, + "loss": 0.5747, + "step": 5946 + }, + { + "epoch": 1.9094557713918767, + "grad_norm": 0.8752137422561646, + "learning_rate": 8.449447600823262e-05, + "loss": 0.533, + "step": 5947 + }, + { + "epoch": 1.909776850216728, + "grad_norm": 0.9676142334938049, + "learning_rate": 8.445999253571315e-05, + "loss": 0.6118, + "step": 5948 + }, + { + "epoch": 1.9100979290415796, + "grad_norm": 1.485176920890808, + "learning_rate": 8.442551095669639e-05, + "loss": 0.491, + "step": 5949 + }, + { + "epoch": 1.9104190078664312, + "grad_norm": 0.8751528263092041, + "learning_rate": 8.439103127538392e-05, + "loss": 0.5723, + "step": 5950 + }, + { + "epoch": 1.9107400866912827, + "grad_norm": 0.8707427382469177, + "learning_rate": 8.435655349597689e-05, + "loss": 0.5169, + "step": 5951 + }, + { + "epoch": 1.9110611655161343, + "grad_norm": 1.3300323486328125, + "learning_rate": 8.432207762267644e-05, + "loss": 0.7869, + "step": 5952 + }, + { + "epoch": 1.9113822443409858, + "grad_norm": 0.8493139147758484, + "learning_rate": 8.428760365968326e-05, + "loss": 0.4855, + "step": 5953 + }, + { + "epoch": 1.9117033231658374, + "grad_norm": 0.7276837825775146, + "learning_rate": 8.425313161119787e-05, + "loss": 0.449, + "step": 5954 + }, + { + "epoch": 1.9120244019906887, + "grad_norm": 0.7487722635269165, + "learning_rate": 8.421866148142066e-05, + "loss": 0.5161, + "step": 5955 + }, + { + "epoch": 1.9123454808155402, + "grad_norm": 0.7644703984260559, + "learning_rate": 8.418419327455164e-05, + "loss": 0.4887, + "step": 5956 + }, + { + "epoch": 1.9126665596403916, + "grad_norm": 1.1047660112380981, + "learning_rate": 8.414972699479075e-05, + "loss": 0.6635, + "step": 5957 + }, + { + "epoch": 1.9129876384652431, + "grad_norm": 1.2923800945281982, + "learning_rate": 8.41152626463375e-05, + "loss": 0.8645, + "step": 5958 + }, + { + "epoch": 1.9133087172900947, + "grad_norm": 0.7669944167137146, + "learning_rate": 8.408080023339133e-05, + "loss": 0.5723, + "step": 5959 + }, + { + "epoch": 1.9136297961149462, + "grad_norm": 0.7324369549751282, + "learning_rate": 8.404633976015134e-05, + "loss": 0.4862, + "step": 5960 + }, + { + "epoch": 1.9139508749397978, + "grad_norm": 0.8924105763435364, + "learning_rate": 8.401188123081653e-05, + "loss": 0.4537, + "step": 5961 + }, + { + "epoch": 1.9142719537646493, + "grad_norm": 0.8175060153007507, + "learning_rate": 8.397742464958547e-05, + "loss": 0.5257, + "step": 5962 + }, + { + "epoch": 1.9145930325895009, + "grad_norm": 0.9657145738601685, + "learning_rate": 8.394297002065658e-05, + "loss": 0.4518, + "step": 5963 + }, + { + "epoch": 1.9149141114143522, + "grad_norm": 0.5297245383262634, + "learning_rate": 8.390851734822808e-05, + "loss": 0.366, + "step": 5964 + }, + { + "epoch": 1.9152351902392037, + "grad_norm": 0.5978273153305054, + "learning_rate": 8.387406663649795e-05, + "loss": 0.5728, + "step": 5965 + }, + { + "epoch": 1.915556269064055, + "grad_norm": 0.3594565987586975, + "learning_rate": 8.383961788966391e-05, + "loss": 0.4309, + "step": 5966 + }, + { + "epoch": 1.9158773478889066, + "grad_norm": 0.5129295587539673, + "learning_rate": 8.380517111192337e-05, + "loss": 0.7305, + "step": 5967 + }, + { + "epoch": 1.9161984267137582, + "grad_norm": 0.6634910702705383, + "learning_rate": 8.377072630747364e-05, + "loss": 0.5793, + "step": 5968 + }, + { + "epoch": 1.9165195055386097, + "grad_norm": 0.5111119747161865, + "learning_rate": 8.373628348051165e-05, + "loss": 0.2366, + "step": 5969 + }, + { + "epoch": 1.9168405843634613, + "grad_norm": 0.6472846269607544, + "learning_rate": 8.37018426352342e-05, + "loss": 0.1341, + "step": 5970 + }, + { + "epoch": 1.9171616631883128, + "grad_norm": 0.5201217532157898, + "learning_rate": 8.366740377583781e-05, + "loss": 0.2317, + "step": 5971 + }, + { + "epoch": 1.9174827420131644, + "grad_norm": 0.4701801836490631, + "learning_rate": 8.363296690651868e-05, + "loss": 0.3096, + "step": 5972 + }, + { + "epoch": 1.9178038208380157, + "grad_norm": 0.9514907598495483, + "learning_rate": 8.359853203147291e-05, + "loss": 0.7662, + "step": 5973 + }, + { + "epoch": 1.9181248996628673, + "grad_norm": 0.7471129298210144, + "learning_rate": 8.356409915489625e-05, + "loss": 0.4633, + "step": 5974 + }, + { + "epoch": 1.9184459784877186, + "grad_norm": 0.7893948554992676, + "learning_rate": 8.352966828098428e-05, + "loss": 0.5721, + "step": 5975 + }, + { + "epoch": 1.9187670573125701, + "grad_norm": 0.6973623633384705, + "learning_rate": 8.349523941393224e-05, + "loss": 0.5921, + "step": 5976 + }, + { + "epoch": 1.9190881361374217, + "grad_norm": 0.6330721378326416, + "learning_rate": 8.346081255793525e-05, + "loss": 0.4046, + "step": 5977 + }, + { + "epoch": 1.9194092149622732, + "grad_norm": 0.844613254070282, + "learning_rate": 8.342638771718802e-05, + "loss": 0.6565, + "step": 5978 + }, + { + "epoch": 1.9197302937871248, + "grad_norm": 0.8681631088256836, + "learning_rate": 8.339196489588523e-05, + "loss": 0.5315, + "step": 5979 + }, + { + "epoch": 1.9200513726119763, + "grad_norm": 0.8076524138450623, + "learning_rate": 8.335754409822114e-05, + "loss": 0.621, + "step": 5980 + }, + { + "epoch": 1.9203724514368279, + "grad_norm": 0.8629617094993591, + "learning_rate": 8.332312532838978e-05, + "loss": 0.5126, + "step": 5981 + }, + { + "epoch": 1.9206935302616792, + "grad_norm": 0.8805716633796692, + "learning_rate": 8.328870859058506e-05, + "loss": 0.6354, + "step": 5982 + }, + { + "epoch": 1.9210146090865308, + "grad_norm": 0.9122462272644043, + "learning_rate": 8.325429388900046e-05, + "loss": 0.6502, + "step": 5983 + }, + { + "epoch": 1.921335687911382, + "grad_norm": 0.7924925684928894, + "learning_rate": 8.321988122782937e-05, + "loss": 0.6105, + "step": 5984 + }, + { + "epoch": 1.9216567667362336, + "grad_norm": 0.9744467735290527, + "learning_rate": 8.318547061126485e-05, + "loss": 0.6674, + "step": 5985 + }, + { + "epoch": 1.9219778455610852, + "grad_norm": 0.8236846327781677, + "learning_rate": 8.315106204349976e-05, + "loss": 0.5076, + "step": 5986 + }, + { + "epoch": 1.9222989243859367, + "grad_norm": 0.8699111342430115, + "learning_rate": 8.311665552872662e-05, + "loss": 0.6371, + "step": 5987 + }, + { + "epoch": 1.9226200032107883, + "grad_norm": 0.9178052544593811, + "learning_rate": 8.30822510711378e-05, + "loss": 0.7132, + "step": 5988 + }, + { + "epoch": 1.9229410820356398, + "grad_norm": 1.2815251350402832, + "learning_rate": 8.30478486749254e-05, + "loss": 0.6409, + "step": 5989 + }, + { + "epoch": 1.9232621608604914, + "grad_norm": 0.7497451901435852, + "learning_rate": 8.301344834428116e-05, + "loss": 0.4567, + "step": 5990 + }, + { + "epoch": 1.9235832396853427, + "grad_norm": 0.8337712287902832, + "learning_rate": 8.297905008339677e-05, + "loss": 0.603, + "step": 5991 + }, + { + "epoch": 1.9239043185101943, + "grad_norm": 1.0814270973205566, + "learning_rate": 8.294465389646345e-05, + "loss": 0.7705, + "step": 5992 + }, + { + "epoch": 1.9242253973350456, + "grad_norm": 0.9741120934486389, + "learning_rate": 8.291025978767235e-05, + "loss": 0.7132, + "step": 5993 + }, + { + "epoch": 1.9245464761598972, + "grad_norm": 0.6710167527198792, + "learning_rate": 8.287586776121423e-05, + "loss": 0.416, + "step": 5994 + }, + { + "epoch": 1.9248675549847487, + "grad_norm": 0.6828485131263733, + "learning_rate": 8.284147782127971e-05, + "loss": 0.4111, + "step": 5995 + }, + { + "epoch": 1.9251886338096003, + "grad_norm": 1.174616813659668, + "learning_rate": 8.280708997205904e-05, + "loss": 0.6958, + "step": 5996 + }, + { + "epoch": 1.9255097126344518, + "grad_norm": 1.070745825767517, + "learning_rate": 8.277270421774235e-05, + "loss": 0.7379, + "step": 5997 + }, + { + "epoch": 1.9258307914593034, + "grad_norm": 0.9961045384407043, + "learning_rate": 8.273832056251937e-05, + "loss": 0.467, + "step": 5998 + }, + { + "epoch": 1.926151870284155, + "grad_norm": 0.8946678638458252, + "learning_rate": 8.270393901057964e-05, + "loss": 0.5288, + "step": 5999 + }, + { + "epoch": 1.9264729491090062, + "grad_norm": 0.7546229362487793, + "learning_rate": 8.266955956611253e-05, + "loss": 0.4404, + "step": 6000 + }, + { + "epoch": 1.9267940279338578, + "grad_norm": 0.8234146237373352, + "learning_rate": 8.263518223330697e-05, + "loss": 0.5327, + "step": 6001 + }, + { + "epoch": 1.9271151067587091, + "grad_norm": 0.760515570640564, + "learning_rate": 8.26008070163518e-05, + "loss": 0.4604, + "step": 6002 + }, + { + "epoch": 1.9274361855835607, + "grad_norm": 1.1221221685409546, + "learning_rate": 8.256643391943551e-05, + "loss": 0.5529, + "step": 6003 + }, + { + "epoch": 1.9277572644084122, + "grad_norm": 0.8846840858459473, + "learning_rate": 8.25320629467464e-05, + "loss": 0.4192, + "step": 6004 + }, + { + "epoch": 1.9280783432332638, + "grad_norm": 0.8447346091270447, + "learning_rate": 8.249769410247239e-05, + "loss": 0.4966, + "step": 6005 + }, + { + "epoch": 1.9283994220581153, + "grad_norm": 1.1361325979232788, + "learning_rate": 8.24633273908013e-05, + "loss": 0.555, + "step": 6006 + }, + { + "epoch": 1.9287205008829669, + "grad_norm": 1.2147115468978882, + "learning_rate": 8.242896281592057e-05, + "loss": 0.699, + "step": 6007 + }, + { + "epoch": 1.9290415797078184, + "grad_norm": 0.7966561317443848, + "learning_rate": 8.239460038201739e-05, + "loss": 0.4552, + "step": 6008 + }, + { + "epoch": 1.9293626585326697, + "grad_norm": 0.9599836468696594, + "learning_rate": 8.236024009327879e-05, + "loss": 0.6289, + "step": 6009 + }, + { + "epoch": 1.9296837373575213, + "grad_norm": 1.475778341293335, + "learning_rate": 8.23258819538914e-05, + "loss": 0.4814, + "step": 6010 + }, + { + "epoch": 1.9300048161823726, + "grad_norm": 1.2509891986846924, + "learning_rate": 8.229152596804168e-05, + "loss": 0.4725, + "step": 6011 + }, + { + "epoch": 1.9303258950072242, + "grad_norm": 0.7716514468193054, + "learning_rate": 8.225717213991579e-05, + "loss": 0.4775, + "step": 6012 + }, + { + "epoch": 1.9306469738320757, + "grad_norm": 0.9235790967941284, + "learning_rate": 8.222282047369971e-05, + "loss": 0.6091, + "step": 6013 + }, + { + "epoch": 1.9309680526569273, + "grad_norm": 0.5926783084869385, + "learning_rate": 8.218847097357898e-05, + "loss": 0.3537, + "step": 6014 + }, + { + "epoch": 1.9312891314817788, + "grad_norm": 0.5469971299171448, + "learning_rate": 8.215412364373907e-05, + "loss": 0.5765, + "step": 6015 + }, + { + "epoch": 1.9316102103066304, + "grad_norm": 0.5453217029571533, + "learning_rate": 8.211977848836506e-05, + "loss": 0.8021, + "step": 6016 + }, + { + "epoch": 1.931931289131482, + "grad_norm": 0.5584077835083008, + "learning_rate": 8.208543551164178e-05, + "loss": 0.2779, + "step": 6017 + }, + { + "epoch": 1.9322523679563333, + "grad_norm": 1.3637664318084717, + "learning_rate": 8.205109471775387e-05, + "loss": 0.5096, + "step": 6018 + }, + { + "epoch": 1.9325734467811848, + "grad_norm": 0.6770474910736084, + "learning_rate": 8.201675611088558e-05, + "loss": 0.3166, + "step": 6019 + }, + { + "epoch": 1.9328945256060361, + "grad_norm": 0.7160546183586121, + "learning_rate": 8.198241969522107e-05, + "loss": 0.1541, + "step": 6020 + }, + { + "epoch": 1.9332156044308877, + "grad_norm": 0.6660668253898621, + "learning_rate": 8.194808547494401e-05, + "loss": 0.5054, + "step": 6021 + }, + { + "epoch": 1.9335366832557392, + "grad_norm": 0.7240803241729736, + "learning_rate": 8.191375345423799e-05, + "loss": 0.5679, + "step": 6022 + }, + { + "epoch": 1.9338577620805908, + "grad_norm": 0.9651069045066833, + "learning_rate": 8.187942363728625e-05, + "loss": 0.5552, + "step": 6023 + }, + { + "epoch": 1.9341788409054423, + "grad_norm": 0.6748064160346985, + "learning_rate": 8.184509602827181e-05, + "loss": 0.5365, + "step": 6024 + }, + { + "epoch": 1.934499919730294, + "grad_norm": 0.7690969705581665, + "learning_rate": 8.181077063137733e-05, + "loss": 0.5248, + "step": 6025 + }, + { + "epoch": 1.9348209985551454, + "grad_norm": 0.7835337519645691, + "learning_rate": 8.177644745078526e-05, + "loss": 0.5647, + "step": 6026 + }, + { + "epoch": 1.9351420773799968, + "grad_norm": 0.7683063745498657, + "learning_rate": 8.174212649067781e-05, + "loss": 0.485, + "step": 6027 + }, + { + "epoch": 1.9354631562048483, + "grad_norm": 0.9228124618530273, + "learning_rate": 8.170780775523684e-05, + "loss": 0.6106, + "step": 6028 + }, + { + "epoch": 1.9357842350296997, + "grad_norm": 0.8772586584091187, + "learning_rate": 8.167349124864405e-05, + "loss": 0.5138, + "step": 6029 + }, + { + "epoch": 1.9361053138545512, + "grad_norm": 0.8027740716934204, + "learning_rate": 8.163917697508072e-05, + "loss": 0.583, + "step": 6030 + }, + { + "epoch": 1.9364263926794028, + "grad_norm": 0.9190700054168701, + "learning_rate": 8.160486493872798e-05, + "loss": 0.7091, + "step": 6031 + }, + { + "epoch": 1.9367474715042543, + "grad_norm": 0.9426003098487854, + "learning_rate": 8.157055514376666e-05, + "loss": 0.6263, + "step": 6032 + }, + { + "epoch": 1.9370685503291059, + "grad_norm": 0.918766975402832, + "learning_rate": 8.153624759437732e-05, + "loss": 0.669, + "step": 6033 + }, + { + "epoch": 1.9373896291539574, + "grad_norm": 0.9351176023483276, + "learning_rate": 8.15019422947402e-05, + "loss": 0.6859, + "step": 6034 + }, + { + "epoch": 1.937710707978809, + "grad_norm": 0.8434003591537476, + "learning_rate": 8.146763924903527e-05, + "loss": 0.5469, + "step": 6035 + }, + { + "epoch": 1.9380317868036603, + "grad_norm": 0.8188439607620239, + "learning_rate": 8.14333384614423e-05, + "loss": 0.5615, + "step": 6036 + }, + { + "epoch": 1.9383528656285118, + "grad_norm": 0.782618522644043, + "learning_rate": 8.139903993614068e-05, + "loss": 0.5117, + "step": 6037 + }, + { + "epoch": 1.9386739444533632, + "grad_norm": 0.882308840751648, + "learning_rate": 8.136474367730969e-05, + "loss": 0.5433, + "step": 6038 + }, + { + "epoch": 1.9389950232782147, + "grad_norm": 1.1491224765777588, + "learning_rate": 8.133044968912811e-05, + "loss": 0.5036, + "step": 6039 + }, + { + "epoch": 1.9393161021030663, + "grad_norm": 1.1491897106170654, + "learning_rate": 8.129615797577461e-05, + "loss": 0.63, + "step": 6040 + }, + { + "epoch": 1.9396371809279178, + "grad_norm": 0.8906580805778503, + "learning_rate": 8.126186854142752e-05, + "loss": 0.4457, + "step": 6041 + }, + { + "epoch": 1.9399582597527694, + "grad_norm": 0.7406003475189209, + "learning_rate": 8.122758139026495e-05, + "loss": 0.4659, + "step": 6042 + }, + { + "epoch": 1.940279338577621, + "grad_norm": 0.8870121240615845, + "learning_rate": 8.119329652646463e-05, + "loss": 0.6007, + "step": 6043 + }, + { + "epoch": 1.9406004174024725, + "grad_norm": 1.128095269203186, + "learning_rate": 8.115901395420407e-05, + "loss": 0.6736, + "step": 6044 + }, + { + "epoch": 1.9409214962273238, + "grad_norm": 1.0389516353607178, + "learning_rate": 8.11247336776605e-05, + "loss": 0.6387, + "step": 6045 + }, + { + "epoch": 1.9412425750521753, + "grad_norm": 1.045128583908081, + "learning_rate": 8.109045570101086e-05, + "loss": 0.6013, + "step": 6046 + }, + { + "epoch": 1.9415636538770267, + "grad_norm": 0.6581218838691711, + "learning_rate": 8.105618002843189e-05, + "loss": 0.4185, + "step": 6047 + }, + { + "epoch": 1.9418847327018782, + "grad_norm": 0.7015986442565918, + "learning_rate": 8.102190666409987e-05, + "loss": 0.3757, + "step": 6048 + }, + { + "epoch": 1.9422058115267298, + "grad_norm": 1.0713157653808594, + "learning_rate": 8.0987635612191e-05, + "loss": 0.5619, + "step": 6049 + }, + { + "epoch": 1.9425268903515813, + "grad_norm": 0.5126787424087524, + "learning_rate": 8.095336687688102e-05, + "loss": 0.2985, + "step": 6050 + }, + { + "epoch": 1.9428479691764329, + "grad_norm": 0.9810903668403625, + "learning_rate": 8.091910046234552e-05, + "loss": 0.5227, + "step": 6051 + }, + { + "epoch": 1.9431690480012844, + "grad_norm": 1.0264313220977783, + "learning_rate": 8.088483637275979e-05, + "loss": 0.591, + "step": 6052 + }, + { + "epoch": 1.943490126826136, + "grad_norm": 0.7621950507164001, + "learning_rate": 8.085057461229872e-05, + "loss": 0.4689, + "step": 6053 + }, + { + "epoch": 1.9438112056509873, + "grad_norm": 0.6205912828445435, + "learning_rate": 8.081631518513704e-05, + "loss": 0.339, + "step": 6054 + }, + { + "epoch": 1.9441322844758389, + "grad_norm": 0.9565242528915405, + "learning_rate": 8.078205809544917e-05, + "loss": 0.5811, + "step": 6055 + }, + { + "epoch": 1.9444533633006902, + "grad_norm": 1.0997697114944458, + "learning_rate": 8.074780334740928e-05, + "loss": 0.6262, + "step": 6056 + }, + { + "epoch": 1.9447744421255417, + "grad_norm": 0.8352198004722595, + "learning_rate": 8.071355094519109e-05, + "loss": 0.455, + "step": 6057 + }, + { + "epoch": 1.9450955209503933, + "grad_norm": 1.3071821928024292, + "learning_rate": 8.067930089296827e-05, + "loss": 0.6426, + "step": 6058 + }, + { + "epoch": 1.9454165997752448, + "grad_norm": 0.8504396677017212, + "learning_rate": 8.064505319491398e-05, + "loss": 0.4168, + "step": 6059 + }, + { + "epoch": 1.9457376786000964, + "grad_norm": 1.1026358604431152, + "learning_rate": 8.061080785520126e-05, + "loss": 0.5795, + "step": 6060 + }, + { + "epoch": 1.946058757424948, + "grad_norm": 0.9876211285591125, + "learning_rate": 8.057656487800282e-05, + "loss": 0.4293, + "step": 6061 + }, + { + "epoch": 1.9463798362497995, + "grad_norm": 0.8381431102752686, + "learning_rate": 8.0542324267491e-05, + "loss": 0.4613, + "step": 6062 + }, + { + "epoch": 1.9467009150746508, + "grad_norm": 0.9041491150856018, + "learning_rate": 8.050808602783795e-05, + "loss": 0.4021, + "step": 6063 + }, + { + "epoch": 1.9470219938995024, + "grad_norm": 0.8426972031593323, + "learning_rate": 8.047385016321552e-05, + "loss": 0.5205, + "step": 6064 + }, + { + "epoch": 1.9473430727243537, + "grad_norm": 0.5573900938034058, + "learning_rate": 8.04396166777952e-05, + "loss": 0.6376, + "step": 6065 + }, + { + "epoch": 1.9476641515492052, + "grad_norm": 0.5124462246894836, + "learning_rate": 8.040538557574822e-05, + "loss": 0.7874, + "step": 6066 + }, + { + "epoch": 1.9479852303740568, + "grad_norm": 0.7582365870475769, + "learning_rate": 8.037115686124564e-05, + "loss": 0.3713, + "step": 6067 + }, + { + "epoch": 1.9483063091989083, + "grad_norm": 1.0878230333328247, + "learning_rate": 8.033693053845801e-05, + "loss": 0.5994, + "step": 6068 + }, + { + "epoch": 1.94862738802376, + "grad_norm": 0.8553785681724548, + "learning_rate": 8.030270661155574e-05, + "loss": 0.2631, + "step": 6069 + }, + { + "epoch": 1.9489484668486114, + "grad_norm": 0.5939182043075562, + "learning_rate": 8.026848508470897e-05, + "loss": 0.2166, + "step": 6070 + }, + { + "epoch": 1.949269545673463, + "grad_norm": 0.5438021421432495, + "learning_rate": 8.023426596208739e-05, + "loss": 0.1312, + "step": 6071 + }, + { + "epoch": 1.9495906244983143, + "grad_norm": 0.6619899272918701, + "learning_rate": 8.020004924786059e-05, + "loss": 0.2189, + "step": 6072 + }, + { + "epoch": 1.9499117033231659, + "grad_norm": 0.8623759150505066, + "learning_rate": 8.016583494619769e-05, + "loss": 0.7344, + "step": 6073 + }, + { + "epoch": 1.9502327821480172, + "grad_norm": 0.8690182566642761, + "learning_rate": 8.013162306126765e-05, + "loss": 0.6706, + "step": 6074 + }, + { + "epoch": 1.9505538609728688, + "grad_norm": 0.8359203338623047, + "learning_rate": 8.009741359723906e-05, + "loss": 0.7241, + "step": 6075 + }, + { + "epoch": 1.9508749397977203, + "grad_norm": 0.9023215770721436, + "learning_rate": 8.00632065582803e-05, + "loss": 0.662, + "step": 6076 + }, + { + "epoch": 1.9511960186225719, + "grad_norm": 0.8437222242355347, + "learning_rate": 8.002900194855932e-05, + "loss": 0.4732, + "step": 6077 + }, + { + "epoch": 1.9515170974474234, + "grad_norm": 1.18110990524292, + "learning_rate": 7.999479977224384e-05, + "loss": 0.7354, + "step": 6078 + }, + { + "epoch": 1.951838176272275, + "grad_norm": 0.7391937971115112, + "learning_rate": 7.996060003350139e-05, + "loss": 0.5063, + "step": 6079 + }, + { + "epoch": 1.9521592550971263, + "grad_norm": 0.937690794467926, + "learning_rate": 7.992640273649898e-05, + "loss": 0.5532, + "step": 6080 + }, + { + "epoch": 1.9524803339219778, + "grad_norm": 0.7518649101257324, + "learning_rate": 7.989220788540355e-05, + "loss": 0.5798, + "step": 6081 + }, + { + "epoch": 1.9528014127468294, + "grad_norm": 1.227014422416687, + "learning_rate": 7.985801548438157e-05, + "loss": 0.5997, + "step": 6082 + }, + { + "epoch": 1.9531224915716807, + "grad_norm": 0.9930256009101868, + "learning_rate": 7.982382553759931e-05, + "loss": 0.7134, + "step": 6083 + }, + { + "epoch": 1.9534435703965323, + "grad_norm": 0.8083244562149048, + "learning_rate": 7.97896380492227e-05, + "loss": 0.5235, + "step": 6084 + }, + { + "epoch": 1.9537646492213838, + "grad_norm": 0.8794909119606018, + "learning_rate": 7.975545302341743e-05, + "loss": 0.6455, + "step": 6085 + }, + { + "epoch": 1.9540857280462354, + "grad_norm": 0.7878838181495667, + "learning_rate": 7.972127046434878e-05, + "loss": 0.5327, + "step": 6086 + }, + { + "epoch": 1.954406806871087, + "grad_norm": 0.861558198928833, + "learning_rate": 7.96870903761818e-05, + "loss": 0.4608, + "step": 6087 + }, + { + "epoch": 1.9547278856959385, + "grad_norm": 0.7832056283950806, + "learning_rate": 7.965291276308124e-05, + "loss": 0.5346, + "step": 6088 + }, + { + "epoch": 1.9550489645207898, + "grad_norm": 0.9764854907989502, + "learning_rate": 7.961873762921153e-05, + "loss": 0.5555, + "step": 6089 + }, + { + "epoch": 1.9553700433456414, + "grad_norm": 1.0838629007339478, + "learning_rate": 7.958456497873685e-05, + "loss": 0.6255, + "step": 6090 + }, + { + "epoch": 1.955691122170493, + "grad_norm": 1.2348774671554565, + "learning_rate": 7.955039481582097e-05, + "loss": 0.4626, + "step": 6091 + }, + { + "epoch": 1.9560122009953442, + "grad_norm": 0.8627070784568787, + "learning_rate": 7.951622714462746e-05, + "loss": 0.5652, + "step": 6092 + }, + { + "epoch": 1.9563332798201958, + "grad_norm": 1.0902684926986694, + "learning_rate": 7.948206196931954e-05, + "loss": 0.3824, + "step": 6093 + }, + { + "epoch": 1.9566543586450473, + "grad_norm": 0.9122470617294312, + "learning_rate": 7.944789929406016e-05, + "loss": 0.4996, + "step": 6094 + }, + { + "epoch": 1.9569754374698989, + "grad_norm": 0.9444713592529297, + "learning_rate": 7.941373912301189e-05, + "loss": 0.782, + "step": 6095 + }, + { + "epoch": 1.9572965162947504, + "grad_norm": 0.7925707697868347, + "learning_rate": 7.937958146033705e-05, + "loss": 0.5442, + "step": 6096 + }, + { + "epoch": 1.957617595119602, + "grad_norm": 0.7286838889122009, + "learning_rate": 7.934542631019768e-05, + "loss": 0.4491, + "step": 6097 + }, + { + "epoch": 1.9579386739444533, + "grad_norm": 0.904128909111023, + "learning_rate": 7.931127367675543e-05, + "loss": 0.6565, + "step": 6098 + }, + { + "epoch": 1.9582597527693049, + "grad_norm": 0.8943635821342468, + "learning_rate": 7.927712356417176e-05, + "loss": 0.5327, + "step": 6099 + }, + { + "epoch": 1.9585808315941564, + "grad_norm": 0.8871841430664062, + "learning_rate": 7.92429759766077e-05, + "loss": 0.5109, + "step": 6100 + }, + { + "epoch": 1.9589019104190077, + "grad_norm": 0.6855352520942688, + "learning_rate": 7.920883091822408e-05, + "loss": 0.4418, + "step": 6101 + }, + { + "epoch": 1.9592229892438593, + "grad_norm": 0.8009575009346008, + "learning_rate": 7.917468839318132e-05, + "loss": 0.4312, + "step": 6102 + }, + { + "epoch": 1.9595440680687108, + "grad_norm": 0.8051626682281494, + "learning_rate": 7.914054840563963e-05, + "loss": 0.4664, + "step": 6103 + }, + { + "epoch": 1.9598651468935624, + "grad_norm": 0.9816034436225891, + "learning_rate": 7.910641095975886e-05, + "loss": 0.5121, + "step": 6104 + }, + { + "epoch": 1.960186225718414, + "grad_norm": 1.6955353021621704, + "learning_rate": 7.907227605969849e-05, + "loss": 0.6544, + "step": 6105 + }, + { + "epoch": 1.9605073045432655, + "grad_norm": 0.8258131146430969, + "learning_rate": 7.903814370961784e-05, + "loss": 0.4505, + "step": 6106 + }, + { + "epoch": 1.9608283833681168, + "grad_norm": 1.0741841793060303, + "learning_rate": 7.900401391367576e-05, + "loss": 0.636, + "step": 6107 + }, + { + "epoch": 1.9611494621929684, + "grad_norm": 0.7029876112937927, + "learning_rate": 7.896988667603093e-05, + "loss": 0.4387, + "step": 6108 + }, + { + "epoch": 1.96147054101782, + "grad_norm": 0.8194742798805237, + "learning_rate": 7.893576200084159e-05, + "loss": 0.5288, + "step": 6109 + }, + { + "epoch": 1.9617916198426713, + "grad_norm": 0.9146851301193237, + "learning_rate": 7.89016398922658e-05, + "loss": 0.504, + "step": 6110 + }, + { + "epoch": 1.9621126986675228, + "grad_norm": 0.760689914226532, + "learning_rate": 7.886752035446114e-05, + "loss": 0.4793, + "step": 6111 + }, + { + "epoch": 1.9624337774923744, + "grad_norm": 0.5260710120201111, + "learning_rate": 7.883340339158505e-05, + "loss": 0.3192, + "step": 6112 + }, + { + "epoch": 1.962754856317226, + "grad_norm": 0.7600740194320679, + "learning_rate": 7.879928900779456e-05, + "loss": 0.3748, + "step": 6113 + }, + { + "epoch": 1.9630759351420775, + "grad_norm": 0.8149203658103943, + "learning_rate": 7.876517720724636e-05, + "loss": 0.5003, + "step": 6114 + }, + { + "epoch": 1.963397013966929, + "grad_norm": 0.630042314529419, + "learning_rate": 7.873106799409695e-05, + "loss": 0.6224, + "step": 6115 + }, + { + "epoch": 1.9637180927917803, + "grad_norm": 0.6109962463378906, + "learning_rate": 7.869696137250235e-05, + "loss": 0.6135, + "step": 6116 + }, + { + "epoch": 1.9640391716166319, + "grad_norm": 0.4882936179637909, + "learning_rate": 7.866285734661841e-05, + "loss": 0.3282, + "step": 6117 + }, + { + "epoch": 1.9643602504414834, + "grad_norm": 0.5604743957519531, + "learning_rate": 7.862875592060056e-05, + "loss": 0.2431, + "step": 6118 + }, + { + "epoch": 1.9646813292663348, + "grad_norm": 0.4672786593437195, + "learning_rate": 7.8594657098604e-05, + "loss": 0.183, + "step": 6119 + }, + { + "epoch": 1.9650024080911863, + "grad_norm": 0.4110962450504303, + "learning_rate": 7.856056088478352e-05, + "loss": 0.1327, + "step": 6120 + }, + { + "epoch": 1.9653234869160379, + "grad_norm": 0.5428380370140076, + "learning_rate": 7.852646728329368e-05, + "loss": 0.2487, + "step": 6121 + }, + { + "epoch": 1.9656445657408894, + "grad_norm": 0.7151157855987549, + "learning_rate": 7.849237629828869e-05, + "loss": 0.2885, + "step": 6122 + }, + { + "epoch": 1.965965644565741, + "grad_norm": 0.8936823606491089, + "learning_rate": 7.845828793392236e-05, + "loss": 0.7795, + "step": 6123 + }, + { + "epoch": 1.9662867233905925, + "grad_norm": 0.8891395330429077, + "learning_rate": 7.842420219434833e-05, + "loss": 0.4893, + "step": 6124 + }, + { + "epoch": 1.9666078022154438, + "grad_norm": 0.8358086943626404, + "learning_rate": 7.83901190837198e-05, + "loss": 0.7111, + "step": 6125 + }, + { + "epoch": 1.9669288810402954, + "grad_norm": 0.7215904593467712, + "learning_rate": 7.835603860618972e-05, + "loss": 0.402, + "step": 6126 + }, + { + "epoch": 1.967249959865147, + "grad_norm": 0.7132273316383362, + "learning_rate": 7.832196076591067e-05, + "loss": 0.4719, + "step": 6127 + }, + { + "epoch": 1.9675710386899983, + "grad_norm": 0.9818544387817383, + "learning_rate": 7.828788556703498e-05, + "loss": 0.8142, + "step": 6128 + }, + { + "epoch": 1.9678921175148498, + "grad_norm": 1.1569818258285522, + "learning_rate": 7.825381301371452e-05, + "loss": 0.5118, + "step": 6129 + }, + { + "epoch": 1.9682131963397014, + "grad_norm": 0.9389197826385498, + "learning_rate": 7.821974311010102e-05, + "loss": 0.6396, + "step": 6130 + }, + { + "epoch": 1.968534275164553, + "grad_norm": 1.039969563484192, + "learning_rate": 7.818567586034577e-05, + "loss": 0.7603, + "step": 6131 + }, + { + "epoch": 1.9688553539894045, + "grad_norm": 1.0185346603393555, + "learning_rate": 7.81516112685997e-05, + "loss": 0.6472, + "step": 6132 + }, + { + "epoch": 1.969176432814256, + "grad_norm": 0.9028285145759583, + "learning_rate": 7.811754933901358e-05, + "loss": 0.7387, + "step": 6133 + }, + { + "epoch": 1.9694975116391074, + "grad_norm": 1.1110644340515137, + "learning_rate": 7.808349007573763e-05, + "loss": 0.5489, + "step": 6134 + }, + { + "epoch": 1.969818590463959, + "grad_norm": 1.0398848056793213, + "learning_rate": 7.804943348292197e-05, + "loss": 0.7001, + "step": 6135 + }, + { + "epoch": 1.9701396692888105, + "grad_norm": 0.7271793484687805, + "learning_rate": 7.801537956471624e-05, + "loss": 0.5753, + "step": 6136 + }, + { + "epoch": 1.9704607481136618, + "grad_norm": 1.480493426322937, + "learning_rate": 7.798132832526986e-05, + "loss": 0.657, + "step": 6137 + }, + { + "epoch": 1.9707818269385133, + "grad_norm": 0.8976219892501831, + "learning_rate": 7.79472797687318e-05, + "loss": 0.5007, + "step": 6138 + }, + { + "epoch": 1.9711029057633649, + "grad_norm": 0.8589239716529846, + "learning_rate": 7.791323389925084e-05, + "loss": 0.603, + "step": 6139 + }, + { + "epoch": 1.9714239845882164, + "grad_norm": 0.8826812505722046, + "learning_rate": 7.787919072097531e-05, + "loss": 0.6224, + "step": 6140 + }, + { + "epoch": 1.971745063413068, + "grad_norm": 0.7189533114433289, + "learning_rate": 7.784515023805328e-05, + "loss": 0.4416, + "step": 6141 + }, + { + "epoch": 1.9720661422379195, + "grad_norm": 1.1884037256240845, + "learning_rate": 7.781111245463252e-05, + "loss": 0.8795, + "step": 6142 + }, + { + "epoch": 1.9723872210627709, + "grad_norm": 1.107159972190857, + "learning_rate": 7.777707737486037e-05, + "loss": 0.6781, + "step": 6143 + }, + { + "epoch": 1.9727082998876224, + "grad_norm": 0.7705017924308777, + "learning_rate": 7.774304500288394e-05, + "loss": 0.5458, + "step": 6144 + }, + { + "epoch": 1.9730293787124737, + "grad_norm": 0.8601257801055908, + "learning_rate": 7.770901534284995e-05, + "loss": 0.4366, + "step": 6145 + }, + { + "epoch": 1.9733504575373253, + "grad_norm": 0.6789402961730957, + "learning_rate": 7.767498839890488e-05, + "loss": 0.4362, + "step": 6146 + }, + { + "epoch": 1.9736715363621768, + "grad_norm": 0.7042021155357361, + "learning_rate": 7.76409641751947e-05, + "loss": 0.4543, + "step": 6147 + }, + { + "epoch": 1.9739926151870284, + "grad_norm": 0.9554926753044128, + "learning_rate": 7.760694267586525e-05, + "loss": 0.7007, + "step": 6148 + }, + { + "epoch": 1.97431369401188, + "grad_norm": 0.9321420192718506, + "learning_rate": 7.75729239050619e-05, + "loss": 0.6791, + "step": 6149 + }, + { + "epoch": 1.9746347728367315, + "grad_norm": 0.8884396553039551, + "learning_rate": 7.753890786692972e-05, + "loss": 0.4627, + "step": 6150 + }, + { + "epoch": 1.974955851661583, + "grad_norm": 0.805990993976593, + "learning_rate": 7.750489456561352e-05, + "loss": 0.4353, + "step": 6151 + }, + { + "epoch": 1.9752769304864344, + "grad_norm": 0.9722745418548584, + "learning_rate": 7.747088400525766e-05, + "loss": 0.5465, + "step": 6152 + }, + { + "epoch": 1.975598009311286, + "grad_norm": 0.662632405757904, + "learning_rate": 7.743687619000626e-05, + "loss": 0.4702, + "step": 6153 + }, + { + "epoch": 1.9759190881361373, + "grad_norm": 0.8062264919281006, + "learning_rate": 7.740287112400303e-05, + "loss": 0.5142, + "step": 6154 + }, + { + "epoch": 1.9762401669609888, + "grad_norm": 0.8860646486282349, + "learning_rate": 7.736886881139142e-05, + "loss": 0.4718, + "step": 6155 + }, + { + "epoch": 1.9765612457858404, + "grad_norm": 0.928923487663269, + "learning_rate": 7.733486925631447e-05, + "loss": 0.5866, + "step": 6156 + }, + { + "epoch": 1.976882324610692, + "grad_norm": 0.9381113648414612, + "learning_rate": 7.730087246291502e-05, + "loss": 0.4805, + "step": 6157 + }, + { + "epoch": 1.9772034034355435, + "grad_norm": 0.8858740329742432, + "learning_rate": 7.726687843533538e-05, + "loss": 0.5572, + "step": 6158 + }, + { + "epoch": 1.977524482260395, + "grad_norm": 0.9326648712158203, + "learning_rate": 7.723288717771761e-05, + "loss": 0.5227, + "step": 6159 + }, + { + "epoch": 1.9778455610852466, + "grad_norm": 0.9480475187301636, + "learning_rate": 7.719889869420353e-05, + "loss": 0.4524, + "step": 6160 + }, + { + "epoch": 1.978166639910098, + "grad_norm": 0.7123903632164001, + "learning_rate": 7.716491298893442e-05, + "loss": 0.4944, + "step": 6161 + }, + { + "epoch": 1.9784877187349494, + "grad_norm": 0.5559878349304199, + "learning_rate": 7.713093006605145e-05, + "loss": 0.3082, + "step": 6162 + }, + { + "epoch": 1.9788087975598008, + "grad_norm": 0.6076563000679016, + "learning_rate": 7.709694992969526e-05, + "loss": 0.4055, + "step": 6163 + }, + { + "epoch": 1.9791298763846523, + "grad_norm": 0.5390298366546631, + "learning_rate": 7.706297258400624e-05, + "loss": 0.3068, + "step": 6164 + }, + { + "epoch": 1.9794509552095039, + "grad_norm": 0.533420205116272, + "learning_rate": 7.702899803312443e-05, + "loss": 0.5381, + "step": 6165 + }, + { + "epoch": 1.9797720340343554, + "grad_norm": 0.5672399997711182, + "learning_rate": 7.699502628118958e-05, + "loss": 0.9303, + "step": 6166 + }, + { + "epoch": 1.980093112859207, + "grad_norm": 0.7802035212516785, + "learning_rate": 7.696105733234098e-05, + "loss": 0.5104, + "step": 6167 + }, + { + "epoch": 1.9804141916840585, + "grad_norm": 0.6914035081863403, + "learning_rate": 7.692709119071762e-05, + "loss": 0.5653, + "step": 6168 + }, + { + "epoch": 1.98073527050891, + "grad_norm": 0.6555715203285217, + "learning_rate": 7.689312786045823e-05, + "loss": 0.3267, + "step": 6169 + }, + { + "epoch": 1.9810563493337614, + "grad_norm": 0.8108289241790771, + "learning_rate": 7.685916734570112e-05, + "loss": 0.2342, + "step": 6170 + }, + { + "epoch": 1.981377428158613, + "grad_norm": 0.6541457772254944, + "learning_rate": 7.682520965058428e-05, + "loss": 0.4736, + "step": 6171 + }, + { + "epoch": 1.9816985069834643, + "grad_norm": 0.9171504378318787, + "learning_rate": 7.679125477924534e-05, + "loss": 0.6021, + "step": 6172 + }, + { + "epoch": 1.9820195858083158, + "grad_norm": 0.798708975315094, + "learning_rate": 7.67573027358216e-05, + "loss": 0.537, + "step": 6173 + }, + { + "epoch": 1.9823406646331674, + "grad_norm": 0.8044041991233826, + "learning_rate": 7.672335352445002e-05, + "loss": 0.5214, + "step": 6174 + }, + { + "epoch": 1.982661743458019, + "grad_norm": 0.9100499153137207, + "learning_rate": 7.668940714926725e-05, + "loss": 0.6087, + "step": 6175 + }, + { + "epoch": 1.9829828222828705, + "grad_norm": 0.7136359810829163, + "learning_rate": 7.66554636144095e-05, + "loss": 0.4973, + "step": 6176 + }, + { + "epoch": 1.983303901107722, + "grad_norm": 0.6627728343009949, + "learning_rate": 7.662152292401264e-05, + "loss": 0.4051, + "step": 6177 + }, + { + "epoch": 1.9836249799325736, + "grad_norm": 0.6324448585510254, + "learning_rate": 7.658758508221234e-05, + "loss": 0.4427, + "step": 6178 + }, + { + "epoch": 1.983946058757425, + "grad_norm": 0.8954194188117981, + "learning_rate": 7.655365009314374e-05, + "loss": 0.6454, + "step": 6179 + }, + { + "epoch": 1.9842671375822765, + "grad_norm": 0.952394425868988, + "learning_rate": 7.651971796094183e-05, + "loss": 0.5537, + "step": 6180 + }, + { + "epoch": 1.9845882164071278, + "grad_norm": 0.9424409866333008, + "learning_rate": 7.6485788689741e-05, + "loss": 0.5597, + "step": 6181 + }, + { + "epoch": 1.9849092952319793, + "grad_norm": 1.0070812702178955, + "learning_rate": 7.645186228367554e-05, + "loss": 0.7455, + "step": 6182 + }, + { + "epoch": 1.985230374056831, + "grad_norm": 0.8608502149581909, + "learning_rate": 7.641793874687918e-05, + "loss": 0.678, + "step": 6183 + }, + { + "epoch": 1.9855514528816824, + "grad_norm": 1.0382908582687378, + "learning_rate": 7.638401808348548e-05, + "loss": 0.6999, + "step": 6184 + }, + { + "epoch": 1.985872531706534, + "grad_norm": 1.0702378749847412, + "learning_rate": 7.635010029762756e-05, + "loss": 0.8023, + "step": 6185 + }, + { + "epoch": 1.9861936105313855, + "grad_norm": 0.9120118618011475, + "learning_rate": 7.631618539343814e-05, + "loss": 0.6575, + "step": 6186 + }, + { + "epoch": 1.986514689356237, + "grad_norm": 0.8130465149879456, + "learning_rate": 7.628227337504972e-05, + "loss": 0.5327, + "step": 6187 + }, + { + "epoch": 1.9868357681810884, + "grad_norm": 1.0626096725463867, + "learning_rate": 7.62483642465943e-05, + "loss": 0.6589, + "step": 6188 + }, + { + "epoch": 1.98715684700594, + "grad_norm": 1.0040377378463745, + "learning_rate": 7.621445801220371e-05, + "loss": 0.6529, + "step": 6189 + }, + { + "epoch": 1.9874779258307913, + "grad_norm": 0.877011775970459, + "learning_rate": 7.618055467600922e-05, + "loss": 0.5563, + "step": 6190 + }, + { + "epoch": 1.9877990046556429, + "grad_norm": 0.8776607513427734, + "learning_rate": 7.614665424214193e-05, + "loss": 0.6501, + "step": 6191 + }, + { + "epoch": 1.9881200834804944, + "grad_norm": 1.0523321628570557, + "learning_rate": 7.611275671473245e-05, + "loss": 0.6605, + "step": 6192 + }, + { + "epoch": 1.988441162305346, + "grad_norm": 0.7821586728096008, + "learning_rate": 7.607886209791107e-05, + "loss": 0.5578, + "step": 6193 + }, + { + "epoch": 1.9887622411301975, + "grad_norm": 0.8066548109054565, + "learning_rate": 7.604497039580785e-05, + "loss": 0.5154, + "step": 6194 + }, + { + "epoch": 1.989083319955049, + "grad_norm": 1.202035665512085, + "learning_rate": 7.601108161255226e-05, + "loss": 0.5195, + "step": 6195 + }, + { + "epoch": 1.9894043987799006, + "grad_norm": 0.838572084903717, + "learning_rate": 7.597719575227364e-05, + "loss": 0.5041, + "step": 6196 + }, + { + "epoch": 1.989725477604752, + "grad_norm": 0.9432242512702942, + "learning_rate": 7.594331281910082e-05, + "loss": 0.6335, + "step": 6197 + }, + { + "epoch": 1.9900465564296035, + "grad_norm": 0.9394798278808594, + "learning_rate": 7.590943281716241e-05, + "loss": 0.4824, + "step": 6198 + }, + { + "epoch": 1.9903676352544548, + "grad_norm": 0.6274194717407227, + "learning_rate": 7.587555575058649e-05, + "loss": 0.377, + "step": 6199 + }, + { + "epoch": 1.9906887140793064, + "grad_norm": 1.0759146213531494, + "learning_rate": 7.584168162350098e-05, + "loss": 0.5105, + "step": 6200 + }, + { + "epoch": 1.991009792904158, + "grad_norm": 0.6160650253295898, + "learning_rate": 7.580781044003324e-05, + "loss": 0.4023, + "step": 6201 + }, + { + "epoch": 1.9913308717290095, + "grad_norm": 0.6983049511909485, + "learning_rate": 7.577394220431042e-05, + "loss": 0.3685, + "step": 6202 + }, + { + "epoch": 1.991651950553861, + "grad_norm": 0.6337112784385681, + "learning_rate": 7.574007692045928e-05, + "loss": 0.41, + "step": 6203 + }, + { + "epoch": 1.9919730293787126, + "grad_norm": 0.9221318960189819, + "learning_rate": 7.570621459260615e-05, + "loss": 0.5264, + "step": 6204 + }, + { + "epoch": 1.9922941082035641, + "grad_norm": 0.8658796548843384, + "learning_rate": 7.567235522487712e-05, + "loss": 0.5192, + "step": 6205 + }, + { + "epoch": 1.9926151870284154, + "grad_norm": 0.8173865079879761, + "learning_rate": 7.563849882139776e-05, + "loss": 0.464, + "step": 6206 + }, + { + "epoch": 1.992936265853267, + "grad_norm": 0.7882019877433777, + "learning_rate": 7.560464538629344e-05, + "loss": 0.4191, + "step": 6207 + }, + { + "epoch": 1.9932573446781183, + "grad_norm": 1.4528001546859741, + "learning_rate": 7.557079492368909e-05, + "loss": 0.3734, + "step": 6208 + }, + { + "epoch": 1.9935784235029699, + "grad_norm": 1.0732624530792236, + "learning_rate": 7.553694743770928e-05, + "loss": 0.5081, + "step": 6209 + }, + { + "epoch": 1.9938995023278214, + "grad_norm": 0.9734475612640381, + "learning_rate": 7.550310293247823e-05, + "loss": 0.4522, + "step": 6210 + }, + { + "epoch": 1.994220581152673, + "grad_norm": 1.118553638458252, + "learning_rate": 7.546926141211974e-05, + "loss": 0.5817, + "step": 6211 + }, + { + "epoch": 1.9945416599775245, + "grad_norm": 0.7407261729240417, + "learning_rate": 7.543542288075739e-05, + "loss": 0.3776, + "step": 6212 + }, + { + "epoch": 1.994862738802376, + "grad_norm": 0.7426897287368774, + "learning_rate": 7.54015873425142e-05, + "loss": 0.3893, + "step": 6213 + }, + { + "epoch": 1.9951838176272276, + "grad_norm": 0.4387853443622589, + "learning_rate": 7.536775480151303e-05, + "loss": 0.2971, + "step": 6214 + }, + { + "epoch": 1.995504896452079, + "grad_norm": 0.4856942594051361, + "learning_rate": 7.533392526187617e-05, + "loss": 0.659, + "step": 6215 + }, + { + "epoch": 1.9958259752769305, + "grad_norm": 0.49128448963165283, + "learning_rate": 7.530009872772572e-05, + "loss": 0.6117, + "step": 6216 + }, + { + "epoch": 1.9961470541017818, + "grad_norm": 0.819917619228363, + "learning_rate": 7.526627520318329e-05, + "loss": 0.6772, + "step": 6217 + }, + { + "epoch": 1.9964681329266334, + "grad_norm": 0.7481715083122253, + "learning_rate": 7.523245469237026e-05, + "loss": 0.4364, + "step": 6218 + }, + { + "epoch": 1.996789211751485, + "grad_norm": 0.9296541810035706, + "learning_rate": 7.519863719940748e-05, + "loss": 0.5882, + "step": 6219 + }, + { + "epoch": 1.9971102905763365, + "grad_norm": 0.8651626706123352, + "learning_rate": 7.516482272841549e-05, + "loss": 0.6071, + "step": 6220 + }, + { + "epoch": 1.997431369401188, + "grad_norm": 0.8119394183158875, + "learning_rate": 7.513101128351454e-05, + "loss": 0.4912, + "step": 6221 + }, + { + "epoch": 1.9977524482260396, + "grad_norm": 1.2624839544296265, + "learning_rate": 7.50972028688244e-05, + "loss": 0.5303, + "step": 6222 + }, + { + "epoch": 1.9980735270508911, + "grad_norm": 1.1252477169036865, + "learning_rate": 7.506339748846461e-05, + "loss": 0.7267, + "step": 6223 + }, + { + "epoch": 1.9983946058757425, + "grad_norm": 0.8501682877540588, + "learning_rate": 7.502959514655414e-05, + "loss": 0.6461, + "step": 6224 + }, + { + "epoch": 1.998715684700594, + "grad_norm": 0.9397971034049988, + "learning_rate": 7.499579584721179e-05, + "loss": 0.491, + "step": 6225 + }, + { + "epoch": 1.9990367635254453, + "grad_norm": 0.8632334470748901, + "learning_rate": 7.496199959455584e-05, + "loss": 0.5389, + "step": 6226 + }, + { + "epoch": 1.999357842350297, + "grad_norm": 1.034378170967102, + "learning_rate": 7.492820639270434e-05, + "loss": 0.4877, + "step": 6227 + }, + { + "epoch": 1.9996789211751484, + "grad_norm": 0.5917540192604065, + "learning_rate": 7.489441624577485e-05, + "loss": 0.3548, + "step": 6228 + }, + { + "epoch": 2.0, + "grad_norm": 1.1308932304382324, + "learning_rate": 7.486062915788452e-05, + "loss": 0.6238, + "step": 6229 + }, + { + "epoch": 2.0003210788248516, + "grad_norm": 0.3727356791496277, + "learning_rate": 7.48268451331503e-05, + "loss": 0.5456, + "step": 6230 + }, + { + "epoch": 2.000642157649703, + "grad_norm": 0.40441709756851196, + "learning_rate": 7.479306417568864e-05, + "loss": 0.1526, + "step": 6231 + }, + { + "epoch": 2.0009632364745547, + "grad_norm": 0.27756381034851074, + "learning_rate": 7.475928628961566e-05, + "loss": 0.0981, + "step": 6232 + }, + { + "epoch": 2.001284315299406, + "grad_norm": 0.480465292930603, + "learning_rate": 7.472551147904708e-05, + "loss": 0.126, + "step": 6233 + }, + { + "epoch": 2.0016053941242573, + "grad_norm": 0.26273319125175476, + "learning_rate": 7.469173974809826e-05, + "loss": 0.0916, + "step": 6234 + }, + { + "epoch": 2.001926472949109, + "grad_norm": 0.4176071286201477, + "learning_rate": 7.465797110088417e-05, + "loss": 0.2166, + "step": 6235 + }, + { + "epoch": 2.0022475517739604, + "grad_norm": 0.5277458429336548, + "learning_rate": 7.462420554151944e-05, + "loss": 0.3709, + "step": 6236 + }, + { + "epoch": 2.002568630598812, + "grad_norm": 0.7284386157989502, + "learning_rate": 7.459044307411832e-05, + "loss": 0.4687, + "step": 6237 + }, + { + "epoch": 2.0028897094236635, + "grad_norm": 0.6903724074363708, + "learning_rate": 7.45566837027946e-05, + "loss": 0.4989, + "step": 6238 + }, + { + "epoch": 2.003210788248515, + "grad_norm": 0.7660374641418457, + "learning_rate": 7.45229274316618e-05, + "loss": 0.4508, + "step": 6239 + }, + { + "epoch": 2.0035318670733666, + "grad_norm": 0.5654639005661011, + "learning_rate": 7.448917426483299e-05, + "loss": 0.39, + "step": 6240 + }, + { + "epoch": 2.003852945898218, + "grad_norm": 0.6629282236099243, + "learning_rate": 7.445542420642097e-05, + "loss": 0.3377, + "step": 6241 + }, + { + "epoch": 2.0041740247230697, + "grad_norm": 0.7881885170936584, + "learning_rate": 7.442167726053797e-05, + "loss": 0.3575, + "step": 6242 + }, + { + "epoch": 2.004495103547921, + "grad_norm": 0.6721541881561279, + "learning_rate": 7.438793343129605e-05, + "loss": 0.3635, + "step": 6243 + }, + { + "epoch": 2.0048161823727724, + "grad_norm": 0.8870039582252502, + "learning_rate": 7.435419272280672e-05, + "loss": 0.4121, + "step": 6244 + }, + { + "epoch": 2.005137261197624, + "grad_norm": 0.9872992038726807, + "learning_rate": 7.432045513918122e-05, + "loss": 0.4399, + "step": 6245 + }, + { + "epoch": 2.0054583400224755, + "grad_norm": 0.9258590340614319, + "learning_rate": 7.42867206845304e-05, + "loss": 0.3429, + "step": 6246 + }, + { + "epoch": 2.005779418847327, + "grad_norm": 0.7288257479667664, + "learning_rate": 7.425298936296463e-05, + "loss": 0.3793, + "step": 6247 + }, + { + "epoch": 2.0061004976721786, + "grad_norm": 0.6890774965286255, + "learning_rate": 7.421926117859403e-05, + "loss": 0.3466, + "step": 6248 + }, + { + "epoch": 2.00642157649703, + "grad_norm": 0.7121484875679016, + "learning_rate": 7.418553613552825e-05, + "loss": 0.3155, + "step": 6249 + }, + { + "epoch": 2.0067426553218817, + "grad_norm": 0.5670762658119202, + "learning_rate": 7.415181423787659e-05, + "loss": 0.2762, + "step": 6250 + }, + { + "epoch": 2.0070637341467332, + "grad_norm": 0.684097945690155, + "learning_rate": 7.411809548974792e-05, + "loss": 0.3319, + "step": 6251 + }, + { + "epoch": 2.0073848129715843, + "grad_norm": 0.8124287724494934, + "learning_rate": 7.408437989525085e-05, + "loss": 0.3208, + "step": 6252 + }, + { + "epoch": 2.007705891796436, + "grad_norm": 0.9157565236091614, + "learning_rate": 7.405066745849346e-05, + "loss": 0.4214, + "step": 6253 + }, + { + "epoch": 2.0080269706212874, + "grad_norm": 0.7512825131416321, + "learning_rate": 7.401695818358353e-05, + "loss": 0.2616, + "step": 6254 + }, + { + "epoch": 2.008348049446139, + "grad_norm": 0.834835410118103, + "learning_rate": 7.398325207462846e-05, + "loss": 0.3358, + "step": 6255 + }, + { + "epoch": 2.0086691282709905, + "grad_norm": 1.0159118175506592, + "learning_rate": 7.394954913573517e-05, + "loss": 0.4667, + "step": 6256 + }, + { + "epoch": 2.008990207095842, + "grad_norm": 0.9639686942100525, + "learning_rate": 7.391584937101033e-05, + "loss": 0.2684, + "step": 6257 + }, + { + "epoch": 2.0093112859206936, + "grad_norm": 0.9616678357124329, + "learning_rate": 7.38821527845601e-05, + "loss": 0.4674, + "step": 6258 + }, + { + "epoch": 2.009632364745545, + "grad_norm": 1.6692441701889038, + "learning_rate": 7.384845938049031e-05, + "loss": 0.4352, + "step": 6259 + }, + { + "epoch": 2.0099534435703967, + "grad_norm": 0.8217843770980835, + "learning_rate": 7.381476916290644e-05, + "loss": 0.3344, + "step": 6260 + }, + { + "epoch": 2.010274522395248, + "grad_norm": 1.3268312215805054, + "learning_rate": 7.378108213591355e-05, + "loss": 0.5378, + "step": 6261 + }, + { + "epoch": 2.0105956012200994, + "grad_norm": 0.9014345407485962, + "learning_rate": 7.374739830361621e-05, + "loss": 0.3766, + "step": 6262 + }, + { + "epoch": 2.010916680044951, + "grad_norm": 0.8533119559288025, + "learning_rate": 7.37137176701188e-05, + "loss": 0.2654, + "step": 6263 + }, + { + "epoch": 2.0112377588698025, + "grad_norm": 0.9980586171150208, + "learning_rate": 7.368004023952517e-05, + "loss": 0.3431, + "step": 6264 + }, + { + "epoch": 2.011558837694654, + "grad_norm": 0.7079142332077026, + "learning_rate": 7.364636601593875e-05, + "loss": 0.2977, + "step": 6265 + }, + { + "epoch": 2.0118799165195056, + "grad_norm": 0.8518555164337158, + "learning_rate": 7.361269500346274e-05, + "loss": 0.3331, + "step": 6266 + }, + { + "epoch": 2.012200995344357, + "grad_norm": 0.9751218557357788, + "learning_rate": 7.357902720619976e-05, + "loss": 0.324, + "step": 6267 + }, + { + "epoch": 2.0125220741692087, + "grad_norm": 1.0797030925750732, + "learning_rate": 7.354536262825219e-05, + "loss": 0.399, + "step": 6268 + }, + { + "epoch": 2.0128431529940602, + "grad_norm": 1.0779436826705933, + "learning_rate": 7.351170127372191e-05, + "loss": 0.3863, + "step": 6269 + }, + { + "epoch": 2.0131642318189114, + "grad_norm": 0.9085304737091064, + "learning_rate": 7.347804314671055e-05, + "loss": 0.3424, + "step": 6270 + }, + { + "epoch": 2.013485310643763, + "grad_norm": 0.8026139140129089, + "learning_rate": 7.344438825131911e-05, + "loss": 0.3144, + "step": 6271 + }, + { + "epoch": 2.0138063894686145, + "grad_norm": 0.9833243489265442, + "learning_rate": 7.341073659164848e-05, + "loss": 0.3074, + "step": 6272 + }, + { + "epoch": 2.014127468293466, + "grad_norm": 1.829190969467163, + "learning_rate": 7.33770881717989e-05, + "loss": 0.3371, + "step": 6273 + }, + { + "epoch": 2.0144485471183176, + "grad_norm": 0.8061204552650452, + "learning_rate": 7.334344299587035e-05, + "loss": 0.3001, + "step": 6274 + }, + { + "epoch": 2.014769625943169, + "grad_norm": 0.5113372206687927, + "learning_rate": 7.330980106796246e-05, + "loss": 0.2632, + "step": 6275 + }, + { + "epoch": 2.0150907047680207, + "grad_norm": 0.5458338260650635, + "learning_rate": 7.327616239217431e-05, + "loss": 0.279, + "step": 6276 + }, + { + "epoch": 2.015411783592872, + "grad_norm": 0.7398447394371033, + "learning_rate": 7.324252697260474e-05, + "loss": 0.2997, + "step": 6277 + }, + { + "epoch": 2.0157328624177238, + "grad_norm": 0.7369372248649597, + "learning_rate": 7.320889481335207e-05, + "loss": 0.2572, + "step": 6278 + }, + { + "epoch": 2.016053941242575, + "grad_norm": 0.4629496932029724, + "learning_rate": 7.317526591851433e-05, + "loss": 0.2779, + "step": 6279 + }, + { + "epoch": 2.0163750200674264, + "grad_norm": 0.5169734954833984, + "learning_rate": 7.314164029218904e-05, + "loss": 0.3046, + "step": 6280 + }, + { + "epoch": 2.016696098892278, + "grad_norm": 0.6138988733291626, + "learning_rate": 7.310801793847344e-05, + "loss": 0.6675, + "step": 6281 + }, + { + "epoch": 2.0170171777171295, + "grad_norm": 0.6308935284614563, + "learning_rate": 7.307439886146428e-05, + "loss": 0.1614, + "step": 6282 + }, + { + "epoch": 2.017338256541981, + "grad_norm": 0.7147278785705566, + "learning_rate": 7.30407830652579e-05, + "loss": 0.2836, + "step": 6283 + }, + { + "epoch": 2.0176593353668326, + "grad_norm": 0.5563004612922668, + "learning_rate": 7.300717055395039e-05, + "loss": 0.1, + "step": 6284 + }, + { + "epoch": 2.017980414191684, + "grad_norm": 0.6495770812034607, + "learning_rate": 7.297356133163721e-05, + "loss": 0.1254, + "step": 6285 + }, + { + "epoch": 2.0183014930165357, + "grad_norm": 0.6263350248336792, + "learning_rate": 7.293995540241366e-05, + "loss": 0.2167, + "step": 6286 + }, + { + "epoch": 2.0186225718413873, + "grad_norm": 0.7048402428627014, + "learning_rate": 7.290635277037442e-05, + "loss": 0.3179, + "step": 6287 + }, + { + "epoch": 2.0189436506662384, + "grad_norm": 0.934867799282074, + "learning_rate": 7.287275343961392e-05, + "loss": 0.4398, + "step": 6288 + }, + { + "epoch": 2.01926472949109, + "grad_norm": 0.650744616985321, + "learning_rate": 7.283915741422612e-05, + "loss": 0.4162, + "step": 6289 + }, + { + "epoch": 2.0195858083159415, + "grad_norm": 0.9540171027183533, + "learning_rate": 7.280556469830464e-05, + "loss": 0.4748, + "step": 6290 + }, + { + "epoch": 2.019906887140793, + "grad_norm": 0.6486244201660156, + "learning_rate": 7.277197529594257e-05, + "loss": 0.3104, + "step": 6291 + }, + { + "epoch": 2.0202279659656446, + "grad_norm": 0.7935119867324829, + "learning_rate": 7.273838921123272e-05, + "loss": 0.3595, + "step": 6292 + }, + { + "epoch": 2.020549044790496, + "grad_norm": 0.6961763501167297, + "learning_rate": 7.270480644826749e-05, + "loss": 0.2754, + "step": 6293 + }, + { + "epoch": 2.0208701236153477, + "grad_norm": 0.8212137818336487, + "learning_rate": 7.267122701113876e-05, + "loss": 0.3855, + "step": 6294 + }, + { + "epoch": 2.0211912024401992, + "grad_norm": 0.6237925887107849, + "learning_rate": 7.263765090393817e-05, + "loss": 0.2879, + "step": 6295 + }, + { + "epoch": 2.021512281265051, + "grad_norm": 0.5799258947372437, + "learning_rate": 7.260407813075676e-05, + "loss": 0.29, + "step": 6296 + }, + { + "epoch": 2.021833360089902, + "grad_norm": 0.8160949349403381, + "learning_rate": 7.257050869568535e-05, + "loss": 0.4371, + "step": 6297 + }, + { + "epoch": 2.0221544389147534, + "grad_norm": 0.9621134400367737, + "learning_rate": 7.253694260281425e-05, + "loss": 0.492, + "step": 6298 + }, + { + "epoch": 2.022475517739605, + "grad_norm": 0.7058377861976624, + "learning_rate": 7.250337985623342e-05, + "loss": 0.299, + "step": 6299 + }, + { + "epoch": 2.0227965965644565, + "grad_norm": 0.7675186991691589, + "learning_rate": 7.246982046003234e-05, + "loss": 0.2944, + "step": 6300 + }, + { + "epoch": 2.023117675389308, + "grad_norm": 0.910317063331604, + "learning_rate": 7.243626441830009e-05, + "loss": 0.3878, + "step": 6301 + }, + { + "epoch": 2.0234387542141596, + "grad_norm": 0.9493252038955688, + "learning_rate": 7.240271173512546e-05, + "loss": 0.3264, + "step": 6302 + }, + { + "epoch": 2.023759833039011, + "grad_norm": 1.0064270496368408, + "learning_rate": 7.236916241459663e-05, + "loss": 0.3125, + "step": 6303 + }, + { + "epoch": 2.0240809118638627, + "grad_norm": 0.7101002335548401, + "learning_rate": 7.233561646080161e-05, + "loss": 0.2849, + "step": 6304 + }, + { + "epoch": 2.0244019906887143, + "grad_norm": 0.728187084197998, + "learning_rate": 7.230207387782776e-05, + "loss": 0.2976, + "step": 6305 + }, + { + "epoch": 2.0247230695135654, + "grad_norm": 0.8924553394317627, + "learning_rate": 7.226853466976222e-05, + "loss": 0.4461, + "step": 6306 + }, + { + "epoch": 2.025044148338417, + "grad_norm": 0.7692164182662964, + "learning_rate": 7.22349988406916e-05, + "loss": 0.3043, + "step": 6307 + }, + { + "epoch": 2.0253652271632685, + "grad_norm": 0.7273018956184387, + "learning_rate": 7.220146639470218e-05, + "loss": 0.321, + "step": 6308 + }, + { + "epoch": 2.02568630598812, + "grad_norm": 0.9611377716064453, + "learning_rate": 7.216793733587976e-05, + "loss": 0.2974, + "step": 6309 + }, + { + "epoch": 2.0260073848129716, + "grad_norm": 0.8985442519187927, + "learning_rate": 7.21344116683097e-05, + "loss": 0.3249, + "step": 6310 + }, + { + "epoch": 2.026328463637823, + "grad_norm": 0.7551534175872803, + "learning_rate": 7.210088939607708e-05, + "loss": 0.3713, + "step": 6311 + }, + { + "epoch": 2.0266495424626747, + "grad_norm": 1.1302769184112549, + "learning_rate": 7.206737052326645e-05, + "loss": 0.3553, + "step": 6312 + }, + { + "epoch": 2.0269706212875263, + "grad_norm": 0.7023568153381348, + "learning_rate": 7.203385505396203e-05, + "loss": 0.345, + "step": 6313 + }, + { + "epoch": 2.027291700112378, + "grad_norm": 1.19681715965271, + "learning_rate": 7.20003429922475e-05, + "loss": 0.4365, + "step": 6314 + }, + { + "epoch": 2.027612778937229, + "grad_norm": 1.0040020942687988, + "learning_rate": 7.196683434220625e-05, + "loss": 0.3879, + "step": 6315 + }, + { + "epoch": 2.0279338577620805, + "grad_norm": 0.9291061758995056, + "learning_rate": 7.193332910792124e-05, + "loss": 0.4166, + "step": 6316 + }, + { + "epoch": 2.028254936586932, + "grad_norm": 0.5969425439834595, + "learning_rate": 7.18998272934749e-05, + "loss": 0.2834, + "step": 6317 + }, + { + "epoch": 2.0285760154117836, + "grad_norm": 0.7998864650726318, + "learning_rate": 7.186632890294941e-05, + "loss": 0.3245, + "step": 6318 + }, + { + "epoch": 2.028897094236635, + "grad_norm": 0.6707684993743896, + "learning_rate": 7.183283394042634e-05, + "loss": 0.2533, + "step": 6319 + }, + { + "epoch": 2.0292181730614867, + "grad_norm": 0.6057593822479248, + "learning_rate": 7.179934240998706e-05, + "loss": 0.2811, + "step": 6320 + }, + { + "epoch": 2.029539251886338, + "grad_norm": 0.9882358908653259, + "learning_rate": 7.176585431571235e-05, + "loss": 0.4067, + "step": 6321 + }, + { + "epoch": 2.0298603307111898, + "grad_norm": 0.688306450843811, + "learning_rate": 7.173236966168268e-05, + "loss": 0.2973, + "step": 6322 + }, + { + "epoch": 2.0301814095360413, + "grad_norm": 0.886023223400116, + "learning_rate": 7.169888845197798e-05, + "loss": 0.3354, + "step": 6323 + }, + { + "epoch": 2.0305024883608924, + "grad_norm": 0.6586227416992188, + "learning_rate": 7.166541069067792e-05, + "loss": 0.2762, + "step": 6324 + }, + { + "epoch": 2.030823567185744, + "grad_norm": 1.0189096927642822, + "learning_rate": 7.163193638186158e-05, + "loss": 0.2873, + "step": 6325 + }, + { + "epoch": 2.0311446460105955, + "grad_norm": 0.48503080010414124, + "learning_rate": 7.159846552960774e-05, + "loss": 0.2642, + "step": 6326 + }, + { + "epoch": 2.031465724835447, + "grad_norm": 0.4218836724758148, + "learning_rate": 7.156499813799476e-05, + "loss": 0.247, + "step": 6327 + }, + { + "epoch": 2.0317868036602986, + "grad_norm": 0.5482301712036133, + "learning_rate": 7.153153421110048e-05, + "loss": 0.268, + "step": 6328 + }, + { + "epoch": 2.03210788248515, + "grad_norm": 0.47911396622657776, + "learning_rate": 7.149807375300239e-05, + "loss": 0.2903, + "step": 6329 + }, + { + "epoch": 2.0324289613100017, + "grad_norm": 0.5212134718894958, + "learning_rate": 7.146461676777756e-05, + "loss": 0.8643, + "step": 6330 + }, + { + "epoch": 2.0327500401348533, + "grad_norm": 0.43788644671440125, + "learning_rate": 7.143116325950265e-05, + "loss": 0.3721, + "step": 6331 + }, + { + "epoch": 2.033071118959705, + "grad_norm": 0.5127784609794617, + "learning_rate": 7.139771323225381e-05, + "loss": 0.2185, + "step": 6332 + }, + { + "epoch": 2.033392197784556, + "grad_norm": 0.5072081685066223, + "learning_rate": 7.136426669010689e-05, + "loss": 0.2602, + "step": 6333 + }, + { + "epoch": 2.0337132766094075, + "grad_norm": 0.7099177241325378, + "learning_rate": 7.13308236371372e-05, + "loss": 0.413, + "step": 6334 + }, + { + "epoch": 2.034034355434259, + "grad_norm": 0.9787275791168213, + "learning_rate": 7.129738407741964e-05, + "loss": 0.5135, + "step": 6335 + }, + { + "epoch": 2.0343554342591106, + "grad_norm": 0.8141676783561707, + "learning_rate": 7.126394801502882e-05, + "loss": 0.3424, + "step": 6336 + }, + { + "epoch": 2.034676513083962, + "grad_norm": 0.7322252988815308, + "learning_rate": 7.123051545403874e-05, + "loss": 0.3168, + "step": 6337 + }, + { + "epoch": 2.0349975919088137, + "grad_norm": 0.8565484285354614, + "learning_rate": 7.119708639852312e-05, + "loss": 0.2875, + "step": 6338 + }, + { + "epoch": 2.0353186707336652, + "grad_norm": 0.731372594833374, + "learning_rate": 7.11636608525551e-05, + "loss": 0.3009, + "step": 6339 + }, + { + "epoch": 2.035639749558517, + "grad_norm": 0.9566566348075867, + "learning_rate": 7.113023882020757e-05, + "loss": 0.3571, + "step": 6340 + }, + { + "epoch": 2.0359608283833683, + "grad_norm": 1.051526665687561, + "learning_rate": 7.109682030555283e-05, + "loss": 0.3774, + "step": 6341 + }, + { + "epoch": 2.0362819072082194, + "grad_norm": 0.8493756651878357, + "learning_rate": 7.106340531266292e-05, + "loss": 0.2954, + "step": 6342 + }, + { + "epoch": 2.036602986033071, + "grad_norm": 1.491635799407959, + "learning_rate": 7.102999384560927e-05, + "loss": 0.4563, + "step": 6343 + }, + { + "epoch": 2.0369240648579225, + "grad_norm": 0.7433634400367737, + "learning_rate": 7.099658590846299e-05, + "loss": 0.3044, + "step": 6344 + }, + { + "epoch": 2.037245143682774, + "grad_norm": 0.8622017502784729, + "learning_rate": 7.096318150529477e-05, + "loss": 0.2676, + "step": 6345 + }, + { + "epoch": 2.0375662225076256, + "grad_norm": 0.8949119448661804, + "learning_rate": 7.092978064017475e-05, + "loss": 0.2809, + "step": 6346 + }, + { + "epoch": 2.037887301332477, + "grad_norm": 1.0402297973632812, + "learning_rate": 7.089638331717284e-05, + "loss": 0.4512, + "step": 6347 + }, + { + "epoch": 2.0382083801573287, + "grad_norm": 0.9292569756507874, + "learning_rate": 7.08629895403583e-05, + "loss": 0.4013, + "step": 6348 + }, + { + "epoch": 2.0385294589821803, + "grad_norm": 1.625335454940796, + "learning_rate": 7.082959931380011e-05, + "loss": 0.3156, + "step": 6349 + }, + { + "epoch": 2.038850537807032, + "grad_norm": 0.6953380703926086, + "learning_rate": 7.079621264156675e-05, + "loss": 0.3309, + "step": 6350 + }, + { + "epoch": 2.039171616631883, + "grad_norm": 0.7905119061470032, + "learning_rate": 7.076282952772633e-05, + "loss": 0.3588, + "step": 6351 + }, + { + "epoch": 2.0394926954567345, + "grad_norm": 0.7801741361618042, + "learning_rate": 7.072944997634646e-05, + "loss": 0.2859, + "step": 6352 + }, + { + "epoch": 2.039813774281586, + "grad_norm": 1.3757237195968628, + "learning_rate": 7.069607399149428e-05, + "loss": 0.4971, + "step": 6353 + }, + { + "epoch": 2.0401348531064376, + "grad_norm": 0.847542405128479, + "learning_rate": 7.06627015772366e-05, + "loss": 0.3273, + "step": 6354 + }, + { + "epoch": 2.040455931931289, + "grad_norm": 0.6177198886871338, + "learning_rate": 7.062933273763975e-05, + "loss": 0.2351, + "step": 6355 + }, + { + "epoch": 2.0407770107561407, + "grad_norm": 0.9541710019111633, + "learning_rate": 7.059596747676962e-05, + "loss": 0.3357, + "step": 6356 + }, + { + "epoch": 2.0410980895809923, + "grad_norm": 0.8325964212417603, + "learning_rate": 7.056260579869165e-05, + "loss": 0.2587, + "step": 6357 + }, + { + "epoch": 2.041419168405844, + "grad_norm": 0.8677617907524109, + "learning_rate": 7.052924770747087e-05, + "loss": 0.2691, + "step": 6358 + }, + { + "epoch": 2.0417402472306954, + "grad_norm": 0.9467921257019043, + "learning_rate": 7.049589320717186e-05, + "loss": 0.3171, + "step": 6359 + }, + { + "epoch": 2.0420613260555465, + "grad_norm": 0.9890326261520386, + "learning_rate": 7.04625423018588e-05, + "loss": 0.3976, + "step": 6360 + }, + { + "epoch": 2.042382404880398, + "grad_norm": 0.6989203095436096, + "learning_rate": 7.042919499559537e-05, + "loss": 0.2859, + "step": 6361 + }, + { + "epoch": 2.0427034837052496, + "grad_norm": 0.8812395930290222, + "learning_rate": 7.039585129244477e-05, + "loss": 0.3466, + "step": 6362 + }, + { + "epoch": 2.043024562530101, + "grad_norm": 0.8233697414398193, + "learning_rate": 7.036251119646992e-05, + "loss": 0.2523, + "step": 6363 + }, + { + "epoch": 2.0433456413549527, + "grad_norm": 0.9117799401283264, + "learning_rate": 7.032917471173318e-05, + "loss": 0.3386, + "step": 6364 + }, + { + "epoch": 2.043666720179804, + "grad_norm": 0.9386948347091675, + "learning_rate": 7.029584184229653e-05, + "loss": 0.2908, + "step": 6365 + }, + { + "epoch": 2.0439877990046558, + "grad_norm": 0.7460491061210632, + "learning_rate": 7.026251259222141e-05, + "loss": 0.2804, + "step": 6366 + }, + { + "epoch": 2.0443088778295073, + "grad_norm": 0.7433897256851196, + "learning_rate": 7.022918696556896e-05, + "loss": 0.2923, + "step": 6367 + }, + { + "epoch": 2.0446299566543584, + "grad_norm": 0.6513417959213257, + "learning_rate": 7.019586496639974e-05, + "loss": 0.2907, + "step": 6368 + }, + { + "epoch": 2.04495103547921, + "grad_norm": 0.9250189661979675, + "learning_rate": 7.016254659877398e-05, + "loss": 0.3394, + "step": 6369 + }, + { + "epoch": 2.0452721143040615, + "grad_norm": 0.848927915096283, + "learning_rate": 7.012923186675144e-05, + "loss": 0.3345, + "step": 6370 + }, + { + "epoch": 2.045593193128913, + "grad_norm": 0.718906044960022, + "learning_rate": 7.009592077439134e-05, + "loss": 0.3041, + "step": 6371 + }, + { + "epoch": 2.0459142719537646, + "grad_norm": 0.887864351272583, + "learning_rate": 7.00626133257526e-05, + "loss": 0.3669, + "step": 6372 + }, + { + "epoch": 2.046235350778616, + "grad_norm": 0.7558139562606812, + "learning_rate": 7.002930952489362e-05, + "loss": 0.2784, + "step": 6373 + }, + { + "epoch": 2.0465564296034677, + "grad_norm": 0.6548720002174377, + "learning_rate": 6.999600937587239e-05, + "loss": 0.2803, + "step": 6374 + }, + { + "epoch": 2.0468775084283193, + "grad_norm": 0.7359789609909058, + "learning_rate": 6.996271288274636e-05, + "loss": 0.2794, + "step": 6375 + }, + { + "epoch": 2.047198587253171, + "grad_norm": 0.7246059775352478, + "learning_rate": 6.992942004957271e-05, + "loss": 0.2569, + "step": 6376 + }, + { + "epoch": 2.047519666078022, + "grad_norm": 0.4649535119533539, + "learning_rate": 6.989613088040796e-05, + "loss": 0.2646, + "step": 6377 + }, + { + "epoch": 2.0478407449028735, + "grad_norm": 1.727747917175293, + "learning_rate": 6.986284537930838e-05, + "loss": 0.2704, + "step": 6378 + }, + { + "epoch": 2.048161823727725, + "grad_norm": 0.7226576805114746, + "learning_rate": 6.982956355032968e-05, + "loss": 0.2848, + "step": 6379 + }, + { + "epoch": 2.0484829025525766, + "grad_norm": 0.4427557587623596, + "learning_rate": 6.979628539752711e-05, + "loss": 0.5042, + "step": 6380 + }, + { + "epoch": 2.048803981377428, + "grad_norm": 0.6116136312484741, + "learning_rate": 6.976301092495556e-05, + "loss": 0.7693, + "step": 6381 + }, + { + "epoch": 2.0491250602022797, + "grad_norm": 0.5171028971672058, + "learning_rate": 6.972974013666942e-05, + "loss": 0.3138, + "step": 6382 + }, + { + "epoch": 2.0494461390271312, + "grad_norm": 0.5506100058555603, + "learning_rate": 6.969647303672262e-05, + "loss": 0.1486, + "step": 6383 + }, + { + "epoch": 2.049767217851983, + "grad_norm": 0.48883306980133057, + "learning_rate": 6.966320962916864e-05, + "loss": 0.1554, + "step": 6384 + }, + { + "epoch": 2.0500882966768343, + "grad_norm": 0.720169186592102, + "learning_rate": 6.962994991806059e-05, + "loss": 0.3594, + "step": 6385 + }, + { + "epoch": 2.0504093755016854, + "grad_norm": 0.9285702705383301, + "learning_rate": 6.959669390745097e-05, + "loss": 0.5368, + "step": 6386 + }, + { + "epoch": 2.050730454326537, + "grad_norm": 0.9640303254127502, + "learning_rate": 6.956344160139201e-05, + "loss": 0.3814, + "step": 6387 + }, + { + "epoch": 2.0510515331513886, + "grad_norm": 0.9004988670349121, + "learning_rate": 6.953019300393538e-05, + "loss": 0.4218, + "step": 6388 + }, + { + "epoch": 2.05137261197624, + "grad_norm": 0.4966624975204468, + "learning_rate": 6.949694811913225e-05, + "loss": 0.206, + "step": 6389 + }, + { + "epoch": 2.0516936908010917, + "grad_norm": 0.7329800724983215, + "learning_rate": 6.946370695103353e-05, + "loss": 0.3075, + "step": 6390 + }, + { + "epoch": 2.052014769625943, + "grad_norm": 0.8683103322982788, + "learning_rate": 6.943046950368944e-05, + "loss": 0.4042, + "step": 6391 + }, + { + "epoch": 2.0523358484507948, + "grad_norm": 0.7453745603561401, + "learning_rate": 6.939723578114993e-05, + "loss": 0.325, + "step": 6392 + }, + { + "epoch": 2.0526569272756463, + "grad_norm": 0.8619517683982849, + "learning_rate": 6.93640057874644e-05, + "loss": 0.3574, + "step": 6393 + }, + { + "epoch": 2.052978006100498, + "grad_norm": 0.7723063826560974, + "learning_rate": 6.93307795266819e-05, + "loss": 0.3337, + "step": 6394 + }, + { + "epoch": 2.053299084925349, + "grad_norm": 0.7700170278549194, + "learning_rate": 6.929755700285081e-05, + "loss": 0.4027, + "step": 6395 + }, + { + "epoch": 2.0536201637502005, + "grad_norm": 0.7963249087333679, + "learning_rate": 6.92643382200193e-05, + "loss": 0.3438, + "step": 6396 + }, + { + "epoch": 2.053941242575052, + "grad_norm": 0.8687177896499634, + "learning_rate": 6.923112318223496e-05, + "loss": 0.4145, + "step": 6397 + }, + { + "epoch": 2.0542623213999036, + "grad_norm": 0.988239049911499, + "learning_rate": 6.91979118935449e-05, + "loss": 0.3573, + "step": 6398 + }, + { + "epoch": 2.054583400224755, + "grad_norm": 1.586459755897522, + "learning_rate": 6.916470435799587e-05, + "loss": 0.3187, + "step": 6399 + }, + { + "epoch": 2.0549044790496067, + "grad_norm": 0.8232465386390686, + "learning_rate": 6.913150057963404e-05, + "loss": 0.4458, + "step": 6400 + }, + { + "epoch": 2.0552255578744583, + "grad_norm": 0.6517797708511353, + "learning_rate": 6.909830056250527e-05, + "loss": 0.2838, + "step": 6401 + }, + { + "epoch": 2.05554663669931, + "grad_norm": 0.7707570195198059, + "learning_rate": 6.90651043106548e-05, + "loss": 0.3037, + "step": 6402 + }, + { + "epoch": 2.0558677155241614, + "grad_norm": 1.065109133720398, + "learning_rate": 6.90319118281276e-05, + "loss": 0.5664, + "step": 6403 + }, + { + "epoch": 2.0561887943490125, + "grad_norm": 0.7648012042045593, + "learning_rate": 6.899872311896795e-05, + "loss": 0.392, + "step": 6404 + }, + { + "epoch": 2.056509873173864, + "grad_norm": 0.7669097185134888, + "learning_rate": 6.896553818721989e-05, + "loss": 0.2889, + "step": 6405 + }, + { + "epoch": 2.0568309519987156, + "grad_norm": 1.1542384624481201, + "learning_rate": 6.893235703692685e-05, + "loss": 0.3426, + "step": 6406 + }, + { + "epoch": 2.057152030823567, + "grad_norm": 0.5195868015289307, + "learning_rate": 6.889917967213185e-05, + "loss": 0.2784, + "step": 6407 + }, + { + "epoch": 2.0574731096484187, + "grad_norm": 0.7854254841804504, + "learning_rate": 6.88660060968775e-05, + "loss": 0.3684, + "step": 6408 + }, + { + "epoch": 2.0577941884732702, + "grad_norm": 1.0029950141906738, + "learning_rate": 6.883283631520582e-05, + "loss": 0.4543, + "step": 6409 + }, + { + "epoch": 2.0581152672981218, + "grad_norm": 1.1565372943878174, + "learning_rate": 6.879967033115853e-05, + "loss": 0.4042, + "step": 6410 + }, + { + "epoch": 2.0584363461229733, + "grad_norm": 1.5860395431518555, + "learning_rate": 6.876650814877674e-05, + "loss": 0.3917, + "step": 6411 + }, + { + "epoch": 2.058757424947825, + "grad_norm": 1.0455715656280518, + "learning_rate": 6.873334977210122e-05, + "loss": 0.3976, + "step": 6412 + }, + { + "epoch": 2.059078503772676, + "grad_norm": 0.671190619468689, + "learning_rate": 6.870019520517217e-05, + "loss": 0.31, + "step": 6413 + }, + { + "epoch": 2.0593995825975275, + "grad_norm": 0.5373357534408569, + "learning_rate": 6.866704445202943e-05, + "loss": 0.2763, + "step": 6414 + }, + { + "epoch": 2.059720661422379, + "grad_norm": 1.1658103466033936, + "learning_rate": 6.863389751671225e-05, + "loss": 0.3591, + "step": 6415 + }, + { + "epoch": 2.0600417402472306, + "grad_norm": 0.7469364404678345, + "learning_rate": 6.860075440325951e-05, + "loss": 0.3128, + "step": 6416 + }, + { + "epoch": 2.060362819072082, + "grad_norm": 1.2972486019134521, + "learning_rate": 6.856761511570963e-05, + "loss": 0.3338, + "step": 6417 + }, + { + "epoch": 2.0606838978969337, + "grad_norm": 0.8493379950523376, + "learning_rate": 6.853447965810046e-05, + "loss": 0.277, + "step": 6418 + }, + { + "epoch": 2.0610049767217853, + "grad_norm": 0.7463499307632446, + "learning_rate": 6.850134803446954e-05, + "loss": 0.3535, + "step": 6419 + }, + { + "epoch": 2.061326055546637, + "grad_norm": 0.6443728804588318, + "learning_rate": 6.846822024885379e-05, + "loss": 0.2776, + "step": 6420 + }, + { + "epoch": 2.0616471343714884, + "grad_norm": 1.100448489189148, + "learning_rate": 6.843509630528977e-05, + "loss": 0.3022, + "step": 6421 + }, + { + "epoch": 2.0619682131963395, + "grad_norm": 0.7827779054641724, + "learning_rate": 6.840197620781349e-05, + "loss": 0.3248, + "step": 6422 + }, + { + "epoch": 2.062289292021191, + "grad_norm": 0.8405880331993103, + "learning_rate": 6.836885996046061e-05, + "loss": 0.3416, + "step": 6423 + }, + { + "epoch": 2.0626103708460426, + "grad_norm": 0.832291841506958, + "learning_rate": 6.833574756726618e-05, + "loss": 0.3128, + "step": 6424 + }, + { + "epoch": 2.062931449670894, + "grad_norm": 0.8597456812858582, + "learning_rate": 6.830263903226483e-05, + "loss": 0.3529, + "step": 6425 + }, + { + "epoch": 2.0632525284957457, + "grad_norm": 0.6993900537490845, + "learning_rate": 6.826953435949081e-05, + "loss": 0.3465, + "step": 6426 + }, + { + "epoch": 2.0635736073205972, + "grad_norm": 0.8503331542015076, + "learning_rate": 6.823643355297773e-05, + "loss": 0.3257, + "step": 6427 + }, + { + "epoch": 2.063894686145449, + "grad_norm": 0.460232138633728, + "learning_rate": 6.820333661675893e-05, + "loss": 0.2558, + "step": 6428 + }, + { + "epoch": 2.0642157649703003, + "grad_norm": 0.6968713402748108, + "learning_rate": 6.817024355486706e-05, + "loss": 0.3211, + "step": 6429 + }, + { + "epoch": 2.064536843795152, + "grad_norm": 0.532193660736084, + "learning_rate": 6.81371543713345e-05, + "loss": 0.8584, + "step": 6430 + }, + { + "epoch": 2.064857922620003, + "grad_norm": 0.474894255399704, + "learning_rate": 6.8104069070193e-05, + "loss": 0.6014, + "step": 6431 + }, + { + "epoch": 2.0651790014448546, + "grad_norm": 0.5201006531715393, + "learning_rate": 6.807098765547397e-05, + "loss": 0.2353, + "step": 6432 + }, + { + "epoch": 2.065500080269706, + "grad_norm": 0.5294368267059326, + "learning_rate": 6.803791013120822e-05, + "loss": 0.1455, + "step": 6433 + }, + { + "epoch": 2.0658211590945577, + "grad_norm": 0.3815780580043793, + "learning_rate": 6.800483650142617e-05, + "loss": 0.0917, + "step": 6434 + }, + { + "epoch": 2.066142237919409, + "grad_norm": 0.5428910851478577, + "learning_rate": 6.797176677015775e-05, + "loss": 0.1991, + "step": 6435 + }, + { + "epoch": 2.0664633167442608, + "grad_norm": 0.46137359738349915, + "learning_rate": 6.793870094143238e-05, + "loss": 0.1742, + "step": 6436 + }, + { + "epoch": 2.0667843955691123, + "grad_norm": 0.6830509901046753, + "learning_rate": 6.790563901927907e-05, + "loss": 0.4138, + "step": 6437 + }, + { + "epoch": 2.067105474393964, + "grad_norm": 0.5673910975456238, + "learning_rate": 6.787258100772627e-05, + "loss": 0.2538, + "step": 6438 + }, + { + "epoch": 2.0674265532188154, + "grad_norm": 0.769719123840332, + "learning_rate": 6.783952691080203e-05, + "loss": 0.257, + "step": 6439 + }, + { + "epoch": 2.0677476320436665, + "grad_norm": 0.6150414347648621, + "learning_rate": 6.780647673253391e-05, + "loss": 0.2459, + "step": 6440 + }, + { + "epoch": 2.068068710868518, + "grad_norm": 0.5758988857269287, + "learning_rate": 6.77734304769489e-05, + "loss": 0.2108, + "step": 6441 + }, + { + "epoch": 2.0683897896933696, + "grad_norm": 0.979168713092804, + "learning_rate": 6.774038814807369e-05, + "loss": 0.4103, + "step": 6442 + }, + { + "epoch": 2.068710868518221, + "grad_norm": 0.8934927582740784, + "learning_rate": 6.770734974993426e-05, + "loss": 0.3478, + "step": 6443 + }, + { + "epoch": 2.0690319473430727, + "grad_norm": 1.4555315971374512, + "learning_rate": 6.767431528655635e-05, + "loss": 0.3517, + "step": 6444 + }, + { + "epoch": 2.0693530261679243, + "grad_norm": 0.8350145220756531, + "learning_rate": 6.764128476196505e-05, + "loss": 0.3284, + "step": 6445 + }, + { + "epoch": 2.069674104992776, + "grad_norm": 0.7142661809921265, + "learning_rate": 6.760825818018508e-05, + "loss": 0.2966, + "step": 6446 + }, + { + "epoch": 2.0699951838176274, + "grad_norm": 1.0782002210617065, + "learning_rate": 6.757523554524056e-05, + "loss": 0.4881, + "step": 6447 + }, + { + "epoch": 2.070316262642479, + "grad_norm": 0.9906001687049866, + "learning_rate": 6.754221686115525e-05, + "loss": 0.4128, + "step": 6448 + }, + { + "epoch": 2.07063734146733, + "grad_norm": 0.8098526000976562, + "learning_rate": 6.750920213195238e-05, + "loss": 0.3199, + "step": 6449 + }, + { + "epoch": 2.0709584202921816, + "grad_norm": 1.1390670537948608, + "learning_rate": 6.747619136165463e-05, + "loss": 0.41, + "step": 6450 + }, + { + "epoch": 2.071279499117033, + "grad_norm": 0.8910248875617981, + "learning_rate": 6.744318455428436e-05, + "loss": 0.3355, + "step": 6451 + }, + { + "epoch": 2.0716005779418847, + "grad_norm": 0.9463450312614441, + "learning_rate": 6.741018171386326e-05, + "loss": 0.4561, + "step": 6452 + }, + { + "epoch": 2.0719216567667362, + "grad_norm": 0.7821532487869263, + "learning_rate": 6.737718284441267e-05, + "loss": 0.3068, + "step": 6453 + }, + { + "epoch": 2.072242735591588, + "grad_norm": 1.0844594240188599, + "learning_rate": 6.734418794995337e-05, + "loss": 0.4195, + "step": 6454 + }, + { + "epoch": 2.0725638144164393, + "grad_norm": 0.7623462677001953, + "learning_rate": 6.731119703450577e-05, + "loss": 0.3326, + "step": 6455 + }, + { + "epoch": 2.072884893241291, + "grad_norm": 0.7975969314575195, + "learning_rate": 6.727821010208962e-05, + "loss": 0.3355, + "step": 6456 + }, + { + "epoch": 2.0732059720661424, + "grad_norm": 1.0122053623199463, + "learning_rate": 6.724522715672432e-05, + "loss": 0.4545, + "step": 6457 + }, + { + "epoch": 2.0735270508909935, + "grad_norm": 0.9668064117431641, + "learning_rate": 6.721224820242875e-05, + "loss": 0.381, + "step": 6458 + }, + { + "epoch": 2.073848129715845, + "grad_norm": 0.8006826043128967, + "learning_rate": 6.717927324322124e-05, + "loss": 0.3322, + "step": 6459 + }, + { + "epoch": 2.0741692085406966, + "grad_norm": 0.6765208840370178, + "learning_rate": 6.714630228311978e-05, + "loss": 0.2891, + "step": 6460 + }, + { + "epoch": 2.074490287365548, + "grad_norm": 0.8815526366233826, + "learning_rate": 6.711333532614168e-05, + "loss": 0.3118, + "step": 6461 + }, + { + "epoch": 2.0748113661903997, + "grad_norm": 0.7715243101119995, + "learning_rate": 6.708037237630395e-05, + "loss": 0.3149, + "step": 6462 + }, + { + "epoch": 2.0751324450152513, + "grad_norm": 0.7587166428565979, + "learning_rate": 6.704741343762295e-05, + "loss": 0.3618, + "step": 6463 + }, + { + "epoch": 2.075453523840103, + "grad_norm": 0.6574379205703735, + "learning_rate": 6.701445851411472e-05, + "loss": 0.3009, + "step": 6464 + }, + { + "epoch": 2.0757746026649544, + "grad_norm": 1.0423983335494995, + "learning_rate": 6.698150760979463e-05, + "loss": 0.3738, + "step": 6465 + }, + { + "epoch": 2.076095681489806, + "grad_norm": 0.5287320613861084, + "learning_rate": 6.69485607286777e-05, + "loss": 0.2526, + "step": 6466 + }, + { + "epoch": 2.076416760314657, + "grad_norm": 0.7871254086494446, + "learning_rate": 6.69156178747784e-05, + "loss": 0.2799, + "step": 6467 + }, + { + "epoch": 2.0767378391395086, + "grad_norm": 1.0920846462249756, + "learning_rate": 6.688267905211067e-05, + "loss": 0.3249, + "step": 6468 + }, + { + "epoch": 2.07705891796436, + "grad_norm": 1.0607701539993286, + "learning_rate": 6.684974426468808e-05, + "loss": 0.363, + "step": 6469 + }, + { + "epoch": 2.0773799967892117, + "grad_norm": 0.6999905705451965, + "learning_rate": 6.681681351652356e-05, + "loss": 0.3041, + "step": 6470 + }, + { + "epoch": 2.0777010756140633, + "grad_norm": 1.2206069231033325, + "learning_rate": 6.67838868116297e-05, + "loss": 0.3103, + "step": 6471 + }, + { + "epoch": 2.078022154438915, + "grad_norm": 0.7365018725395203, + "learning_rate": 6.675096415401842e-05, + "loss": 0.3293, + "step": 6472 + }, + { + "epoch": 2.0783432332637664, + "grad_norm": 0.6262559294700623, + "learning_rate": 6.671804554770135e-05, + "loss": 0.2744, + "step": 6473 + }, + { + "epoch": 2.078664312088618, + "grad_norm": 0.5494680404663086, + "learning_rate": 6.668513099668944e-05, + "loss": 0.2866, + "step": 6474 + }, + { + "epoch": 2.0789853909134695, + "grad_norm": 0.6878925561904907, + "learning_rate": 6.66522205049933e-05, + "loss": 0.3257, + "step": 6475 + }, + { + "epoch": 2.0793064697383206, + "grad_norm": 0.7526207566261292, + "learning_rate": 6.661931407662292e-05, + "loss": 0.3167, + "step": 6476 + }, + { + "epoch": 2.079627548563172, + "grad_norm": 1.070246934890747, + "learning_rate": 6.658641171558785e-05, + "loss": 0.3273, + "step": 6477 + }, + { + "epoch": 2.0799486273880237, + "grad_norm": 0.7761482000350952, + "learning_rate": 6.65535134258972e-05, + "loss": 0.304, + "step": 6478 + }, + { + "epoch": 2.080269706212875, + "grad_norm": 0.6462170481681824, + "learning_rate": 6.652061921155943e-05, + "loss": 0.2834, + "step": 6479 + }, + { + "epoch": 2.0805907850377268, + "grad_norm": 0.5869211554527283, + "learning_rate": 6.648772907658272e-05, + "loss": 0.5811, + "step": 6480 + }, + { + "epoch": 2.0809118638625783, + "grad_norm": 0.397668719291687, + "learning_rate": 6.64548430249745e-05, + "loss": 0.2837, + "step": 6481 + }, + { + "epoch": 2.08123294268743, + "grad_norm": 0.5283700227737427, + "learning_rate": 6.642196106074194e-05, + "loss": 0.2056, + "step": 6482 + }, + { + "epoch": 2.0815540215122814, + "grad_norm": 0.5720309615135193, + "learning_rate": 6.638908318789156e-05, + "loss": 0.1464, + "step": 6483 + }, + { + "epoch": 2.081875100337133, + "grad_norm": 0.4584559202194214, + "learning_rate": 6.635620941042945e-05, + "loss": 0.2292, + "step": 6484 + }, + { + "epoch": 2.082196179161984, + "grad_norm": 0.5467604994773865, + "learning_rate": 6.63233397323612e-05, + "loss": 0.2558, + "step": 6485 + }, + { + "epoch": 2.0825172579868356, + "grad_norm": 0.6699813008308411, + "learning_rate": 6.62904741576918e-05, + "loss": 0.3378, + "step": 6486 + }, + { + "epoch": 2.082838336811687, + "grad_norm": 0.9696993231773376, + "learning_rate": 6.62576126904259e-05, + "loss": 0.3986, + "step": 6487 + }, + { + "epoch": 2.0831594156365387, + "grad_norm": 0.788311243057251, + "learning_rate": 6.622475533456751e-05, + "loss": 0.361, + "step": 6488 + }, + { + "epoch": 2.0834804944613903, + "grad_norm": 1.0402991771697998, + "learning_rate": 6.61919020941203e-05, + "loss": 0.3738, + "step": 6489 + }, + { + "epoch": 2.083801573286242, + "grad_norm": 0.8160973191261292, + "learning_rate": 6.61590529730872e-05, + "loss": 0.3736, + "step": 6490 + }, + { + "epoch": 2.0841226521110934, + "grad_norm": 0.6715680360794067, + "learning_rate": 6.612620797547087e-05, + "loss": 0.3151, + "step": 6491 + }, + { + "epoch": 2.084443730935945, + "grad_norm": 0.8539772629737854, + "learning_rate": 6.609336710527332e-05, + "loss": 0.4157, + "step": 6492 + }, + { + "epoch": 2.0847648097607965, + "grad_norm": 0.8843332529067993, + "learning_rate": 6.606053036649619e-05, + "loss": 0.3158, + "step": 6493 + }, + { + "epoch": 2.0850858885856476, + "grad_norm": 0.9574533700942993, + "learning_rate": 6.602769776314049e-05, + "loss": 0.3908, + "step": 6494 + }, + { + "epoch": 2.085406967410499, + "grad_norm": 0.8649710416793823, + "learning_rate": 6.599486929920673e-05, + "loss": 0.3199, + "step": 6495 + }, + { + "epoch": 2.0857280462353507, + "grad_norm": 1.0361381769180298, + "learning_rate": 6.5962044978695e-05, + "loss": 0.4014, + "step": 6496 + }, + { + "epoch": 2.0860491250602022, + "grad_norm": 1.0464524030685425, + "learning_rate": 6.592922480560483e-05, + "loss": 0.2594, + "step": 6497 + }, + { + "epoch": 2.086370203885054, + "grad_norm": 1.0081483125686646, + "learning_rate": 6.589640878393531e-05, + "loss": 0.3912, + "step": 6498 + }, + { + "epoch": 2.0866912827099053, + "grad_norm": 0.9563593864440918, + "learning_rate": 6.58635969176849e-05, + "loss": 0.453, + "step": 6499 + }, + { + "epoch": 2.087012361534757, + "grad_norm": 0.8743892908096313, + "learning_rate": 6.583078921085167e-05, + "loss": 0.3783, + "step": 6500 + }, + { + "epoch": 2.0873334403596084, + "grad_norm": 0.964160680770874, + "learning_rate": 6.579798566743314e-05, + "loss": 0.3375, + "step": 6501 + }, + { + "epoch": 2.08765451918446, + "grad_norm": 1.0127390623092651, + "learning_rate": 6.57651862914263e-05, + "loss": 0.4236, + "step": 6502 + }, + { + "epoch": 2.087975598009311, + "grad_norm": 0.7326671481132507, + "learning_rate": 6.573239108682768e-05, + "loss": 0.3081, + "step": 6503 + }, + { + "epoch": 2.0882966768341626, + "grad_norm": 0.9221829771995544, + "learning_rate": 6.569960005763323e-05, + "loss": 0.284, + "step": 6504 + }, + { + "epoch": 2.088617755659014, + "grad_norm": 0.6714749336242676, + "learning_rate": 6.566681320783849e-05, + "loss": 0.2871, + "step": 6505 + }, + { + "epoch": 2.0889388344838657, + "grad_norm": 0.9831109642982483, + "learning_rate": 6.56340305414384e-05, + "loss": 0.3815, + "step": 6506 + }, + { + "epoch": 2.0892599133087173, + "grad_norm": 0.8641473650932312, + "learning_rate": 6.560125206242746e-05, + "loss": 0.3888, + "step": 6507 + }, + { + "epoch": 2.089580992133569, + "grad_norm": 0.9115124344825745, + "learning_rate": 6.55684777747996e-05, + "loss": 0.3683, + "step": 6508 + }, + { + "epoch": 2.0899020709584204, + "grad_norm": 0.6893644332885742, + "learning_rate": 6.55357076825483e-05, + "loss": 0.2572, + "step": 6509 + }, + { + "epoch": 2.090223149783272, + "grad_norm": 1.011763572692871, + "learning_rate": 6.550294178966647e-05, + "loss": 0.3746, + "step": 6510 + }, + { + "epoch": 2.0905442286081235, + "grad_norm": 0.739936888217926, + "learning_rate": 6.547018010014654e-05, + "loss": 0.338, + "step": 6511 + }, + { + "epoch": 2.0908653074329746, + "grad_norm": 0.9537821412086487, + "learning_rate": 6.543742261798045e-05, + "loss": 0.2584, + "step": 6512 + }, + { + "epoch": 2.091186386257826, + "grad_norm": 0.6458030343055725, + "learning_rate": 6.540466934715953e-05, + "loss": 0.3046, + "step": 6513 + }, + { + "epoch": 2.0915074650826777, + "grad_norm": 0.7992889881134033, + "learning_rate": 6.537192029167474e-05, + "loss": 0.3166, + "step": 6514 + }, + { + "epoch": 2.0918285439075293, + "grad_norm": 0.651301383972168, + "learning_rate": 6.53391754555164e-05, + "loss": 0.2818, + "step": 6515 + }, + { + "epoch": 2.092149622732381, + "grad_norm": 0.6748250722885132, + "learning_rate": 6.530643484267443e-05, + "loss": 0.3025, + "step": 6516 + }, + { + "epoch": 2.0924707015572324, + "grad_norm": 0.7246717810630798, + "learning_rate": 6.52736984571381e-05, + "loss": 0.258, + "step": 6517 + }, + { + "epoch": 2.092791780382084, + "grad_norm": 0.7231757640838623, + "learning_rate": 6.52409663028963e-05, + "loss": 0.2886, + "step": 6518 + }, + { + "epoch": 2.0931128592069355, + "grad_norm": 0.8351329565048218, + "learning_rate": 6.520823838393731e-05, + "loss": 0.3284, + "step": 6519 + }, + { + "epoch": 2.093433938031787, + "grad_norm": 0.9581044316291809, + "learning_rate": 6.517551470424895e-05, + "loss": 0.3134, + "step": 6520 + }, + { + "epoch": 2.093755016856638, + "grad_norm": 0.9175045490264893, + "learning_rate": 6.51427952678185e-05, + "loss": 0.3739, + "step": 6521 + }, + { + "epoch": 2.0940760956814897, + "grad_norm": 0.9837835431098938, + "learning_rate": 6.511008007863268e-05, + "loss": 0.4271, + "step": 6522 + }, + { + "epoch": 2.094397174506341, + "grad_norm": 0.467220664024353, + "learning_rate": 6.507736914067781e-05, + "loss": 0.2381, + "step": 6523 + }, + { + "epoch": 2.0947182533311928, + "grad_norm": 1.0905086994171143, + "learning_rate": 6.504466245793955e-05, + "loss": 0.4082, + "step": 6524 + }, + { + "epoch": 2.0950393321560443, + "grad_norm": 0.6342186331748962, + "learning_rate": 6.501196003440314e-05, + "loss": 0.268, + "step": 6525 + }, + { + "epoch": 2.095360410980896, + "grad_norm": 0.6534708142280579, + "learning_rate": 6.497926187405326e-05, + "loss": 0.261, + "step": 6526 + }, + { + "epoch": 2.0956814898057474, + "grad_norm": 0.5003716349601746, + "learning_rate": 6.494656798087412e-05, + "loss": 0.2658, + "step": 6527 + }, + { + "epoch": 2.096002568630599, + "grad_norm": 0.6496462225914001, + "learning_rate": 6.49138783588493e-05, + "loss": 0.3374, + "step": 6528 + }, + { + "epoch": 2.0963236474554505, + "grad_norm": 0.8161367177963257, + "learning_rate": 6.488119301196201e-05, + "loss": 0.3874, + "step": 6529 + }, + { + "epoch": 2.0966447262803016, + "grad_norm": 0.5122060775756836, + "learning_rate": 6.484851194419484e-05, + "loss": 0.6031, + "step": 6530 + }, + { + "epoch": 2.096965805105153, + "grad_norm": 0.43576663732528687, + "learning_rate": 6.481583515952983e-05, + "loss": 0.6629, + "step": 6531 + }, + { + "epoch": 2.0972868839300047, + "grad_norm": 0.484679639339447, + "learning_rate": 6.478316266194861e-05, + "loss": 0.2203, + "step": 6532 + }, + { + "epoch": 2.0976079627548563, + "grad_norm": 0.5761163830757141, + "learning_rate": 6.475049445543215e-05, + "loss": 0.2443, + "step": 6533 + }, + { + "epoch": 2.097929041579708, + "grad_norm": 0.5513499975204468, + "learning_rate": 6.471783054396105e-05, + "loss": 0.1622, + "step": 6534 + }, + { + "epoch": 2.0982501204045594, + "grad_norm": 0.522097110748291, + "learning_rate": 6.468517093151525e-05, + "loss": 0.1611, + "step": 6535 + }, + { + "epoch": 2.098571199229411, + "grad_norm": 0.7834147214889526, + "learning_rate": 6.465251562207431e-05, + "loss": 0.3935, + "step": 6536 + }, + { + "epoch": 2.0988922780542625, + "grad_norm": 1.0069727897644043, + "learning_rate": 6.461986461961706e-05, + "loss": 0.4389, + "step": 6537 + }, + { + "epoch": 2.099213356879114, + "grad_norm": 1.162589192390442, + "learning_rate": 6.458721792812204e-05, + "loss": 0.4037, + "step": 6538 + }, + { + "epoch": 2.099534435703965, + "grad_norm": 0.7293219566345215, + "learning_rate": 6.455457555156705e-05, + "loss": 0.341, + "step": 6539 + }, + { + "epoch": 2.0998555145288167, + "grad_norm": 0.6971396803855896, + "learning_rate": 6.452193749392952e-05, + "loss": 0.2258, + "step": 6540 + }, + { + "epoch": 2.1001765933536682, + "grad_norm": 0.7499396800994873, + "learning_rate": 6.448930375918631e-05, + "loss": 0.2789, + "step": 6541 + }, + { + "epoch": 2.10049767217852, + "grad_norm": 0.637330174446106, + "learning_rate": 6.44566743513137e-05, + "loss": 0.301, + "step": 6542 + }, + { + "epoch": 2.1008187510033713, + "grad_norm": 0.9882725477218628, + "learning_rate": 6.442404927428751e-05, + "loss": 0.499, + "step": 6543 + }, + { + "epoch": 2.101139829828223, + "grad_norm": 0.7943243980407715, + "learning_rate": 6.4391428532083e-05, + "loss": 0.324, + "step": 6544 + }, + { + "epoch": 2.1014609086530744, + "grad_norm": 0.8991156816482544, + "learning_rate": 6.435881212867493e-05, + "loss": 0.3999, + "step": 6545 + }, + { + "epoch": 2.101781987477926, + "grad_norm": 0.8025935292243958, + "learning_rate": 6.432620006803746e-05, + "loss": 0.3236, + "step": 6546 + }, + { + "epoch": 2.1021030663027775, + "grad_norm": 0.795685350894928, + "learning_rate": 6.429359235414436e-05, + "loss": 0.3178, + "step": 6547 + }, + { + "epoch": 2.1024241451276287, + "grad_norm": 0.8055205345153809, + "learning_rate": 6.426098899096868e-05, + "loss": 0.3545, + "step": 6548 + }, + { + "epoch": 2.10274522395248, + "grad_norm": 0.9374400973320007, + "learning_rate": 6.422838998248307e-05, + "loss": 0.4094, + "step": 6549 + }, + { + "epoch": 2.1030663027773318, + "grad_norm": 1.0976170301437378, + "learning_rate": 6.419579533265968e-05, + "loss": 0.3392, + "step": 6550 + }, + { + "epoch": 2.1033873816021833, + "grad_norm": 0.8357941508293152, + "learning_rate": 6.416320504546997e-05, + "loss": 0.3858, + "step": 6551 + }, + { + "epoch": 2.103708460427035, + "grad_norm": 1.0488425493240356, + "learning_rate": 6.413061912488508e-05, + "loss": 0.4397, + "step": 6552 + }, + { + "epoch": 2.1040295392518864, + "grad_norm": 0.8378785848617554, + "learning_rate": 6.409803757487538e-05, + "loss": 0.3655, + "step": 6553 + }, + { + "epoch": 2.104350618076738, + "grad_norm": 1.1138861179351807, + "learning_rate": 6.406546039941094e-05, + "loss": 0.3253, + "step": 6554 + }, + { + "epoch": 2.1046716969015895, + "grad_norm": 1.0650768280029297, + "learning_rate": 6.403288760246112e-05, + "loss": 0.3725, + "step": 6555 + }, + { + "epoch": 2.1049927757264406, + "grad_norm": 1.272623896598816, + "learning_rate": 6.40003191879949e-05, + "loss": 0.3764, + "step": 6556 + }, + { + "epoch": 2.105313854551292, + "grad_norm": 1.197476863861084, + "learning_rate": 6.396775515998055e-05, + "loss": 0.3797, + "step": 6557 + }, + { + "epoch": 2.1056349333761437, + "grad_norm": 0.9312015771865845, + "learning_rate": 6.393519552238591e-05, + "loss": 0.3614, + "step": 6558 + }, + { + "epoch": 2.1059560122009953, + "grad_norm": 1.1965104341506958, + "learning_rate": 6.390264027917836e-05, + "loss": 0.4493, + "step": 6559 + }, + { + "epoch": 2.106277091025847, + "grad_norm": 0.7930313348770142, + "learning_rate": 6.387008943432455e-05, + "loss": 0.3172, + "step": 6560 + }, + { + "epoch": 2.1065981698506984, + "grad_norm": 0.645929753780365, + "learning_rate": 6.383754299179079e-05, + "loss": 0.3191, + "step": 6561 + }, + { + "epoch": 2.10691924867555, + "grad_norm": 0.7615616917610168, + "learning_rate": 6.380500095554268e-05, + "loss": 0.2763, + "step": 6562 + }, + { + "epoch": 2.1072403275004015, + "grad_norm": 1.1042746305465698, + "learning_rate": 6.377246332954544e-05, + "loss": 0.3695, + "step": 6563 + }, + { + "epoch": 2.107561406325253, + "grad_norm": 0.7434027791023254, + "learning_rate": 6.373993011776367e-05, + "loss": 0.3348, + "step": 6564 + }, + { + "epoch": 2.107882485150104, + "grad_norm": 0.8379579782485962, + "learning_rate": 6.370740132416138e-05, + "loss": 0.3715, + "step": 6565 + }, + { + "epoch": 2.1082035639749557, + "grad_norm": 0.7400662899017334, + "learning_rate": 6.367487695270217e-05, + "loss": 0.3288, + "step": 6566 + }, + { + "epoch": 2.1085246427998072, + "grad_norm": 0.8709506392478943, + "learning_rate": 6.364235700734903e-05, + "loss": 0.3382, + "step": 6567 + }, + { + "epoch": 2.1088457216246588, + "grad_norm": 1.0580693483352661, + "learning_rate": 6.360984149206439e-05, + "loss": 0.3497, + "step": 6568 + }, + { + "epoch": 2.1091668004495103, + "grad_norm": 1.0599806308746338, + "learning_rate": 6.357733041081018e-05, + "loss": 0.3606, + "step": 6569 + }, + { + "epoch": 2.109487879274362, + "grad_norm": 0.7007160782814026, + "learning_rate": 6.35448237675478e-05, + "loss": 0.2877, + "step": 6570 + }, + { + "epoch": 2.1098089580992134, + "grad_norm": 0.9604334831237793, + "learning_rate": 6.351232156623803e-05, + "loss": 0.3173, + "step": 6571 + }, + { + "epoch": 2.110130036924065, + "grad_norm": 0.7274121642112732, + "learning_rate": 6.347982381084123e-05, + "loss": 0.2848, + "step": 6572 + }, + { + "epoch": 2.1104511157489165, + "grad_norm": 0.8316487669944763, + "learning_rate": 6.344733050531713e-05, + "loss": 0.3307, + "step": 6573 + }, + { + "epoch": 2.1107721945737676, + "grad_norm": 0.6824556589126587, + "learning_rate": 6.341484165362487e-05, + "loss": 0.2692, + "step": 6574 + }, + { + "epoch": 2.111093273398619, + "grad_norm": 0.9442518949508667, + "learning_rate": 6.338235725972325e-05, + "loss": 0.326, + "step": 6575 + }, + { + "epoch": 2.1114143522234707, + "grad_norm": 0.46426621079444885, + "learning_rate": 6.334987732757029e-05, + "loss": 0.2598, + "step": 6576 + }, + { + "epoch": 2.1117354310483223, + "grad_norm": 0.6108339428901672, + "learning_rate": 6.33174018611236e-05, + "loss": 0.3211, + "step": 6577 + }, + { + "epoch": 2.112056509873174, + "grad_norm": 0.8628884553909302, + "learning_rate": 6.328493086434023e-05, + "loss": 0.3061, + "step": 6578 + }, + { + "epoch": 2.1123775886980254, + "grad_norm": 0.45920509099960327, + "learning_rate": 6.325246434117668e-05, + "loss": 0.2858, + "step": 6579 + }, + { + "epoch": 2.112698667522877, + "grad_norm": 0.36384066939353943, + "learning_rate": 6.322000229558887e-05, + "loss": 0.3652, + "step": 6580 + }, + { + "epoch": 2.1130197463477285, + "grad_norm": 0.6161776781082153, + "learning_rate": 6.318754473153221e-05, + "loss": 0.4469, + "step": 6581 + }, + { + "epoch": 2.11334082517258, + "grad_norm": 0.5256691575050354, + "learning_rate": 6.315509165296159e-05, + "loss": 0.2341, + "step": 6582 + }, + { + "epoch": 2.113661903997431, + "grad_norm": 0.5201000571250916, + "learning_rate": 6.312264306383124e-05, + "loss": 0.2947, + "step": 6583 + }, + { + "epoch": 2.1139829828222827, + "grad_norm": 0.5829021334648132, + "learning_rate": 6.309019896809503e-05, + "loss": 0.2341, + "step": 6584 + }, + { + "epoch": 2.1143040616471342, + "grad_norm": 0.5891327261924744, + "learning_rate": 6.305775936970606e-05, + "loss": 0.1354, + "step": 6585 + }, + { + "epoch": 2.114625140471986, + "grad_norm": 0.4353516697883606, + "learning_rate": 6.302532427261709e-05, + "loss": 0.1752, + "step": 6586 + }, + { + "epoch": 2.1149462192968373, + "grad_norm": 0.6292280554771423, + "learning_rate": 6.299289368078016e-05, + "loss": 0.327, + "step": 6587 + }, + { + "epoch": 2.115267298121689, + "grad_norm": 0.7321916818618774, + "learning_rate": 6.296046759814694e-05, + "loss": 0.3184, + "step": 6588 + }, + { + "epoch": 2.1155883769465404, + "grad_norm": 1.0000382661819458, + "learning_rate": 6.292804602866834e-05, + "loss": 0.3432, + "step": 6589 + }, + { + "epoch": 2.115909455771392, + "grad_norm": 0.9078216552734375, + "learning_rate": 6.289562897629492e-05, + "loss": 0.4655, + "step": 6590 + }, + { + "epoch": 2.1162305345962436, + "grad_norm": 0.827180802822113, + "learning_rate": 6.286321644497655e-05, + "loss": 0.369, + "step": 6591 + }, + { + "epoch": 2.1165516134210947, + "grad_norm": 0.5973712205886841, + "learning_rate": 6.283080843866256e-05, + "loss": 0.2657, + "step": 6592 + }, + { + "epoch": 2.116872692245946, + "grad_norm": 0.7066929936408997, + "learning_rate": 6.27984049613019e-05, + "loss": 0.3177, + "step": 6593 + }, + { + "epoch": 2.1171937710707978, + "grad_norm": 0.8273329734802246, + "learning_rate": 6.276600601684267e-05, + "loss": 0.3679, + "step": 6594 + }, + { + "epoch": 2.1175148498956493, + "grad_norm": 0.9276707768440247, + "learning_rate": 6.27336116092327e-05, + "loss": 0.4267, + "step": 6595 + }, + { + "epoch": 2.117835928720501, + "grad_norm": 0.9455394744873047, + "learning_rate": 6.27012217424191e-05, + "loss": 0.3272, + "step": 6596 + }, + { + "epoch": 2.1181570075453524, + "grad_norm": 1.1003650426864624, + "learning_rate": 6.266883642034853e-05, + "loss": 0.4239, + "step": 6597 + }, + { + "epoch": 2.118478086370204, + "grad_norm": 1.107857346534729, + "learning_rate": 6.263645564696696e-05, + "loss": 0.436, + "step": 6598 + }, + { + "epoch": 2.1187991651950555, + "grad_norm": 0.7510982155799866, + "learning_rate": 6.260407942621998e-05, + "loss": 0.3712, + "step": 6599 + }, + { + "epoch": 2.119120244019907, + "grad_norm": 0.7718844413757324, + "learning_rate": 6.257170776205245e-05, + "loss": 0.3182, + "step": 6600 + }, + { + "epoch": 2.119441322844758, + "grad_norm": 0.8712736964225769, + "learning_rate": 6.25393406584088e-05, + "loss": 0.3044, + "step": 6601 + }, + { + "epoch": 2.1197624016696097, + "grad_norm": 1.5617177486419678, + "learning_rate": 6.25069781192329e-05, + "loss": 0.42, + "step": 6602 + }, + { + "epoch": 2.1200834804944613, + "grad_norm": 0.9841622710227966, + "learning_rate": 6.247462014846792e-05, + "loss": 0.399, + "step": 6603 + }, + { + "epoch": 2.120404559319313, + "grad_norm": 0.9455597400665283, + "learning_rate": 6.24422667500567e-05, + "loss": 0.4426, + "step": 6604 + }, + { + "epoch": 2.1207256381441644, + "grad_norm": 0.6564199924468994, + "learning_rate": 6.240991792794133e-05, + "loss": 0.2695, + "step": 6605 + }, + { + "epoch": 2.121046716969016, + "grad_norm": 0.9546607732772827, + "learning_rate": 6.237757368606345e-05, + "loss": 0.4005, + "step": 6606 + }, + { + "epoch": 2.1213677957938675, + "grad_norm": 0.9721749424934387, + "learning_rate": 6.234523402836407e-05, + "loss": 0.4454, + "step": 6607 + }, + { + "epoch": 2.121688874618719, + "grad_norm": 1.2680310010910034, + "learning_rate": 6.231289895878375e-05, + "loss": 0.3899, + "step": 6608 + }, + { + "epoch": 2.1220099534435706, + "grad_norm": 0.8545310497283936, + "learning_rate": 6.228056848126236e-05, + "loss": 0.3709, + "step": 6609 + }, + { + "epoch": 2.1223310322684217, + "grad_norm": 0.8629288077354431, + "learning_rate": 6.224824259973925e-05, + "loss": 0.2811, + "step": 6610 + }, + { + "epoch": 2.1226521110932732, + "grad_norm": 1.1735135316848755, + "learning_rate": 6.22159213181533e-05, + "loss": 0.3109, + "step": 6611 + }, + { + "epoch": 2.122973189918125, + "grad_norm": 0.7792378067970276, + "learning_rate": 6.21836046404427e-05, + "loss": 0.3302, + "step": 6612 + }, + { + "epoch": 2.1232942687429763, + "grad_norm": 0.7332779169082642, + "learning_rate": 6.215129257054522e-05, + "loss": 0.3134, + "step": 6613 + }, + { + "epoch": 2.123615347567828, + "grad_norm": 0.6187347769737244, + "learning_rate": 6.211898511239787e-05, + "loss": 0.2562, + "step": 6614 + }, + { + "epoch": 2.1239364263926794, + "grad_norm": 0.8484505414962769, + "learning_rate": 6.20866822699373e-05, + "loss": 0.2516, + "step": 6615 + }, + { + "epoch": 2.124257505217531, + "grad_norm": 0.8732132911682129, + "learning_rate": 6.205438404709947e-05, + "loss": 0.3172, + "step": 6616 + }, + { + "epoch": 2.1245785840423825, + "grad_norm": 0.8905081152915955, + "learning_rate": 6.20220904478199e-05, + "loss": 0.3439, + "step": 6617 + }, + { + "epoch": 2.124899662867234, + "grad_norm": 0.7029334306716919, + "learning_rate": 6.198980147603339e-05, + "loss": 0.2728, + "step": 6618 + }, + { + "epoch": 2.125220741692085, + "grad_norm": 0.8085559010505676, + "learning_rate": 6.195751713567427e-05, + "loss": 0.2663, + "step": 6619 + }, + { + "epoch": 2.1255418205169367, + "grad_norm": 0.8118886351585388, + "learning_rate": 6.192523743067626e-05, + "loss": 0.3628, + "step": 6620 + }, + { + "epoch": 2.1258628993417883, + "grad_norm": 0.5868399143218994, + "learning_rate": 6.18929623649726e-05, + "loss": 0.2687, + "step": 6621 + }, + { + "epoch": 2.12618397816664, + "grad_norm": 0.5492334961891174, + "learning_rate": 6.18606919424959e-05, + "loss": 0.2749, + "step": 6622 + }, + { + "epoch": 2.1265050569914914, + "grad_norm": 1.074462652206421, + "learning_rate": 6.182842616717817e-05, + "loss": 0.3686, + "step": 6623 + }, + { + "epoch": 2.126826135816343, + "grad_norm": 0.8293436765670776, + "learning_rate": 6.179616504295092e-05, + "loss": 0.3272, + "step": 6624 + }, + { + "epoch": 2.1271472146411945, + "grad_norm": 1.137435793876648, + "learning_rate": 6.176390857374507e-05, + "loss": 0.3109, + "step": 6625 + }, + { + "epoch": 2.127468293466046, + "grad_norm": 0.7483497858047485, + "learning_rate": 6.173165676349103e-05, + "loss": 0.343, + "step": 6626 + }, + { + "epoch": 2.1277893722908976, + "grad_norm": 0.40780845284461975, + "learning_rate": 6.169940961611854e-05, + "loss": 0.2489, + "step": 6627 + }, + { + "epoch": 2.1281104511157487, + "grad_norm": 0.6803835034370422, + "learning_rate": 6.166716713555674e-05, + "loss": 0.3458, + "step": 6628 + }, + { + "epoch": 2.1284315299406003, + "grad_norm": 0.6361788511276245, + "learning_rate": 6.163492932573438e-05, + "loss": 0.3, + "step": 6629 + }, + { + "epoch": 2.128752608765452, + "grad_norm": 0.5756948590278625, + "learning_rate": 6.160269619057951e-05, + "loss": 0.6565, + "step": 6630 + }, + { + "epoch": 2.1290736875903034, + "grad_norm": 0.5994390845298767, + "learning_rate": 6.157046773401964e-05, + "loss": 0.4455, + "step": 6631 + }, + { + "epoch": 2.129394766415155, + "grad_norm": 0.661205530166626, + "learning_rate": 6.153824395998168e-05, + "loss": 0.1752, + "step": 6632 + }, + { + "epoch": 2.1297158452400065, + "grad_norm": 0.5093392729759216, + "learning_rate": 6.150602487239206e-05, + "loss": 0.0911, + "step": 6633 + }, + { + "epoch": 2.130036924064858, + "grad_norm": 0.4452694356441498, + "learning_rate": 6.147381047517655e-05, + "loss": 0.0949, + "step": 6634 + }, + { + "epoch": 2.1303580028897096, + "grad_norm": 0.4948364496231079, + "learning_rate": 6.144160077226036e-05, + "loss": 0.1567, + "step": 6635 + }, + { + "epoch": 2.130679081714561, + "grad_norm": 0.5426846742630005, + "learning_rate": 6.140939576756817e-05, + "loss": 0.2952, + "step": 6636 + }, + { + "epoch": 2.131000160539412, + "grad_norm": 0.6515873074531555, + "learning_rate": 6.1377195465024e-05, + "loss": 0.3118, + "step": 6637 + }, + { + "epoch": 2.1313212393642638, + "grad_norm": 0.7269318103790283, + "learning_rate": 6.134499986855144e-05, + "loss": 0.3818, + "step": 6638 + }, + { + "epoch": 2.1316423181891153, + "grad_norm": 0.7911025285720825, + "learning_rate": 6.131280898207339e-05, + "loss": 0.3881, + "step": 6639 + }, + { + "epoch": 2.131963397013967, + "grad_norm": 0.8551725149154663, + "learning_rate": 6.128062280951224e-05, + "loss": 0.3669, + "step": 6640 + }, + { + "epoch": 2.1322844758388184, + "grad_norm": 0.8696125745773315, + "learning_rate": 6.12484413547897e-05, + "loss": 0.3246, + "step": 6641 + }, + { + "epoch": 2.13260555466367, + "grad_norm": 0.9644296765327454, + "learning_rate": 6.121626462182707e-05, + "loss": 0.3639, + "step": 6642 + }, + { + "epoch": 2.1329266334885215, + "grad_norm": 0.9500702619552612, + "learning_rate": 6.118409261454494e-05, + "loss": 0.2879, + "step": 6643 + }, + { + "epoch": 2.133247712313373, + "grad_norm": 0.942509114742279, + "learning_rate": 6.11519253368634e-05, + "loss": 0.3482, + "step": 6644 + }, + { + "epoch": 2.1335687911382246, + "grad_norm": 0.7259782552719116, + "learning_rate": 6.111976279270192e-05, + "loss": 0.3214, + "step": 6645 + }, + { + "epoch": 2.1338898699630757, + "grad_norm": 0.7801980972290039, + "learning_rate": 6.108760498597938e-05, + "loss": 0.3301, + "step": 6646 + }, + { + "epoch": 2.1342109487879273, + "grad_norm": 0.8624287247657776, + "learning_rate": 6.105545192061416e-05, + "loss": 0.3717, + "step": 6647 + }, + { + "epoch": 2.134532027612779, + "grad_norm": 0.8877105712890625, + "learning_rate": 6.1023303600523975e-05, + "loss": 0.4454, + "step": 6648 + }, + { + "epoch": 2.1348531064376304, + "grad_norm": 0.7793693542480469, + "learning_rate": 6.099116002962604e-05, + "loss": 0.3266, + "step": 6649 + }, + { + "epoch": 2.135174185262482, + "grad_norm": 1.096024990081787, + "learning_rate": 6.09590212118369e-05, + "loss": 0.4317, + "step": 6650 + }, + { + "epoch": 2.1354952640873335, + "grad_norm": 0.889217734336853, + "learning_rate": 6.092688715107264e-05, + "loss": 0.4424, + "step": 6651 + }, + { + "epoch": 2.135816342912185, + "grad_norm": 0.8628430962562561, + "learning_rate": 6.089475785124863e-05, + "loss": 0.3435, + "step": 6652 + }, + { + "epoch": 2.1361374217370366, + "grad_norm": 0.8696472644805908, + "learning_rate": 6.086263331627976e-05, + "loss": 0.3443, + "step": 6653 + }, + { + "epoch": 2.136458500561888, + "grad_norm": 0.9807567596435547, + "learning_rate": 6.0830513550080335e-05, + "loss": 0.3259, + "step": 6654 + }, + { + "epoch": 2.1367795793867392, + "grad_norm": 0.9957831501960754, + "learning_rate": 6.0798398556563976e-05, + "loss": 0.3485, + "step": 6655 + }, + { + "epoch": 2.137100658211591, + "grad_norm": 1.0580285787582397, + "learning_rate": 6.076628833964388e-05, + "loss": 0.4838, + "step": 6656 + }, + { + "epoch": 2.1374217370364423, + "grad_norm": 0.7610054016113281, + "learning_rate": 6.073418290323251e-05, + "loss": 0.3736, + "step": 6657 + }, + { + "epoch": 2.137742815861294, + "grad_norm": 0.8939658999443054, + "learning_rate": 6.070208225124185e-05, + "loss": 0.3186, + "step": 6658 + }, + { + "epoch": 2.1380638946861454, + "grad_norm": 0.9913893342018127, + "learning_rate": 6.066998638758326e-05, + "loss": 0.3687, + "step": 6659 + }, + { + "epoch": 2.138384973510997, + "grad_norm": 0.6824525594711304, + "learning_rate": 6.063789531616757e-05, + "loss": 0.277, + "step": 6660 + }, + { + "epoch": 2.1387060523358485, + "grad_norm": 0.935042679309845, + "learning_rate": 6.0605809040904894e-05, + "loss": 0.3631, + "step": 6661 + }, + { + "epoch": 2.1390271311607, + "grad_norm": 0.7435742020606995, + "learning_rate": 6.05737275657049e-05, + "loss": 0.3319, + "step": 6662 + }, + { + "epoch": 2.1393482099855516, + "grad_norm": 0.6883519291877747, + "learning_rate": 6.054165089447663e-05, + "loss": 0.3006, + "step": 6663 + }, + { + "epoch": 2.1396692888104027, + "grad_norm": 0.7741847038269043, + "learning_rate": 6.0509579031128485e-05, + "loss": 0.3047, + "step": 6664 + }, + { + "epoch": 2.1399903676352543, + "grad_norm": 0.8180326223373413, + "learning_rate": 6.047751197956838e-05, + "loss": 0.3593, + "step": 6665 + }, + { + "epoch": 2.140311446460106, + "grad_norm": 1.200945496559143, + "learning_rate": 6.0445449743703516e-05, + "loss": 0.3234, + "step": 6666 + }, + { + "epoch": 2.1406325252849574, + "grad_norm": 0.9025703072547913, + "learning_rate": 6.0413392327440635e-05, + "loss": 0.3599, + "step": 6667 + }, + { + "epoch": 2.140953604109809, + "grad_norm": 0.9718348979949951, + "learning_rate": 6.03813397346858e-05, + "loss": 0.3025, + "step": 6668 + }, + { + "epoch": 2.1412746829346605, + "grad_norm": 0.9838557243347168, + "learning_rate": 6.034929196934459e-05, + "loss": 0.3157, + "step": 6669 + }, + { + "epoch": 2.141595761759512, + "grad_norm": 0.940606951713562, + "learning_rate": 6.031724903532183e-05, + "loss": 0.3326, + "step": 6670 + }, + { + "epoch": 2.1419168405843636, + "grad_norm": 0.6957113742828369, + "learning_rate": 6.0285210936521955e-05, + "loss": 0.2739, + "step": 6671 + }, + { + "epoch": 2.142237919409215, + "grad_norm": 0.977173388004303, + "learning_rate": 6.025317767684864e-05, + "loss": 0.3673, + "step": 6672 + }, + { + "epoch": 2.1425589982340663, + "grad_norm": 0.6859129071235657, + "learning_rate": 6.022114926020504e-05, + "loss": 0.3025, + "step": 6673 + }, + { + "epoch": 2.142880077058918, + "grad_norm": 0.8257904052734375, + "learning_rate": 6.018912569049376e-05, + "loss": 0.3357, + "step": 6674 + }, + { + "epoch": 2.1432011558837694, + "grad_norm": 0.5588051080703735, + "learning_rate": 6.015710697161674e-05, + "loss": 0.2849, + "step": 6675 + }, + { + "epoch": 2.143522234708621, + "grad_norm": 0.5934134125709534, + "learning_rate": 6.012509310747538e-05, + "loss": 0.2849, + "step": 6676 + }, + { + "epoch": 2.1438433135334725, + "grad_norm": 0.800035834312439, + "learning_rate": 6.009308410197047e-05, + "loss": 0.2704, + "step": 6677 + }, + { + "epoch": 2.144164392358324, + "grad_norm": 0.42928749322891235, + "learning_rate": 6.006107995900224e-05, + "loss": 0.2584, + "step": 6678 + }, + { + "epoch": 2.1444854711831756, + "grad_norm": 0.4132596552371979, + "learning_rate": 6.002908068247024e-05, + "loss": 0.2802, + "step": 6679 + }, + { + "epoch": 2.144806550008027, + "grad_norm": 0.4437423348426819, + "learning_rate": 5.999708627627354e-05, + "loss": 0.4253, + "step": 6680 + }, + { + "epoch": 2.1451276288328787, + "grad_norm": 0.4113101065158844, + "learning_rate": 5.9965096744310526e-05, + "loss": 0.2371, + "step": 6681 + }, + { + "epoch": 2.1454487076577298, + "grad_norm": 0.513230562210083, + "learning_rate": 5.9933112090479006e-05, + "loss": 0.1586, + "step": 6682 + }, + { + "epoch": 2.1457697864825813, + "grad_norm": 0.5119988918304443, + "learning_rate": 5.990113231867629e-05, + "loss": 0.1865, + "step": 6683 + }, + { + "epoch": 2.146090865307433, + "grad_norm": 0.6047758460044861, + "learning_rate": 5.9869157432798926e-05, + "loss": 0.2309, + "step": 6684 + }, + { + "epoch": 2.1464119441322844, + "grad_norm": 0.5919579267501831, + "learning_rate": 5.9837187436743016e-05, + "loss": 0.1818, + "step": 6685 + }, + { + "epoch": 2.146733022957136, + "grad_norm": 0.44815778732299805, + "learning_rate": 5.9805222334404e-05, + "loss": 0.1748, + "step": 6686 + }, + { + "epoch": 2.1470541017819875, + "grad_norm": 0.4934236705303192, + "learning_rate": 5.977326212967671e-05, + "loss": 0.1963, + "step": 6687 + }, + { + "epoch": 2.147375180606839, + "grad_norm": 0.8252405524253845, + "learning_rate": 5.974130682645538e-05, + "loss": 0.3972, + "step": 6688 + }, + { + "epoch": 2.1476962594316906, + "grad_norm": 1.336822509765625, + "learning_rate": 5.9709356428633746e-05, + "loss": 0.3795, + "step": 6689 + }, + { + "epoch": 2.148017338256542, + "grad_norm": 0.7905248999595642, + "learning_rate": 5.967741094010479e-05, + "loss": 0.4087, + "step": 6690 + }, + { + "epoch": 2.1483384170813933, + "grad_norm": 0.7174348831176758, + "learning_rate": 5.964547036476099e-05, + "loss": 0.3429, + "step": 6691 + }, + { + "epoch": 2.148659495906245, + "grad_norm": 0.7305684685707092, + "learning_rate": 5.961353470649426e-05, + "loss": 0.2866, + "step": 6692 + }, + { + "epoch": 2.1489805747310964, + "grad_norm": 0.6904447078704834, + "learning_rate": 5.9581603969195766e-05, + "loss": 0.3, + "step": 6693 + }, + { + "epoch": 2.149301653555948, + "grad_norm": 0.8633493185043335, + "learning_rate": 5.954967815675627e-05, + "loss": 0.3736, + "step": 6694 + }, + { + "epoch": 2.1496227323807995, + "grad_norm": 0.8246678113937378, + "learning_rate": 5.951775727306577e-05, + "loss": 0.359, + "step": 6695 + }, + { + "epoch": 2.149943811205651, + "grad_norm": 1.400453805923462, + "learning_rate": 5.9485841322013755e-05, + "loss": 0.454, + "step": 6696 + }, + { + "epoch": 2.1502648900305026, + "grad_norm": 2.0813472270965576, + "learning_rate": 5.94539303074891e-05, + "loss": 0.3139, + "step": 6697 + }, + { + "epoch": 2.150585968855354, + "grad_norm": 0.9798773527145386, + "learning_rate": 5.9422024233380013e-05, + "loss": 0.4055, + "step": 6698 + }, + { + "epoch": 2.1509070476802057, + "grad_norm": 0.9480912685394287, + "learning_rate": 5.9390123103574215e-05, + "loss": 0.3727, + "step": 6699 + }, + { + "epoch": 2.151228126505057, + "grad_norm": 0.9623056650161743, + "learning_rate": 5.935822692195869e-05, + "loss": 0.2921, + "step": 6700 + }, + { + "epoch": 2.1515492053299083, + "grad_norm": 0.8746601343154907, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.3345, + "step": 6701 + }, + { + "epoch": 2.15187028415476, + "grad_norm": 0.725217878818512, + "learning_rate": 5.929444941884388e-05, + "loss": 0.346, + "step": 6702 + }, + { + "epoch": 2.1521913629796114, + "grad_norm": 0.8172053098678589, + "learning_rate": 5.9262568105115654e-05, + "loss": 0.3468, + "step": 6703 + }, + { + "epoch": 2.152512441804463, + "grad_norm": 0.648247241973877, + "learning_rate": 5.92306917551199e-05, + "loss": 0.2908, + "step": 6704 + }, + { + "epoch": 2.1528335206293145, + "grad_norm": 0.7141852378845215, + "learning_rate": 5.9198820372740726e-05, + "loss": 0.2882, + "step": 6705 + }, + { + "epoch": 2.153154599454166, + "grad_norm": 0.8479703068733215, + "learning_rate": 5.916695396186154e-05, + "loss": 0.3407, + "step": 6706 + }, + { + "epoch": 2.1534756782790176, + "grad_norm": 0.8146336674690247, + "learning_rate": 5.913509252636511e-05, + "loss": 0.2869, + "step": 6707 + }, + { + "epoch": 2.153796757103869, + "grad_norm": 0.6545037627220154, + "learning_rate": 5.910323607013373e-05, + "loss": 0.2742, + "step": 6708 + }, + { + "epoch": 2.1541178359287203, + "grad_norm": 0.7448673248291016, + "learning_rate": 5.907138459704895e-05, + "loss": 0.3192, + "step": 6709 + }, + { + "epoch": 2.154438914753572, + "grad_norm": 0.7796539664268494, + "learning_rate": 5.903953811099183e-05, + "loss": 0.2947, + "step": 6710 + }, + { + "epoch": 2.1547599935784234, + "grad_norm": 1.0737991333007812, + "learning_rate": 5.900769661584272e-05, + "loss": 0.4188, + "step": 6711 + }, + { + "epoch": 2.155081072403275, + "grad_norm": 0.6433249711990356, + "learning_rate": 5.8975860115481487e-05, + "loss": 0.2853, + "step": 6712 + }, + { + "epoch": 2.1554021512281265, + "grad_norm": 0.8667881488800049, + "learning_rate": 5.8944028613787206e-05, + "loss": 0.3175, + "step": 6713 + }, + { + "epoch": 2.155723230052978, + "grad_norm": 1.061166524887085, + "learning_rate": 5.8912202114638524e-05, + "loss": 0.4216, + "step": 6714 + }, + { + "epoch": 2.1560443088778296, + "grad_norm": 0.9208434224128723, + "learning_rate": 5.88803806219134e-05, + "loss": 0.3846, + "step": 6715 + }, + { + "epoch": 2.156365387702681, + "grad_norm": 0.6994331479072571, + "learning_rate": 5.884856413948913e-05, + "loss": 0.2805, + "step": 6716 + }, + { + "epoch": 2.1566864665275327, + "grad_norm": 0.9039559960365295, + "learning_rate": 5.881675267124254e-05, + "loss": 0.3893, + "step": 6717 + }, + { + "epoch": 2.157007545352384, + "grad_norm": 0.8214314579963684, + "learning_rate": 5.8784946221049666e-05, + "loss": 0.3455, + "step": 6718 + }, + { + "epoch": 2.1573286241772354, + "grad_norm": 0.5890683531761169, + "learning_rate": 5.8753144792786096e-05, + "loss": 0.2904, + "step": 6719 + }, + { + "epoch": 2.157649703002087, + "grad_norm": 0.8071637749671936, + "learning_rate": 5.87213483903267e-05, + "loss": 0.3424, + "step": 6720 + }, + { + "epoch": 2.1579707818269385, + "grad_norm": 0.5049499869346619, + "learning_rate": 5.868955701754584e-05, + "loss": 0.2401, + "step": 6721 + }, + { + "epoch": 2.15829186065179, + "grad_norm": 0.7438219785690308, + "learning_rate": 5.86577706783171e-05, + "loss": 0.2888, + "step": 6722 + }, + { + "epoch": 2.1586129394766416, + "grad_norm": 0.5607714056968689, + "learning_rate": 5.862598937651365e-05, + "loss": 0.28, + "step": 6723 + }, + { + "epoch": 2.158934018301493, + "grad_norm": 0.6200205087661743, + "learning_rate": 5.8594213116007855e-05, + "loss": 0.3038, + "step": 6724 + }, + { + "epoch": 2.1592550971263447, + "grad_norm": 0.9494879841804504, + "learning_rate": 5.856244190067159e-05, + "loss": 0.3483, + "step": 6725 + }, + { + "epoch": 2.159576175951196, + "grad_norm": 0.9687539339065552, + "learning_rate": 5.853067573437612e-05, + "loss": 0.3126, + "step": 6726 + }, + { + "epoch": 2.1598972547760473, + "grad_norm": 0.35860729217529297, + "learning_rate": 5.849891462099198e-05, + "loss": 0.2379, + "step": 6727 + }, + { + "epoch": 2.160218333600899, + "grad_norm": 0.4721701443195343, + "learning_rate": 5.8467158564389234e-05, + "loss": 0.2996, + "step": 6728 + }, + { + "epoch": 2.1605394124257504, + "grad_norm": 2.956984758377075, + "learning_rate": 5.8435407568437216e-05, + "loss": 0.2978, + "step": 6729 + }, + { + "epoch": 2.160860491250602, + "grad_norm": 0.5004953742027283, + "learning_rate": 5.8403661637004736e-05, + "loss": 0.6088, + "step": 6730 + }, + { + "epoch": 2.1611815700754535, + "grad_norm": 0.43807387351989746, + "learning_rate": 5.83719207739599e-05, + "loss": 0.6434, + "step": 6731 + }, + { + "epoch": 2.161502648900305, + "grad_norm": 0.5199769139289856, + "learning_rate": 5.834018498317024e-05, + "loss": 0.5589, + "step": 6732 + }, + { + "epoch": 2.1618237277251566, + "grad_norm": 0.46276336908340454, + "learning_rate": 5.8308454268502675e-05, + "loss": 0.2407, + "step": 6733 + }, + { + "epoch": 2.162144806550008, + "grad_norm": 0.47607436776161194, + "learning_rate": 5.82767286338235e-05, + "loss": 0.2192, + "step": 6734 + }, + { + "epoch": 2.1624658853748597, + "grad_norm": 0.36103343963623047, + "learning_rate": 5.8245008082998364e-05, + "loss": 0.0913, + "step": 6735 + }, + { + "epoch": 2.162786964199711, + "grad_norm": 0.6850721836090088, + "learning_rate": 5.82132926198923e-05, + "loss": 0.353, + "step": 6736 + }, + { + "epoch": 2.1631080430245624, + "grad_norm": 0.7700496315956116, + "learning_rate": 5.818158224836987e-05, + "loss": 0.4083, + "step": 6737 + }, + { + "epoch": 2.163429121849414, + "grad_norm": 0.7721123695373535, + "learning_rate": 5.814987697229471e-05, + "loss": 0.382, + "step": 6738 + }, + { + "epoch": 2.1637502006742655, + "grad_norm": 0.7945840954780579, + "learning_rate": 5.8118176795530176e-05, + "loss": 0.3534, + "step": 6739 + }, + { + "epoch": 2.164071279499117, + "grad_norm": 0.5969064831733704, + "learning_rate": 5.808648172193868e-05, + "loss": 0.2527, + "step": 6740 + }, + { + "epoch": 2.1643923583239686, + "grad_norm": 0.9726594686508179, + "learning_rate": 5.805479175538229e-05, + "loss": 0.3638, + "step": 6741 + }, + { + "epoch": 2.16471343714882, + "grad_norm": 0.7358959913253784, + "learning_rate": 5.802310689972233e-05, + "loss": 0.3867, + "step": 6742 + }, + { + "epoch": 2.1650345159736717, + "grad_norm": 0.7400618195533752, + "learning_rate": 5.799142715881938e-05, + "loss": 0.2788, + "step": 6743 + }, + { + "epoch": 2.1653555947985232, + "grad_norm": 0.8120781779289246, + "learning_rate": 5.795975253653364e-05, + "loss": 0.3746, + "step": 6744 + }, + { + "epoch": 2.1656766736233743, + "grad_norm": 0.7224815487861633, + "learning_rate": 5.792808303672454e-05, + "loss": 0.3336, + "step": 6745 + }, + { + "epoch": 2.165997752448226, + "grad_norm": 0.7844706773757935, + "learning_rate": 5.789641866325091e-05, + "loss": 0.3321, + "step": 6746 + }, + { + "epoch": 2.1663188312730774, + "grad_norm": 1.100580096244812, + "learning_rate": 5.786475941997094e-05, + "loss": 0.3908, + "step": 6747 + }, + { + "epoch": 2.166639910097929, + "grad_norm": 0.7362777590751648, + "learning_rate": 5.783310531074223e-05, + "loss": 0.3151, + "step": 6748 + }, + { + "epoch": 2.1669609889227806, + "grad_norm": 0.9936489462852478, + "learning_rate": 5.780145633942173e-05, + "loss": 0.401, + "step": 6749 + }, + { + "epoch": 2.167282067747632, + "grad_norm": 0.7832114696502686, + "learning_rate": 5.7769812509865773e-05, + "loss": 0.3097, + "step": 6750 + }, + { + "epoch": 2.1676031465724837, + "grad_norm": 0.7143785357475281, + "learning_rate": 5.773817382593008e-05, + "loss": 0.2509, + "step": 6751 + }, + { + "epoch": 2.167924225397335, + "grad_norm": 0.9188450574874878, + "learning_rate": 5.770654029146969e-05, + "loss": 0.4234, + "step": 6752 + }, + { + "epoch": 2.1682453042221868, + "grad_norm": 0.7946859002113342, + "learning_rate": 5.7674911910339094e-05, + "loss": 0.28, + "step": 6753 + }, + { + "epoch": 2.168566383047038, + "grad_norm": 1.0123013257980347, + "learning_rate": 5.764328868639208e-05, + "loss": 0.3555, + "step": 6754 + }, + { + "epoch": 2.1688874618718894, + "grad_norm": 1.0456223487854004, + "learning_rate": 5.7611670623481864e-05, + "loss": 0.28, + "step": 6755 + }, + { + "epoch": 2.169208540696741, + "grad_norm": 0.765620231628418, + "learning_rate": 5.758005772546097e-05, + "loss": 0.3154, + "step": 6756 + }, + { + "epoch": 2.1695296195215925, + "grad_norm": 0.8351200222969055, + "learning_rate": 5.754844999618144e-05, + "loss": 0.3056, + "step": 6757 + }, + { + "epoch": 2.169850698346444, + "grad_norm": 0.7080745697021484, + "learning_rate": 5.7516847439494435e-05, + "loss": 0.2799, + "step": 6758 + }, + { + "epoch": 2.1701717771712956, + "grad_norm": 0.7718917727470398, + "learning_rate": 5.748525005925074e-05, + "loss": 0.3316, + "step": 6759 + }, + { + "epoch": 2.170492855996147, + "grad_norm": 0.6026411652565002, + "learning_rate": 5.7453657859300415e-05, + "loss": 0.2607, + "step": 6760 + }, + { + "epoch": 2.1708139348209987, + "grad_norm": 1.0333638191223145, + "learning_rate": 5.7422070843492734e-05, + "loss": 0.2856, + "step": 6761 + }, + { + "epoch": 2.1711350136458503, + "grad_norm": 0.8613486289978027, + "learning_rate": 5.739048901567665e-05, + "loss": 0.3556, + "step": 6762 + }, + { + "epoch": 2.1714560924707014, + "grad_norm": 0.7801258563995361, + "learning_rate": 5.735891237970015e-05, + "loss": 0.3481, + "step": 6763 + }, + { + "epoch": 2.171777171295553, + "grad_norm": 0.8479055762290955, + "learning_rate": 5.732734093941087e-05, + "loss": 0.3521, + "step": 6764 + }, + { + "epoch": 2.1720982501204045, + "grad_norm": 0.8801478743553162, + "learning_rate": 5.7295774698655655e-05, + "loss": 0.3246, + "step": 6765 + }, + { + "epoch": 2.172419328945256, + "grad_norm": 0.860466718673706, + "learning_rate": 5.7264213661280765e-05, + "loss": 0.3524, + "step": 6766 + }, + { + "epoch": 2.1727404077701076, + "grad_norm": 0.7384558320045471, + "learning_rate": 5.723265783113181e-05, + "loss": 0.2897, + "step": 6767 + }, + { + "epoch": 2.173061486594959, + "grad_norm": 0.9084320068359375, + "learning_rate": 5.720110721205376e-05, + "loss": 0.3288, + "step": 6768 + }, + { + "epoch": 2.1733825654198107, + "grad_norm": 0.7406836152076721, + "learning_rate": 5.716956180789098e-05, + "loss": 0.3548, + "step": 6769 + }, + { + "epoch": 2.1737036442446622, + "grad_norm": 0.8314934372901917, + "learning_rate": 5.713802162248718e-05, + "loss": 0.3294, + "step": 6770 + }, + { + "epoch": 2.1740247230695138, + "grad_norm": 1.2621833086013794, + "learning_rate": 5.710648665968543e-05, + "loss": 0.4228, + "step": 6771 + }, + { + "epoch": 2.174345801894365, + "grad_norm": 0.7521808743476868, + "learning_rate": 5.707495692332816e-05, + "loss": 0.2837, + "step": 6772 + }, + { + "epoch": 2.1746668807192164, + "grad_norm": 1.019662857055664, + "learning_rate": 5.704343241725719e-05, + "loss": 0.3995, + "step": 6773 + }, + { + "epoch": 2.174987959544068, + "grad_norm": 0.661185085773468, + "learning_rate": 5.701191314531364e-05, + "loss": 0.2683, + "step": 6774 + }, + { + "epoch": 2.1753090383689195, + "grad_norm": 0.9004967212677002, + "learning_rate": 5.6980399111338156e-05, + "loss": 0.3288, + "step": 6775 + }, + { + "epoch": 2.175630117193771, + "grad_norm": 0.5190272927284241, + "learning_rate": 5.694889031917047e-05, + "loss": 0.2803, + "step": 6776 + }, + { + "epoch": 2.1759511960186226, + "grad_norm": 0.6035006642341614, + "learning_rate": 5.691738677265e-05, + "loss": 0.2716, + "step": 6777 + }, + { + "epoch": 2.176272274843474, + "grad_norm": 0.888232409954071, + "learning_rate": 5.6885888475615204e-05, + "loss": 0.3035, + "step": 6778 + }, + { + "epoch": 2.1765933536683257, + "grad_norm": 0.47930851578712463, + "learning_rate": 5.6854395431904094e-05, + "loss": 0.255, + "step": 6779 + }, + { + "epoch": 2.1769144324931773, + "grad_norm": 0.3152773678302765, + "learning_rate": 5.68229076453541e-05, + "loss": 0.3738, + "step": 6780 + }, + { + "epoch": 2.1772355113180284, + "grad_norm": 0.5532098412513733, + "learning_rate": 5.679142511980175e-05, + "loss": 0.7954, + "step": 6781 + }, + { + "epoch": 2.17755659014288, + "grad_norm": 0.48920655250549316, + "learning_rate": 5.675994785908329e-05, + "loss": 0.4281, + "step": 6782 + }, + { + "epoch": 2.1778776689677315, + "grad_norm": 0.5712006688117981, + "learning_rate": 5.6728475867033925e-05, + "loss": 0.255, + "step": 6783 + }, + { + "epoch": 2.178198747792583, + "grad_norm": 0.6813241839408875, + "learning_rate": 5.669700914748857e-05, + "loss": 0.1684, + "step": 6784 + }, + { + "epoch": 2.1785198266174346, + "grad_norm": 0.3276176452636719, + "learning_rate": 5.666554770428129e-05, + "loss": 0.0962, + "step": 6785 + }, + { + "epoch": 2.178840905442286, + "grad_norm": 0.8344230651855469, + "learning_rate": 5.663409154124557e-05, + "loss": 0.3953, + "step": 6786 + }, + { + "epoch": 2.1791619842671377, + "grad_norm": 0.9453107714653015, + "learning_rate": 5.6602640662214256e-05, + "loss": 0.4333, + "step": 6787 + }, + { + "epoch": 2.1794830630919892, + "grad_norm": 0.737591564655304, + "learning_rate": 5.657119507101954e-05, + "loss": 0.3033, + "step": 6788 + }, + { + "epoch": 2.179804141916841, + "grad_norm": 0.7458065748214722, + "learning_rate": 5.653975477149298e-05, + "loss": 0.2871, + "step": 6789 + }, + { + "epoch": 2.180125220741692, + "grad_norm": 0.9435691833496094, + "learning_rate": 5.6508319767465465e-05, + "loss": 0.4635, + "step": 6790 + }, + { + "epoch": 2.1804462995665435, + "grad_norm": 0.611143946647644, + "learning_rate": 5.647689006276726e-05, + "loss": 0.2361, + "step": 6791 + }, + { + "epoch": 2.180767378391395, + "grad_norm": 0.6122483611106873, + "learning_rate": 5.6445465661227994e-05, + "loss": 0.3083, + "step": 6792 + }, + { + "epoch": 2.1810884572162466, + "grad_norm": 0.9717581272125244, + "learning_rate": 5.641404656667661e-05, + "loss": 0.4157, + "step": 6793 + }, + { + "epoch": 2.181409536041098, + "grad_norm": 0.8596290349960327, + "learning_rate": 5.6382632782941405e-05, + "loss": 0.3547, + "step": 6794 + }, + { + "epoch": 2.1817306148659497, + "grad_norm": 0.9642032384872437, + "learning_rate": 5.635122431385016e-05, + "loss": 0.3883, + "step": 6795 + }, + { + "epoch": 2.182051693690801, + "grad_norm": 0.8566706776618958, + "learning_rate": 5.63198211632298e-05, + "loss": 0.395, + "step": 6796 + }, + { + "epoch": 2.1823727725156528, + "grad_norm": 0.8098425269126892, + "learning_rate": 5.6288423334906735e-05, + "loss": 0.3301, + "step": 6797 + }, + { + "epoch": 2.1826938513405043, + "grad_norm": 0.8384745121002197, + "learning_rate": 5.6257030832706695e-05, + "loss": 0.3198, + "step": 6798 + }, + { + "epoch": 2.1830149301653554, + "grad_norm": 0.8153161406517029, + "learning_rate": 5.622564366045472e-05, + "loss": 0.3101, + "step": 6799 + }, + { + "epoch": 2.183336008990207, + "grad_norm": 0.8854767084121704, + "learning_rate": 5.619426182197536e-05, + "loss": 0.3009, + "step": 6800 + }, + { + "epoch": 2.1836570878150585, + "grad_norm": 0.9710304737091064, + "learning_rate": 5.616288532109225e-05, + "loss": 0.3954, + "step": 6801 + }, + { + "epoch": 2.18397816663991, + "grad_norm": 0.9379624128341675, + "learning_rate": 5.6131514161628626e-05, + "loss": 0.3856, + "step": 6802 + }, + { + "epoch": 2.1842992454647616, + "grad_norm": 1.2100353240966797, + "learning_rate": 5.610014834740693e-05, + "loss": 0.5041, + "step": 6803 + }, + { + "epoch": 2.184620324289613, + "grad_norm": 0.8026917576789856, + "learning_rate": 5.6068787882249005e-05, + "loss": 0.28, + "step": 6804 + }, + { + "epoch": 2.1849414031144647, + "grad_norm": 0.776182234287262, + "learning_rate": 5.6037432769976064e-05, + "loss": 0.2885, + "step": 6805 + }, + { + "epoch": 2.1852624819393163, + "grad_norm": 0.9188643097877502, + "learning_rate": 5.6006083014408484e-05, + "loss": 0.4413, + "step": 6806 + }, + { + "epoch": 2.185583560764168, + "grad_norm": 0.9212349057197571, + "learning_rate": 5.5974738619366295e-05, + "loss": 0.4209, + "step": 6807 + }, + { + "epoch": 2.185904639589019, + "grad_norm": 1.0901522636413574, + "learning_rate": 5.594339958866867e-05, + "loss": 0.3755, + "step": 6808 + }, + { + "epoch": 2.1862257184138705, + "grad_norm": 1.0959854125976562, + "learning_rate": 5.591206592613416e-05, + "loss": 0.3334, + "step": 6809 + }, + { + "epoch": 2.186546797238722, + "grad_norm": 0.8223209381103516, + "learning_rate": 5.588073763558068e-05, + "loss": 0.3331, + "step": 6810 + }, + { + "epoch": 2.1868678760635736, + "grad_norm": 0.9825518131256104, + "learning_rate": 5.584941472082549e-05, + "loss": 0.3669, + "step": 6811 + }, + { + "epoch": 2.187188954888425, + "grad_norm": 0.7833166122436523, + "learning_rate": 5.5818097185685206e-05, + "loss": 0.3309, + "step": 6812 + }, + { + "epoch": 2.1875100337132767, + "grad_norm": 0.5879184603691101, + "learning_rate": 5.578678503397574e-05, + "loss": 0.2564, + "step": 6813 + }, + { + "epoch": 2.1878311125381282, + "grad_norm": 0.9265559315681458, + "learning_rate": 5.575547826951242e-05, + "loss": 0.3454, + "step": 6814 + }, + { + "epoch": 2.18815219136298, + "grad_norm": 0.9046943187713623, + "learning_rate": 5.572417689610987e-05, + "loss": 0.3133, + "step": 6815 + }, + { + "epoch": 2.1884732701878313, + "grad_norm": 1.2076610326766968, + "learning_rate": 5.5692880917582046e-05, + "loss": 0.4036, + "step": 6816 + }, + { + "epoch": 2.1887943490126824, + "grad_norm": 0.8415603637695312, + "learning_rate": 5.566159033774225e-05, + "loss": 0.3282, + "step": 6817 + }, + { + "epoch": 2.189115427837534, + "grad_norm": 0.725054144859314, + "learning_rate": 5.5630305160403275e-05, + "loss": 0.2602, + "step": 6818 + }, + { + "epoch": 2.1894365066623855, + "grad_norm": 0.7742078900337219, + "learning_rate": 5.5599025389376935e-05, + "loss": 0.3131, + "step": 6819 + }, + { + "epoch": 2.189757585487237, + "grad_norm": 0.7798224091529846, + "learning_rate": 5.556775102847475e-05, + "loss": 0.3724, + "step": 6820 + }, + { + "epoch": 2.1900786643120886, + "grad_norm": 0.964368462562561, + "learning_rate": 5.553648208150728e-05, + "loss": 0.3715, + "step": 6821 + }, + { + "epoch": 2.19039974313694, + "grad_norm": 1.1923866271972656, + "learning_rate": 5.5505218552284565e-05, + "loss": 0.3996, + "step": 6822 + }, + { + "epoch": 2.1907208219617917, + "grad_norm": 0.852208137512207, + "learning_rate": 5.547396044461608e-05, + "loss": 0.3929, + "step": 6823 + }, + { + "epoch": 2.1910419007866433, + "grad_norm": 0.5591673254966736, + "learning_rate": 5.544270776231038e-05, + "loss": 0.2536, + "step": 6824 + }, + { + "epoch": 2.191362979611495, + "grad_norm": 0.6292114853858948, + "learning_rate": 5.541146050917561e-05, + "loss": 0.3284, + "step": 6825 + }, + { + "epoch": 2.191684058436346, + "grad_norm": 0.8230541944503784, + "learning_rate": 5.5380218689019125e-05, + "loss": 0.3711, + "step": 6826 + }, + { + "epoch": 2.1920051372611975, + "grad_norm": 0.500881016254425, + "learning_rate": 5.5348982305647643e-05, + "loss": 0.2666, + "step": 6827 + }, + { + "epoch": 2.192326216086049, + "grad_norm": 0.8684194087982178, + "learning_rate": 5.5317751362867234e-05, + "loss": 0.3415, + "step": 6828 + }, + { + "epoch": 2.1926472949109006, + "grad_norm": 1.012944221496582, + "learning_rate": 5.5286525864483285e-05, + "loss": 0.3696, + "step": 6829 + }, + { + "epoch": 2.192968373735752, + "grad_norm": 0.5349951386451721, + "learning_rate": 5.525530581430054e-05, + "loss": 0.6524, + "step": 6830 + }, + { + "epoch": 2.1932894525606037, + "grad_norm": 0.5182074308395386, + "learning_rate": 5.522409121612304e-05, + "loss": 0.5625, + "step": 6831 + }, + { + "epoch": 2.1936105313854553, + "grad_norm": 0.450907438993454, + "learning_rate": 5.519288207375422e-05, + "loss": 0.2453, + "step": 6832 + }, + { + "epoch": 2.193931610210307, + "grad_norm": 0.4998016655445099, + "learning_rate": 5.5161678390996796e-05, + "loss": 0.1582, + "step": 6833 + }, + { + "epoch": 2.1942526890351584, + "grad_norm": 0.7120688557624817, + "learning_rate": 5.513048017165284e-05, + "loss": 0.2156, + "step": 6834 + }, + { + "epoch": 2.1945737678600095, + "grad_norm": 0.4577905237674713, + "learning_rate": 5.509928741952379e-05, + "loss": 0.203, + "step": 6835 + }, + { + "epoch": 2.194894846684861, + "grad_norm": 0.744748055934906, + "learning_rate": 5.506810013841036e-05, + "loss": 0.377, + "step": 6836 + }, + { + "epoch": 2.1952159255097126, + "grad_norm": 1.0785094499588013, + "learning_rate": 5.50369183321126e-05, + "loss": 0.4875, + "step": 6837 + }, + { + "epoch": 2.195537004334564, + "grad_norm": 0.8972413539886475, + "learning_rate": 5.5005742004430025e-05, + "loss": 0.398, + "step": 6838 + }, + { + "epoch": 2.1958580831594157, + "grad_norm": 0.7619357705116272, + "learning_rate": 5.497457115916127e-05, + "loss": 0.3321, + "step": 6839 + }, + { + "epoch": 2.196179161984267, + "grad_norm": 0.711033046245575, + "learning_rate": 5.494340580010441e-05, + "loss": 0.3118, + "step": 6840 + }, + { + "epoch": 2.1965002408091188, + "grad_norm": 0.715796172618866, + "learning_rate": 5.491224593105695e-05, + "loss": 0.3553, + "step": 6841 + }, + { + "epoch": 2.1968213196339703, + "grad_norm": 0.6636841297149658, + "learning_rate": 5.488109155581549e-05, + "loss": 0.2737, + "step": 6842 + }, + { + "epoch": 2.197142398458822, + "grad_norm": 0.918535590171814, + "learning_rate": 5.484994267817624e-05, + "loss": 0.284, + "step": 6843 + }, + { + "epoch": 2.197463477283673, + "grad_norm": 0.7877380847930908, + "learning_rate": 5.481879930193443e-05, + "loss": 0.3617, + "step": 6844 + }, + { + "epoch": 2.1977845561085245, + "grad_norm": 0.9507259130477905, + "learning_rate": 5.478766143088492e-05, + "loss": 0.3557, + "step": 6845 + }, + { + "epoch": 2.198105634933376, + "grad_norm": 0.8569693565368652, + "learning_rate": 5.475652906882173e-05, + "loss": 0.3243, + "step": 6846 + }, + { + "epoch": 2.1984267137582276, + "grad_norm": 0.8883730173110962, + "learning_rate": 5.4725402219538236e-05, + "loss": 0.3438, + "step": 6847 + }, + { + "epoch": 2.198747792583079, + "grad_norm": 0.8322269320487976, + "learning_rate": 5.469428088682717e-05, + "loss": 0.3581, + "step": 6848 + }, + { + "epoch": 2.1990688714079307, + "grad_norm": 0.8799639344215393, + "learning_rate": 5.466316507448048e-05, + "loss": 0.3936, + "step": 6849 + }, + { + "epoch": 2.1993899502327823, + "grad_norm": 1.0511608123779297, + "learning_rate": 5.4632054786289656e-05, + "loss": 0.4143, + "step": 6850 + }, + { + "epoch": 2.199711029057634, + "grad_norm": 0.8037779927253723, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.3097, + "step": 6851 + }, + { + "epoch": 2.2000321078824854, + "grad_norm": 0.8404053449630737, + "learning_rate": 5.4569850797537536e-05, + "loss": 0.3181, + "step": 6852 + }, + { + "epoch": 2.2003531867073365, + "grad_norm": 1.0227787494659424, + "learning_rate": 5.4538757104555615e-05, + "loss": 0.3921, + "step": 6853 + }, + { + "epoch": 2.200674265532188, + "grad_norm": 0.5781304240226746, + "learning_rate": 5.4507668950888245e-05, + "loss": 0.2198, + "step": 6854 + }, + { + "epoch": 2.2009953443570396, + "grad_norm": 0.811543881893158, + "learning_rate": 5.447658634032338e-05, + "loss": 0.3287, + "step": 6855 + }, + { + "epoch": 2.201316423181891, + "grad_norm": 0.7216514945030212, + "learning_rate": 5.4445509276648466e-05, + "loss": 0.3109, + "step": 6856 + }, + { + "epoch": 2.2016375020067427, + "grad_norm": 0.7720044851303101, + "learning_rate": 5.441443776365003e-05, + "loss": 0.3122, + "step": 6857 + }, + { + "epoch": 2.2019585808315942, + "grad_norm": 0.8112689852714539, + "learning_rate": 5.438337180511406e-05, + "loss": 0.3681, + "step": 6858 + }, + { + "epoch": 2.202279659656446, + "grad_norm": 0.9748257398605347, + "learning_rate": 5.435231140482587e-05, + "loss": 0.4623, + "step": 6859 + }, + { + "epoch": 2.2026007384812973, + "grad_norm": 0.9538961052894592, + "learning_rate": 5.4321256566570036e-05, + "loss": 0.4044, + "step": 6860 + }, + { + "epoch": 2.202921817306149, + "grad_norm": 0.7297887206077576, + "learning_rate": 5.4290207294130615e-05, + "loss": 0.2882, + "step": 6861 + }, + { + "epoch": 2.203242896131, + "grad_norm": 0.8942319750785828, + "learning_rate": 5.42591635912907e-05, + "loss": 0.3243, + "step": 6862 + }, + { + "epoch": 2.2035639749558515, + "grad_norm": 0.6959256529808044, + "learning_rate": 5.4228125461833024e-05, + "loss": 0.2407, + "step": 6863 + }, + { + "epoch": 2.203885053780703, + "grad_norm": 0.8571872115135193, + "learning_rate": 5.419709290953936e-05, + "loss": 0.3134, + "step": 6864 + }, + { + "epoch": 2.2042061326055546, + "grad_norm": 0.8868092894554138, + "learning_rate": 5.416606593819101e-05, + "loss": 0.3696, + "step": 6865 + }, + { + "epoch": 2.204527211430406, + "grad_norm": 1.1420269012451172, + "learning_rate": 5.4135044551568546e-05, + "loss": 0.338, + "step": 6866 + }, + { + "epoch": 2.2048482902552577, + "grad_norm": 0.7926510572433472, + "learning_rate": 5.4104028753451696e-05, + "loss": 0.3127, + "step": 6867 + }, + { + "epoch": 2.2051693690801093, + "grad_norm": 0.7856837511062622, + "learning_rate": 5.407301854761977e-05, + "loss": 0.2912, + "step": 6868 + }, + { + "epoch": 2.205490447904961, + "grad_norm": 0.9291871786117554, + "learning_rate": 5.404201393785122e-05, + "loss": 0.3568, + "step": 6869 + }, + { + "epoch": 2.2058115267298124, + "grad_norm": 0.7121933102607727, + "learning_rate": 5.401101492792386e-05, + "loss": 0.3451, + "step": 6870 + }, + { + "epoch": 2.2061326055546635, + "grad_norm": 0.6835899949073792, + "learning_rate": 5.398002152161484e-05, + "loss": 0.284, + "step": 6871 + }, + { + "epoch": 2.206453684379515, + "grad_norm": 0.9216342568397522, + "learning_rate": 5.394903372270062e-05, + "loss": 0.357, + "step": 6872 + }, + { + "epoch": 2.2067747632043666, + "grad_norm": 0.6000509262084961, + "learning_rate": 5.3918051534956926e-05, + "loss": 0.2943, + "step": 6873 + }, + { + "epoch": 2.207095842029218, + "grad_norm": 1.0322999954223633, + "learning_rate": 5.388707496215888e-05, + "loss": 0.3784, + "step": 6874 + }, + { + "epoch": 2.2074169208540697, + "grad_norm": 0.9377049803733826, + "learning_rate": 5.3856104008080876e-05, + "loss": 0.3036, + "step": 6875 + }, + { + "epoch": 2.2077379996789213, + "grad_norm": 0.6841163039207458, + "learning_rate": 5.382513867649663e-05, + "loss": 0.2938, + "step": 6876 + }, + { + "epoch": 2.208059078503773, + "grad_norm": 0.9732836484909058, + "learning_rate": 5.3794178971179165e-05, + "loss": 0.377, + "step": 6877 + }, + { + "epoch": 2.2083801573286244, + "grad_norm": 0.5027754306793213, + "learning_rate": 5.3763224895900846e-05, + "loss": 0.2797, + "step": 6878 + }, + { + "epoch": 2.208701236153476, + "grad_norm": 0.867052435874939, + "learning_rate": 5.373227645443332e-05, + "loss": 0.3347, + "step": 6879 + }, + { + "epoch": 2.209022314978327, + "grad_norm": 0.4138392210006714, + "learning_rate": 5.3701333650547525e-05, + "loss": 0.4415, + "step": 6880 + }, + { + "epoch": 2.2093433938031786, + "grad_norm": 0.4743240475654602, + "learning_rate": 5.3670396488013854e-05, + "loss": 0.4268, + "step": 6881 + }, + { + "epoch": 2.20966447262803, + "grad_norm": 0.6370947957038879, + "learning_rate": 5.3639464970601775e-05, + "loss": 0.5011, + "step": 6882 + }, + { + "epoch": 2.2099855514528817, + "grad_norm": 0.6120859980583191, + "learning_rate": 5.360853910208028e-05, + "loss": 0.4556, + "step": 6883 + }, + { + "epoch": 2.210306630277733, + "grad_norm": 0.5497114062309265, + "learning_rate": 5.357761888621764e-05, + "loss": 0.1802, + "step": 6884 + }, + { + "epoch": 2.2106277091025848, + "grad_norm": 0.7469709515571594, + "learning_rate": 5.3546704326781236e-05, + "loss": 0.1502, + "step": 6885 + }, + { + "epoch": 2.2109487879274363, + "grad_norm": 0.6369648575782776, + "learning_rate": 5.351579542753807e-05, + "loss": 0.2719, + "step": 6886 + }, + { + "epoch": 2.211269866752288, + "grad_norm": 0.8421298861503601, + "learning_rate": 5.348489219225416e-05, + "loss": 0.4426, + "step": 6887 + }, + { + "epoch": 2.2115909455771394, + "grad_norm": 0.9989409446716309, + "learning_rate": 5.345399462469509e-05, + "loss": 0.4848, + "step": 6888 + }, + { + "epoch": 2.2119120244019905, + "grad_norm": 0.9956933259963989, + "learning_rate": 5.3423102728625574e-05, + "loss": 0.4123, + "step": 6889 + }, + { + "epoch": 2.212233103226842, + "grad_norm": 0.6040937900543213, + "learning_rate": 5.3392216507809714e-05, + "loss": 0.2475, + "step": 6890 + }, + { + "epoch": 2.2125541820516936, + "grad_norm": 0.8480896353721619, + "learning_rate": 5.33613359660109e-05, + "loss": 0.3257, + "step": 6891 + }, + { + "epoch": 2.212875260876545, + "grad_norm": 0.824813961982727, + "learning_rate": 5.33304611069918e-05, + "loss": 0.3541, + "step": 6892 + }, + { + "epoch": 2.2131963397013967, + "grad_norm": 1.2338895797729492, + "learning_rate": 5.329959193451448e-05, + "loss": 0.347, + "step": 6893 + }, + { + "epoch": 2.2135174185262483, + "grad_norm": 0.9513993263244629, + "learning_rate": 5.326872845234021e-05, + "loss": 0.3895, + "step": 6894 + }, + { + "epoch": 2.2138384973511, + "grad_norm": 0.9307944774627686, + "learning_rate": 5.3237870664229636e-05, + "loss": 0.4291, + "step": 6895 + }, + { + "epoch": 2.2141595761759514, + "grad_norm": 0.8363532423973083, + "learning_rate": 5.320701857394268e-05, + "loss": 0.3269, + "step": 6896 + }, + { + "epoch": 2.2144806550008025, + "grad_norm": 0.873292863368988, + "learning_rate": 5.317617218523856e-05, + "loss": 0.4286, + "step": 6897 + }, + { + "epoch": 2.214801733825654, + "grad_norm": 0.7528856992721558, + "learning_rate": 5.3145331501875796e-05, + "loss": 0.3506, + "step": 6898 + }, + { + "epoch": 2.2151228126505056, + "grad_norm": 0.7959845662117004, + "learning_rate": 5.311449652761235e-05, + "loss": 0.3377, + "step": 6899 + }, + { + "epoch": 2.215443891475357, + "grad_norm": 0.954590916633606, + "learning_rate": 5.3083667266205194e-05, + "loss": 0.3659, + "step": 6900 + }, + { + "epoch": 2.2157649703002087, + "grad_norm": 0.9657596945762634, + "learning_rate": 5.305284372141095e-05, + "loss": 0.437, + "step": 6901 + }, + { + "epoch": 2.2160860491250602, + "grad_norm": 0.9250769019126892, + "learning_rate": 5.302202589698525e-05, + "loss": 0.3796, + "step": 6902 + }, + { + "epoch": 2.216407127949912, + "grad_norm": 0.8130130171775818, + "learning_rate": 5.299121379668316e-05, + "loss": 0.286, + "step": 6903 + }, + { + "epoch": 2.2167282067747633, + "grad_norm": 0.8332054018974304, + "learning_rate": 5.296040742425916e-05, + "loss": 0.3795, + "step": 6904 + }, + { + "epoch": 2.217049285599615, + "grad_norm": 0.786918580532074, + "learning_rate": 5.292960678346675e-05, + "loss": 0.3355, + "step": 6905 + }, + { + "epoch": 2.217370364424466, + "grad_norm": 1.060431718826294, + "learning_rate": 5.2898811878059e-05, + "loss": 0.4117, + "step": 6906 + }, + { + "epoch": 2.2176914432493176, + "grad_norm": 0.7190358638763428, + "learning_rate": 5.286802271178815e-05, + "loss": 0.2718, + "step": 6907 + }, + { + "epoch": 2.218012522074169, + "grad_norm": 1.1578835248947144, + "learning_rate": 5.2837239288405784e-05, + "loss": 0.4461, + "step": 6908 + }, + { + "epoch": 2.2183336008990207, + "grad_norm": 0.8902946710586548, + "learning_rate": 5.2806461611662735e-05, + "loss": 0.3296, + "step": 6909 + }, + { + "epoch": 2.218654679723872, + "grad_norm": 0.8117751479148865, + "learning_rate": 5.277568968530919e-05, + "loss": 0.3455, + "step": 6910 + }, + { + "epoch": 2.2189757585487238, + "grad_norm": 0.6965182423591614, + "learning_rate": 5.274492351309461e-05, + "loss": 0.2972, + "step": 6911 + }, + { + "epoch": 2.2192968373735753, + "grad_norm": 0.7272025942802429, + "learning_rate": 5.271416309876776e-05, + "loss": 0.2932, + "step": 6912 + }, + { + "epoch": 2.219617916198427, + "grad_norm": 0.7494564056396484, + "learning_rate": 5.26834084460767e-05, + "loss": 0.2976, + "step": 6913 + }, + { + "epoch": 2.2199389950232784, + "grad_norm": 0.7828695774078369, + "learning_rate": 5.265265955876879e-05, + "loss": 0.2948, + "step": 6914 + }, + { + "epoch": 2.2202600738481295, + "grad_norm": 0.8334441781044006, + "learning_rate": 5.2621916440590715e-05, + "loss": 0.3339, + "step": 6915 + }, + { + "epoch": 2.220581152672981, + "grad_norm": 0.6018253564834595, + "learning_rate": 5.259117909528839e-05, + "loss": 0.265, + "step": 6916 + }, + { + "epoch": 2.2209022314978326, + "grad_norm": 1.2306244373321533, + "learning_rate": 5.256044752660709e-05, + "loss": 0.2778, + "step": 6917 + }, + { + "epoch": 2.221223310322684, + "grad_norm": 1.4875001907348633, + "learning_rate": 5.2529721738291315e-05, + "loss": 0.3668, + "step": 6918 + }, + { + "epoch": 2.2215443891475357, + "grad_norm": 1.1924291849136353, + "learning_rate": 5.2499001734085044e-05, + "loss": 0.4159, + "step": 6919 + }, + { + "epoch": 2.2218654679723873, + "grad_norm": 0.8212143182754517, + "learning_rate": 5.2468287517731276e-05, + "loss": 0.3293, + "step": 6920 + }, + { + "epoch": 2.222186546797239, + "grad_norm": 0.7851700782775879, + "learning_rate": 5.243757909297247e-05, + "loss": 0.3297, + "step": 6921 + }, + { + "epoch": 2.2225076256220904, + "grad_norm": 0.7045104503631592, + "learning_rate": 5.2406876463550445e-05, + "loss": 0.2812, + "step": 6922 + }, + { + "epoch": 2.222828704446942, + "grad_norm": 1.1284433603286743, + "learning_rate": 5.237617963320608e-05, + "loss": 0.3043, + "step": 6923 + }, + { + "epoch": 2.223149783271793, + "grad_norm": 0.7409510016441345, + "learning_rate": 5.234548860567985e-05, + "loss": 0.3132, + "step": 6924 + }, + { + "epoch": 2.2234708620966446, + "grad_norm": 1.1001427173614502, + "learning_rate": 5.2314803384711195e-05, + "loss": 0.3493, + "step": 6925 + }, + { + "epoch": 2.223791940921496, + "grad_norm": 0.7392867803573608, + "learning_rate": 5.2284123974039154e-05, + "loss": 0.3222, + "step": 6926 + }, + { + "epoch": 2.2241130197463477, + "grad_norm": 0.556638777256012, + "learning_rate": 5.225345037740186e-05, + "loss": 0.2963, + "step": 6927 + }, + { + "epoch": 2.2244340985711992, + "grad_norm": 0.4246133863925934, + "learning_rate": 5.222278259853681e-05, + "loss": 0.2487, + "step": 6928 + }, + { + "epoch": 2.2247551773960508, + "grad_norm": 1.3218295574188232, + "learning_rate": 5.2192120641180786e-05, + "loss": 0.2931, + "step": 6929 + }, + { + "epoch": 2.2250762562209023, + "grad_norm": 0.4780152440071106, + "learning_rate": 5.216146450906984e-05, + "loss": 0.4964, + "step": 6930 + }, + { + "epoch": 2.225397335045754, + "grad_norm": 0.41451188921928406, + "learning_rate": 5.213081420593933e-05, + "loss": 0.4104, + "step": 6931 + }, + { + "epoch": 2.2257184138706054, + "grad_norm": 0.5102071762084961, + "learning_rate": 5.210016973552391e-05, + "loss": 0.2379, + "step": 6932 + }, + { + "epoch": 2.2260394926954565, + "grad_norm": 0.32184290885925293, + "learning_rate": 5.20695311015575e-05, + "loss": 0.0819, + "step": 6933 + }, + { + "epoch": 2.226360571520308, + "grad_norm": 0.5118618011474609, + "learning_rate": 5.2038898307773354e-05, + "loss": 0.1675, + "step": 6934 + }, + { + "epoch": 2.2266816503451596, + "grad_norm": 0.561114490032196, + "learning_rate": 5.200827135790396e-05, + "loss": 0.2358, + "step": 6935 + }, + { + "epoch": 2.227002729170011, + "grad_norm": 0.8075849413871765, + "learning_rate": 5.197765025568109e-05, + "loss": 0.4558, + "step": 6936 + }, + { + "epoch": 2.2273238079948627, + "grad_norm": 0.7261122465133667, + "learning_rate": 5.194703500483593e-05, + "loss": 0.3476, + "step": 6937 + }, + { + "epoch": 2.2276448868197143, + "grad_norm": 0.8632923364639282, + "learning_rate": 5.1916425609098775e-05, + "loss": 0.4246, + "step": 6938 + }, + { + "epoch": 2.227965965644566, + "grad_norm": 0.8326036334037781, + "learning_rate": 5.188582207219931e-05, + "loss": 0.3473, + "step": 6939 + }, + { + "epoch": 2.2282870444694174, + "grad_norm": 0.9028269648551941, + "learning_rate": 5.1855224397866476e-05, + "loss": 0.3969, + "step": 6940 + }, + { + "epoch": 2.228608123294269, + "grad_norm": 0.8235176205635071, + "learning_rate": 5.182463258982846e-05, + "loss": 0.2969, + "step": 6941 + }, + { + "epoch": 2.22892920211912, + "grad_norm": 0.8503168821334839, + "learning_rate": 5.179404665181291e-05, + "loss": 0.3176, + "step": 6942 + }, + { + "epoch": 2.2292502809439716, + "grad_norm": 0.8072776794433594, + "learning_rate": 5.1763466587546485e-05, + "loss": 0.3188, + "step": 6943 + }, + { + "epoch": 2.229571359768823, + "grad_norm": 0.9978686571121216, + "learning_rate": 5.1732892400755376e-05, + "loss": 0.3749, + "step": 6944 + }, + { + "epoch": 2.2298924385936747, + "grad_norm": 0.9184989929199219, + "learning_rate": 5.170232409516496e-05, + "loss": 0.4544, + "step": 6945 + }, + { + "epoch": 2.2302135174185262, + "grad_norm": 0.7581484913825989, + "learning_rate": 5.1671761674499765e-05, + "loss": 0.2865, + "step": 6946 + }, + { + "epoch": 2.230534596243378, + "grad_norm": 0.6982542276382446, + "learning_rate": 5.1641205142483894e-05, + "loss": 0.285, + "step": 6947 + }, + { + "epoch": 2.2308556750682293, + "grad_norm": 1.0385242700576782, + "learning_rate": 5.16106545028404e-05, + "loss": 0.3419, + "step": 6948 + }, + { + "epoch": 2.231176753893081, + "grad_norm": 0.9583960175514221, + "learning_rate": 5.158010975929193e-05, + "loss": 0.3959, + "step": 6949 + }, + { + "epoch": 2.2314978327179325, + "grad_norm": 1.119672417640686, + "learning_rate": 5.1549570915560206e-05, + "loss": 0.4012, + "step": 6950 + }, + { + "epoch": 2.2318189115427836, + "grad_norm": 0.9875466823577881, + "learning_rate": 5.15190379753663e-05, + "loss": 0.4324, + "step": 6951 + }, + { + "epoch": 2.232139990367635, + "grad_norm": 0.9298665523529053, + "learning_rate": 5.148851094243057e-05, + "loss": 0.4168, + "step": 6952 + }, + { + "epoch": 2.2324610691924867, + "grad_norm": 0.7571067810058594, + "learning_rate": 5.145798982047261e-05, + "loss": 0.3063, + "step": 6953 + }, + { + "epoch": 2.232782148017338, + "grad_norm": 0.9960950613021851, + "learning_rate": 5.1427474613211356e-05, + "loss": 0.4125, + "step": 6954 + }, + { + "epoch": 2.2331032268421898, + "grad_norm": 0.9957188367843628, + "learning_rate": 5.1396965324364986e-05, + "loss": 0.3128, + "step": 6955 + }, + { + "epoch": 2.2334243056670413, + "grad_norm": 0.9226759672164917, + "learning_rate": 5.1366461957650954e-05, + "loss": 0.3247, + "step": 6956 + }, + { + "epoch": 2.233745384491893, + "grad_norm": 0.7779395580291748, + "learning_rate": 5.133596451678603e-05, + "loss": 0.297, + "step": 6957 + }, + { + "epoch": 2.2340664633167444, + "grad_norm": 1.165313482284546, + "learning_rate": 5.13054730054862e-05, + "loss": 0.4306, + "step": 6958 + }, + { + "epoch": 2.234387542141596, + "grad_norm": 0.7935650944709778, + "learning_rate": 5.127498742746675e-05, + "loss": 0.3009, + "step": 6959 + }, + { + "epoch": 2.234708620966447, + "grad_norm": 1.200869083404541, + "learning_rate": 5.1244507786442356e-05, + "loss": 0.4304, + "step": 6960 + }, + { + "epoch": 2.2350296997912986, + "grad_norm": 1.077374815940857, + "learning_rate": 5.121403408612672e-05, + "loss": 0.498, + "step": 6961 + }, + { + "epoch": 2.23535077861615, + "grad_norm": 0.8770790100097656, + "learning_rate": 5.1183566330233124e-05, + "loss": 0.3659, + "step": 6962 + }, + { + "epoch": 2.2356718574410017, + "grad_norm": 0.8858685493469238, + "learning_rate": 5.115310452247386e-05, + "loss": 0.3483, + "step": 6963 + }, + { + "epoch": 2.2359929362658533, + "grad_norm": 0.9457120895385742, + "learning_rate": 5.112264866656059e-05, + "loss": 0.4015, + "step": 6964 + }, + { + "epoch": 2.236314015090705, + "grad_norm": 0.7275859713554382, + "learning_rate": 5.1092198766204415e-05, + "loss": 0.3293, + "step": 6965 + }, + { + "epoch": 2.2366350939155564, + "grad_norm": 0.8628625869750977, + "learning_rate": 5.1061754825115374e-05, + "loss": 0.3513, + "step": 6966 + }, + { + "epoch": 2.236956172740408, + "grad_norm": 1.0785568952560425, + "learning_rate": 5.103131684700314e-05, + "loss": 0.3859, + "step": 6967 + }, + { + "epoch": 2.2372772515652595, + "grad_norm": 1.0810168981552124, + "learning_rate": 5.100088483557634e-05, + "loss": 0.3165, + "step": 6968 + }, + { + "epoch": 2.2375983303901106, + "grad_norm": 0.8149095177650452, + "learning_rate": 5.097045879454313e-05, + "loss": 0.3059, + "step": 6969 + }, + { + "epoch": 2.237919409214962, + "grad_norm": 0.921907901763916, + "learning_rate": 5.0940038727610796e-05, + "loss": 0.279, + "step": 6970 + }, + { + "epoch": 2.2382404880398137, + "grad_norm": 0.6431214213371277, + "learning_rate": 5.090962463848592e-05, + "loss": 0.3469, + "step": 6971 + }, + { + "epoch": 2.2385615668646652, + "grad_norm": 0.8860642313957214, + "learning_rate": 5.087921653087437e-05, + "loss": 0.4101, + "step": 6972 + }, + { + "epoch": 2.238882645689517, + "grad_norm": 0.930432140827179, + "learning_rate": 5.0848814408481305e-05, + "loss": 0.3629, + "step": 6973 + }, + { + "epoch": 2.2392037245143683, + "grad_norm": 1.0363874435424805, + "learning_rate": 5.0818418275011104e-05, + "loss": 0.2949, + "step": 6974 + }, + { + "epoch": 2.23952480333922, + "grad_norm": 1.0017728805541992, + "learning_rate": 5.0788028134167456e-05, + "loss": 0.37, + "step": 6975 + }, + { + "epoch": 2.2398458821640714, + "grad_norm": 0.7278321385383606, + "learning_rate": 5.07576439896533e-05, + "loss": 0.2832, + "step": 6976 + }, + { + "epoch": 2.240166960988923, + "grad_norm": 1.1001341342926025, + "learning_rate": 5.072726584517086e-05, + "loss": 0.3137, + "step": 6977 + }, + { + "epoch": 2.240488039813774, + "grad_norm": 0.9127258062362671, + "learning_rate": 5.069689370442161e-05, + "loss": 0.2938, + "step": 6978 + }, + { + "epoch": 2.2408091186386256, + "grad_norm": 0.6705423593521118, + "learning_rate": 5.066652757110628e-05, + "loss": 0.2919, + "step": 6979 + }, + { + "epoch": 2.241130197463477, + "grad_norm": 0.6100792288780212, + "learning_rate": 5.0636167448924987e-05, + "loss": 0.8073, + "step": 6980 + }, + { + "epoch": 2.2414512762883287, + "grad_norm": 0.6084210872650146, + "learning_rate": 5.0605813341576924e-05, + "loss": 0.8022, + "step": 6981 + }, + { + "epoch": 2.2417723551131803, + "grad_norm": 0.42793241143226624, + "learning_rate": 5.057546525276068e-05, + "loss": 0.2486, + "step": 6982 + }, + { + "epoch": 2.242093433938032, + "grad_norm": 0.3993290662765503, + "learning_rate": 5.054512318617406e-05, + "loss": 0.1494, + "step": 6983 + }, + { + "epoch": 2.2424145127628834, + "grad_norm": 0.5702289938926697, + "learning_rate": 5.051478714551414e-05, + "loss": 0.3078, + "step": 6984 + }, + { + "epoch": 2.242735591587735, + "grad_norm": 0.3860454559326172, + "learning_rate": 5.048445713447738e-05, + "loss": 0.0808, + "step": 6985 + }, + { + "epoch": 2.2430566704125865, + "grad_norm": 0.7141852974891663, + "learning_rate": 5.045413315675924e-05, + "loss": 0.3344, + "step": 6986 + }, + { + "epoch": 2.2433777492374376, + "grad_norm": 0.7017838954925537, + "learning_rate": 5.0423815216054724e-05, + "loss": 0.348, + "step": 6987 + }, + { + "epoch": 2.243698828062289, + "grad_norm": 1.1939187049865723, + "learning_rate": 5.039350331605794e-05, + "loss": 0.4238, + "step": 6988 + }, + { + "epoch": 2.2440199068871407, + "grad_norm": 0.8673609495162964, + "learning_rate": 5.036319746046232e-05, + "loss": 0.4174, + "step": 6989 + }, + { + "epoch": 2.2443409857119923, + "grad_norm": 0.7690039873123169, + "learning_rate": 5.033289765296054e-05, + "loss": 0.3594, + "step": 6990 + }, + { + "epoch": 2.244662064536844, + "grad_norm": 0.624330997467041, + "learning_rate": 5.0302603897244474e-05, + "loss": 0.2757, + "step": 6991 + }, + { + "epoch": 2.2449831433616954, + "grad_norm": 0.8041682839393616, + "learning_rate": 5.0272316197005396e-05, + "loss": 0.347, + "step": 6992 + }, + { + "epoch": 2.245304222186547, + "grad_norm": 0.6567179560661316, + "learning_rate": 5.024203455593375e-05, + "loss": 0.2858, + "step": 6993 + }, + { + "epoch": 2.2456253010113985, + "grad_norm": 0.9437269568443298, + "learning_rate": 5.021175897771927e-05, + "loss": 0.3507, + "step": 6994 + }, + { + "epoch": 2.24594637983625, + "grad_norm": 1.0030709505081177, + "learning_rate": 5.018148946605092e-05, + "loss": 0.4534, + "step": 6995 + }, + { + "epoch": 2.246267458661101, + "grad_norm": 0.8728824853897095, + "learning_rate": 5.015122602461698e-05, + "loss": 0.3801, + "step": 6996 + }, + { + "epoch": 2.2465885374859527, + "grad_norm": 0.9060454964637756, + "learning_rate": 5.012096865710494e-05, + "loss": 0.3869, + "step": 6997 + }, + { + "epoch": 2.246909616310804, + "grad_norm": 0.8747976422309875, + "learning_rate": 5.0090717367201554e-05, + "loss": 0.3823, + "step": 6998 + }, + { + "epoch": 2.2472306951356558, + "grad_norm": 0.9838657975196838, + "learning_rate": 5.006047215859289e-05, + "loss": 0.2901, + "step": 6999 + }, + { + "epoch": 2.2475517739605073, + "grad_norm": 0.7466594576835632, + "learning_rate": 5.003023303496419e-05, + "loss": 0.3505, + "step": 7000 + }, + { + "epoch": 2.247872852785359, + "grad_norm": 0.8229215145111084, + "learning_rate": 5.000000000000002e-05, + "loss": 0.3138, + "step": 7001 + }, + { + "epoch": 2.2481939316102104, + "grad_norm": 0.7786136865615845, + "learning_rate": 4.996977305738415e-05, + "loss": 0.3044, + "step": 7002 + }, + { + "epoch": 2.248515010435062, + "grad_norm": 0.6571848392486572, + "learning_rate": 4.9939552210799755e-05, + "loss": 0.2649, + "step": 7003 + }, + { + "epoch": 2.248836089259913, + "grad_norm": 0.6646254062652588, + "learning_rate": 4.990933746392899e-05, + "loss": 0.3347, + "step": 7004 + }, + { + "epoch": 2.2491571680847646, + "grad_norm": 0.8542558550834656, + "learning_rate": 4.98791288204536e-05, + "loss": 0.3518, + "step": 7005 + }, + { + "epoch": 2.249478246909616, + "grad_norm": 0.726729154586792, + "learning_rate": 4.9848926284054255e-05, + "loss": 0.2939, + "step": 7006 + }, + { + "epoch": 2.2497993257344677, + "grad_norm": 0.9049625396728516, + "learning_rate": 4.981872985841115e-05, + "loss": 0.3657, + "step": 7007 + }, + { + "epoch": 2.2501204045593193, + "grad_norm": 0.9269988536834717, + "learning_rate": 4.978853954720364e-05, + "loss": 0.2894, + "step": 7008 + }, + { + "epoch": 2.250441483384171, + "grad_norm": 1.2654590606689453, + "learning_rate": 4.97583553541102e-05, + "loss": 0.4061, + "step": 7009 + }, + { + "epoch": 2.2507625622090224, + "grad_norm": 0.8163503408432007, + "learning_rate": 4.97281772828088e-05, + "loss": 0.336, + "step": 7010 + }, + { + "epoch": 2.251083641033874, + "grad_norm": 1.1745022535324097, + "learning_rate": 4.969800533697649e-05, + "loss": 0.3993, + "step": 7011 + }, + { + "epoch": 2.2514047198587255, + "grad_norm": 0.7915329933166504, + "learning_rate": 4.966783952028967e-05, + "loss": 0.3401, + "step": 7012 + }, + { + "epoch": 2.2517257986835766, + "grad_norm": 0.9306576251983643, + "learning_rate": 4.9637679836423924e-05, + "loss": 0.3605, + "step": 7013 + }, + { + "epoch": 2.252046877508428, + "grad_norm": 0.5942294597625732, + "learning_rate": 4.960752628905412e-05, + "loss": 0.2566, + "step": 7014 + }, + { + "epoch": 2.2523679563332797, + "grad_norm": 0.743722140789032, + "learning_rate": 4.957737888185439e-05, + "loss": 0.2979, + "step": 7015 + }, + { + "epoch": 2.2526890351581312, + "grad_norm": 0.7660803198814392, + "learning_rate": 4.9547237618498085e-05, + "loss": 0.2865, + "step": 7016 + }, + { + "epoch": 2.253010113982983, + "grad_norm": 0.5819270014762878, + "learning_rate": 4.9517102502657845e-05, + "loss": 0.2663, + "step": 7017 + }, + { + "epoch": 2.2533311928078343, + "grad_norm": 0.7460042834281921, + "learning_rate": 4.9486973538005535e-05, + "loss": 0.2898, + "step": 7018 + }, + { + "epoch": 2.253652271632686, + "grad_norm": 0.8672288060188293, + "learning_rate": 4.945685072821227e-05, + "loss": 0.3627, + "step": 7019 + }, + { + "epoch": 2.2539733504575374, + "grad_norm": 1.0746335983276367, + "learning_rate": 4.9426734076948436e-05, + "loss": 0.3425, + "step": 7020 + }, + { + "epoch": 2.254294429282389, + "grad_norm": 0.6159694790840149, + "learning_rate": 4.939662358788364e-05, + "loss": 0.2663, + "step": 7021 + }, + { + "epoch": 2.25461550810724, + "grad_norm": 0.8164811730384827, + "learning_rate": 4.9366519264686725e-05, + "loss": 0.251, + "step": 7022 + }, + { + "epoch": 2.2549365869320916, + "grad_norm": 0.7953729033470154, + "learning_rate": 4.933642111102594e-05, + "loss": 0.3071, + "step": 7023 + }, + { + "epoch": 2.255257665756943, + "grad_norm": 0.7565558552742004, + "learning_rate": 4.9306329130568474e-05, + "loss": 0.3028, + "step": 7024 + }, + { + "epoch": 2.2555787445817947, + "grad_norm": 1.2023780345916748, + "learning_rate": 4.927624332698109e-05, + "loss": 0.4058, + "step": 7025 + }, + { + "epoch": 2.2558998234066463, + "grad_norm": 0.4921356439590454, + "learning_rate": 4.924616370392961e-05, + "loss": 0.2675, + "step": 7026 + }, + { + "epoch": 2.256220902231498, + "grad_norm": 0.8888304233551025, + "learning_rate": 4.921609026507907e-05, + "loss": 0.3284, + "step": 7027 + }, + { + "epoch": 2.2565419810563494, + "grad_norm": 0.6142435669898987, + "learning_rate": 4.918602301409395e-05, + "loss": 0.2956, + "step": 7028 + }, + { + "epoch": 2.256863059881201, + "grad_norm": 0.3907153308391571, + "learning_rate": 4.915596195463773e-05, + "loss": 0.2773, + "step": 7029 + }, + { + "epoch": 2.2571841387060525, + "grad_norm": 0.5676328539848328, + "learning_rate": 4.912590709037335e-05, + "loss": 0.6601, + "step": 7030 + }, + { + "epoch": 2.2575052175309036, + "grad_norm": 0.5613508224487305, + "learning_rate": 4.909585842496287e-05, + "loss": 0.4196, + "step": 7031 + }, + { + "epoch": 2.257826296355755, + "grad_norm": 0.6825602054595947, + "learning_rate": 4.906581596206764e-05, + "loss": 0.4297, + "step": 7032 + }, + { + "epoch": 2.2581473751806067, + "grad_norm": 0.6425372362136841, + "learning_rate": 4.9035779705348226e-05, + "loss": 0.1434, + "step": 7033 + }, + { + "epoch": 2.2584684540054583, + "grad_norm": 0.3993998169898987, + "learning_rate": 4.900574965846447e-05, + "loss": 0.1526, + "step": 7034 + }, + { + "epoch": 2.25878953283031, + "grad_norm": 0.9571073651313782, + "learning_rate": 4.8975725825075435e-05, + "loss": 0.4906, + "step": 7035 + }, + { + "epoch": 2.2591106116551614, + "grad_norm": 0.9649335741996765, + "learning_rate": 4.894570820883944e-05, + "loss": 0.4216, + "step": 7036 + }, + { + "epoch": 2.259431690480013, + "grad_norm": 0.8993098735809326, + "learning_rate": 4.8915696813414026e-05, + "loss": 0.4621, + "step": 7037 + }, + { + "epoch": 2.2597527693048645, + "grad_norm": 0.8691626191139221, + "learning_rate": 4.888569164245601e-05, + "loss": 0.3543, + "step": 7038 + }, + { + "epoch": 2.260073848129716, + "grad_norm": 1.0925235748291016, + "learning_rate": 4.885569269962142e-05, + "loss": 0.3965, + "step": 7039 + }, + { + "epoch": 2.260394926954567, + "grad_norm": 0.9121451377868652, + "learning_rate": 4.8825699988565485e-05, + "loss": 0.3612, + "step": 7040 + }, + { + "epoch": 2.2607160057794187, + "grad_norm": 0.8157550096511841, + "learning_rate": 4.8795713512942865e-05, + "loss": 0.3485, + "step": 7041 + }, + { + "epoch": 2.26103708460427, + "grad_norm": 0.691774845123291, + "learning_rate": 4.8765733276407156e-05, + "loss": 0.285, + "step": 7042 + }, + { + "epoch": 2.2613581634291218, + "grad_norm": 0.9489686489105225, + "learning_rate": 4.8735759282611516e-05, + "loss": 0.3201, + "step": 7043 + }, + { + "epoch": 2.2616792422539733, + "grad_norm": 0.9636274576187134, + "learning_rate": 4.870579153520807e-05, + "loss": 0.3511, + "step": 7044 + }, + { + "epoch": 2.262000321078825, + "grad_norm": 0.8838746547698975, + "learning_rate": 4.867583003784829e-05, + "loss": 0.3283, + "step": 7045 + }, + { + "epoch": 2.2623213999036764, + "grad_norm": 0.8975582718849182, + "learning_rate": 4.864587479418302e-05, + "loss": 0.3509, + "step": 7046 + }, + { + "epoch": 2.262642478728528, + "grad_norm": 0.8758841156959534, + "learning_rate": 4.861592580786205e-05, + "loss": 0.3312, + "step": 7047 + }, + { + "epoch": 2.2629635575533795, + "grad_norm": 0.8865678310394287, + "learning_rate": 4.858598308253473e-05, + "loss": 0.4163, + "step": 7048 + }, + { + "epoch": 2.2632846363782306, + "grad_norm": 0.7309367656707764, + "learning_rate": 4.8556046621849346e-05, + "loss": 0.3007, + "step": 7049 + }, + { + "epoch": 2.263605715203082, + "grad_norm": 1.0383028984069824, + "learning_rate": 4.852611642945368e-05, + "loss": 0.4668, + "step": 7050 + }, + { + "epoch": 2.2639267940279337, + "grad_norm": 0.8784850239753723, + "learning_rate": 4.8496192508994576e-05, + "loss": 0.2978, + "step": 7051 + }, + { + "epoch": 2.2642478728527853, + "grad_norm": 0.8731421232223511, + "learning_rate": 4.84662748641182e-05, + "loss": 0.3559, + "step": 7052 + }, + { + "epoch": 2.264568951677637, + "grad_norm": 0.8055264353752136, + "learning_rate": 4.8436363498469906e-05, + "loss": 0.333, + "step": 7053 + }, + { + "epoch": 2.2648900305024884, + "grad_norm": 0.7497748732566833, + "learning_rate": 4.840645841569431e-05, + "loss": 0.2937, + "step": 7054 + }, + { + "epoch": 2.26521110932734, + "grad_norm": 0.9812029600143433, + "learning_rate": 4.837655961943526e-05, + "loss": 0.4044, + "step": 7055 + }, + { + "epoch": 2.2655321881521915, + "grad_norm": 0.9688860177993774, + "learning_rate": 4.834666711333582e-05, + "loss": 0.3708, + "step": 7056 + }, + { + "epoch": 2.265853266977043, + "grad_norm": 0.8438839912414551, + "learning_rate": 4.8316780901038314e-05, + "loss": 0.2835, + "step": 7057 + }, + { + "epoch": 2.266174345801894, + "grad_norm": 1.0307844877243042, + "learning_rate": 4.828690098618429e-05, + "loss": 0.3232, + "step": 7058 + }, + { + "epoch": 2.2664954246267457, + "grad_norm": 0.6895515322685242, + "learning_rate": 4.825702737241452e-05, + "loss": 0.2816, + "step": 7059 + }, + { + "epoch": 2.2668165034515972, + "grad_norm": 0.8338602185249329, + "learning_rate": 4.822716006336897e-05, + "loss": 0.3282, + "step": 7060 + }, + { + "epoch": 2.267137582276449, + "grad_norm": 0.7110063433647156, + "learning_rate": 4.8197299062686995e-05, + "loss": 0.2868, + "step": 7061 + }, + { + "epoch": 2.2674586611013003, + "grad_norm": 0.8078935146331787, + "learning_rate": 4.816744437400697e-05, + "loss": 0.3183, + "step": 7062 + }, + { + "epoch": 2.267779739926152, + "grad_norm": 0.830915093421936, + "learning_rate": 4.8137596000966614e-05, + "loss": 0.3452, + "step": 7063 + }, + { + "epoch": 2.2681008187510034, + "grad_norm": 1.4586313962936401, + "learning_rate": 4.810775394720286e-05, + "loss": 0.4637, + "step": 7064 + }, + { + "epoch": 2.268421897575855, + "grad_norm": 0.9930241703987122, + "learning_rate": 4.807791821635186e-05, + "loss": 0.3232, + "step": 7065 + }, + { + "epoch": 2.2687429764007065, + "grad_norm": 0.7299748063087463, + "learning_rate": 4.8048088812049096e-05, + "loss": 0.313, + "step": 7066 + }, + { + "epoch": 2.2690640552255577, + "grad_norm": 0.6193024516105652, + "learning_rate": 4.8018265737929044e-05, + "loss": 0.285, + "step": 7067 + }, + { + "epoch": 2.269385134050409, + "grad_norm": 0.773335874080658, + "learning_rate": 4.798844899762568e-05, + "loss": 0.3412, + "step": 7068 + }, + { + "epoch": 2.2697062128752608, + "grad_norm": 0.9206930994987488, + "learning_rate": 4.7958638594772064e-05, + "loss": 0.3609, + "step": 7069 + }, + { + "epoch": 2.2700272917001123, + "grad_norm": 1.3238471746444702, + "learning_rate": 4.792883453300042e-05, + "loss": 0.3777, + "step": 7070 + }, + { + "epoch": 2.270348370524964, + "grad_norm": 0.8293914794921875, + "learning_rate": 4.78990368159424e-05, + "loss": 0.3592, + "step": 7071 + }, + { + "epoch": 2.2706694493498154, + "grad_norm": 0.9413627982139587, + "learning_rate": 4.786924544722864e-05, + "loss": 0.377, + "step": 7072 + }, + { + "epoch": 2.270990528174667, + "grad_norm": 0.9093666672706604, + "learning_rate": 4.783946043048923e-05, + "loss": 0.3278, + "step": 7073 + }, + { + "epoch": 2.2713116069995185, + "grad_norm": 0.8311013579368591, + "learning_rate": 4.780968176935333e-05, + "loss": 0.2915, + "step": 7074 + }, + { + "epoch": 2.27163268582437, + "grad_norm": 0.9607753157615662, + "learning_rate": 4.7779909467449414e-05, + "loss": 0.2904, + "step": 7075 + }, + { + "epoch": 2.271953764649221, + "grad_norm": 0.5113980174064636, + "learning_rate": 4.7750143528405126e-05, + "loss": 0.2676, + "step": 7076 + }, + { + "epoch": 2.2722748434740727, + "grad_norm": 0.7223218679428101, + "learning_rate": 4.7720383955847345e-05, + "loss": 0.291, + "step": 7077 + }, + { + "epoch": 2.2725959222989243, + "grad_norm": 0.48104968667030334, + "learning_rate": 4.769063075340222e-05, + "loss": 0.2631, + "step": 7078 + }, + { + "epoch": 2.272917001123776, + "grad_norm": 1.0848753452301025, + "learning_rate": 4.766088392469506e-05, + "loss": 0.3135, + "step": 7079 + }, + { + "epoch": 2.2732380799486274, + "grad_norm": 0.40874695777893066, + "learning_rate": 4.763114347335043e-05, + "loss": 0.4864, + "step": 7080 + }, + { + "epoch": 2.273559158773479, + "grad_norm": 0.5546656250953674, + "learning_rate": 4.7601409402992106e-05, + "loss": 0.3189, + "step": 7081 + }, + { + "epoch": 2.2738802375983305, + "grad_norm": 0.678066611289978, + "learning_rate": 4.757168171724311e-05, + "loss": 0.3145, + "step": 7082 + }, + { + "epoch": 2.274201316423182, + "grad_norm": 0.4192000925540924, + "learning_rate": 4.7541960419725626e-05, + "loss": 0.1832, + "step": 7083 + }, + { + "epoch": 2.2745223952480336, + "grad_norm": 0.24414893984794617, + "learning_rate": 4.7512245514061225e-05, + "loss": 0.0799, + "step": 7084 + }, + { + "epoch": 2.2748434740728847, + "grad_norm": 0.634087085723877, + "learning_rate": 4.748253700387042e-05, + "loss": 0.139, + "step": 7085 + }, + { + "epoch": 2.2751645528977362, + "grad_norm": 0.9760504364967346, + "learning_rate": 4.745283489277325e-05, + "loss": 0.522, + "step": 7086 + }, + { + "epoch": 2.2754856317225878, + "grad_norm": 0.9048032164573669, + "learning_rate": 4.742313918438872e-05, + "loss": 0.4195, + "step": 7087 + }, + { + "epoch": 2.2758067105474393, + "grad_norm": 0.9374763369560242, + "learning_rate": 4.739344988233516e-05, + "loss": 0.3933, + "step": 7088 + }, + { + "epoch": 2.276127789372291, + "grad_norm": 1.0608501434326172, + "learning_rate": 4.736376699023023e-05, + "loss": 0.4224, + "step": 7089 + }, + { + "epoch": 2.2764488681971424, + "grad_norm": 0.7070604562759399, + "learning_rate": 4.7334090511690554e-05, + "loss": 0.3017, + "step": 7090 + }, + { + "epoch": 2.276769947021994, + "grad_norm": 0.7293145060539246, + "learning_rate": 4.7304420450332244e-05, + "loss": 0.297, + "step": 7091 + }, + { + "epoch": 2.2770910258468455, + "grad_norm": 0.7650532722473145, + "learning_rate": 4.7274756809770446e-05, + "loss": 0.3605, + "step": 7092 + }, + { + "epoch": 2.277412104671697, + "grad_norm": 0.9350664019584656, + "learning_rate": 4.724509959361961e-05, + "loss": 0.3853, + "step": 7093 + }, + { + "epoch": 2.277733183496548, + "grad_norm": 0.8287744522094727, + "learning_rate": 4.721544880549337e-05, + "loss": 0.3868, + "step": 7094 + }, + { + "epoch": 2.2780542623213997, + "grad_norm": 0.7390232086181641, + "learning_rate": 4.7185804449004565e-05, + "loss": 0.3578, + "step": 7095 + }, + { + "epoch": 2.2783753411462513, + "grad_norm": 0.8684871196746826, + "learning_rate": 4.71561665277653e-05, + "loss": 0.3543, + "step": 7096 + }, + { + "epoch": 2.278696419971103, + "grad_norm": 0.6932856440544128, + "learning_rate": 4.712653504538683e-05, + "loss": 0.296, + "step": 7097 + }, + { + "epoch": 2.2790174987959544, + "grad_norm": 1.051287055015564, + "learning_rate": 4.70969100054797e-05, + "loss": 0.5495, + "step": 7098 + }, + { + "epoch": 2.279338577620806, + "grad_norm": 0.8018210530281067, + "learning_rate": 4.706729141165361e-05, + "loss": 0.2714, + "step": 7099 + }, + { + "epoch": 2.2796596564456575, + "grad_norm": 1.3825582265853882, + "learning_rate": 4.7037679267517495e-05, + "loss": 0.3997, + "step": 7100 + }, + { + "epoch": 2.279980735270509, + "grad_norm": 0.8683844804763794, + "learning_rate": 4.700807357667952e-05, + "loss": 0.3403, + "step": 7101 + }, + { + "epoch": 2.2803018140953606, + "grad_norm": 0.9737560749053955, + "learning_rate": 4.697847434274704e-05, + "loss": 0.3464, + "step": 7102 + }, + { + "epoch": 2.2806228929202117, + "grad_norm": 0.7259578704833984, + "learning_rate": 4.694888156932658e-05, + "loss": 0.3272, + "step": 7103 + }, + { + "epoch": 2.2809439717450632, + "grad_norm": 0.7659843564033508, + "learning_rate": 4.6919295260024054e-05, + "loss": 0.32, + "step": 7104 + }, + { + "epoch": 2.281265050569915, + "grad_norm": 1.1575708389282227, + "learning_rate": 4.688971541844436e-05, + "loss": 0.4335, + "step": 7105 + }, + { + "epoch": 2.2815861293947663, + "grad_norm": 0.7869051098823547, + "learning_rate": 4.68601420481917e-05, + "loss": 0.3432, + "step": 7106 + }, + { + "epoch": 2.281907208219618, + "grad_norm": 1.0924334526062012, + "learning_rate": 4.6830575152869616e-05, + "loss": 0.3652, + "step": 7107 + }, + { + "epoch": 2.2822282870444695, + "grad_norm": 0.8119152188301086, + "learning_rate": 4.6801014736080596e-05, + "loss": 0.2975, + "step": 7108 + }, + { + "epoch": 2.282549365869321, + "grad_norm": 0.9879711270332336, + "learning_rate": 4.6771460801426635e-05, + "loss": 0.3559, + "step": 7109 + }, + { + "epoch": 2.2828704446941726, + "grad_norm": 0.7146173119544983, + "learning_rate": 4.674191335250865e-05, + "loss": 0.2943, + "step": 7110 + }, + { + "epoch": 2.283191523519024, + "grad_norm": 0.7338638305664062, + "learning_rate": 4.6712372392927e-05, + "loss": 0.2994, + "step": 7111 + }, + { + "epoch": 2.283512602343875, + "grad_norm": 1.170583963394165, + "learning_rate": 4.668283792628114e-05, + "loss": 0.3887, + "step": 7112 + }, + { + "epoch": 2.2838336811687268, + "grad_norm": 0.9542192816734314, + "learning_rate": 4.665330995616974e-05, + "loss": 0.3925, + "step": 7113 + }, + { + "epoch": 2.2841547599935783, + "grad_norm": 0.7801535129547119, + "learning_rate": 4.6623788486190725e-05, + "loss": 0.2912, + "step": 7114 + }, + { + "epoch": 2.28447583881843, + "grad_norm": 1.0590436458587646, + "learning_rate": 4.6594273519941154e-05, + "loss": 0.3585, + "step": 7115 + }, + { + "epoch": 2.2847969176432814, + "grad_norm": 0.7104535102844238, + "learning_rate": 4.656476506101737e-05, + "loss": 0.2778, + "step": 7116 + }, + { + "epoch": 2.285117996468133, + "grad_norm": 0.7049524784088135, + "learning_rate": 4.653526311301488e-05, + "loss": 0.3042, + "step": 7117 + }, + { + "epoch": 2.2854390752929845, + "grad_norm": 1.7429343461990356, + "learning_rate": 4.65057676795284e-05, + "loss": 0.3351, + "step": 7118 + }, + { + "epoch": 2.285760154117836, + "grad_norm": 0.6940548419952393, + "learning_rate": 4.647627876415186e-05, + "loss": 0.2715, + "step": 7119 + }, + { + "epoch": 2.2860812329426876, + "grad_norm": 0.7398602366447449, + "learning_rate": 4.6446796370478394e-05, + "loss": 0.3182, + "step": 7120 + }, + { + "epoch": 2.2864023117675387, + "grad_norm": 1.2548072338104248, + "learning_rate": 4.6417320502100316e-05, + "loss": 0.3538, + "step": 7121 + }, + { + "epoch": 2.2867233905923903, + "grad_norm": 0.6731551289558411, + "learning_rate": 4.6387851162609275e-05, + "loss": 0.3286, + "step": 7122 + }, + { + "epoch": 2.287044469417242, + "grad_norm": 0.6933743357658386, + "learning_rate": 4.6358388355595904e-05, + "loss": 0.2885, + "step": 7123 + }, + { + "epoch": 2.2873655482420934, + "grad_norm": 1.1013816595077515, + "learning_rate": 4.632893208465021e-05, + "loss": 0.4317, + "step": 7124 + }, + { + "epoch": 2.287686627066945, + "grad_norm": 0.7992168664932251, + "learning_rate": 4.629948235336133e-05, + "loss": 0.317, + "step": 7125 + }, + { + "epoch": 2.2880077058917965, + "grad_norm": 0.8445520401000977, + "learning_rate": 4.6270039165317605e-05, + "loss": 0.3382, + "step": 7126 + }, + { + "epoch": 2.288328784716648, + "grad_norm": 0.7868288159370422, + "learning_rate": 4.62406025241067e-05, + "loss": 0.2714, + "step": 7127 + }, + { + "epoch": 2.2886498635414996, + "grad_norm": 0.7132185697555542, + "learning_rate": 4.621117243331523e-05, + "loss": 0.3171, + "step": 7128 + }, + { + "epoch": 2.288970942366351, + "grad_norm": 0.7570092082023621, + "learning_rate": 4.6181748896529273e-05, + "loss": 0.3292, + "step": 7129 + }, + { + "epoch": 2.2892920211912022, + "grad_norm": 0.5305631160736084, + "learning_rate": 4.615233191733398e-05, + "loss": 0.6934, + "step": 7130 + }, + { + "epoch": 2.289613100016054, + "grad_norm": 0.5653072595596313, + "learning_rate": 4.612292149931369e-05, + "loss": 0.7784, + "step": 7131 + }, + { + "epoch": 2.2899341788409053, + "grad_norm": 0.5794984102249146, + "learning_rate": 4.6093517646052034e-05, + "loss": 0.4634, + "step": 7132 + }, + { + "epoch": 2.290255257665757, + "grad_norm": 0.5841457843780518, + "learning_rate": 4.6064120361131656e-05, + "loss": 0.4593, + "step": 7133 + }, + { + "epoch": 2.2905763364906084, + "grad_norm": 0.6066222786903381, + "learning_rate": 4.603472964813466e-05, + "loss": 0.1791, + "step": 7134 + }, + { + "epoch": 2.29089741531546, + "grad_norm": 0.4143172800540924, + "learning_rate": 4.600534551064215e-05, + "loss": 0.0885, + "step": 7135 + }, + { + "epoch": 2.2912184941403115, + "grad_norm": 0.424333393573761, + "learning_rate": 4.59759679522345e-05, + "loss": 0.1697, + "step": 7136 + }, + { + "epoch": 2.291539572965163, + "grad_norm": 0.6681946516036987, + "learning_rate": 4.5946596976491295e-05, + "loss": 0.2762, + "step": 7137 + }, + { + "epoch": 2.2918606517900146, + "grad_norm": 0.9873928427696228, + "learning_rate": 4.591723258699127e-05, + "loss": 0.3777, + "step": 7138 + }, + { + "epoch": 2.2921817306148657, + "grad_norm": 0.8079444766044617, + "learning_rate": 4.588787478731242e-05, + "loss": 0.4295, + "step": 7139 + }, + { + "epoch": 2.2925028094397173, + "grad_norm": 0.8375623822212219, + "learning_rate": 4.5858523581031884e-05, + "loss": 0.2995, + "step": 7140 + }, + { + "epoch": 2.292823888264569, + "grad_norm": 1.0319302082061768, + "learning_rate": 4.582917897172603e-05, + "loss": 0.4127, + "step": 7141 + }, + { + "epoch": 2.2931449670894204, + "grad_norm": 0.7338537573814392, + "learning_rate": 4.579984096297038e-05, + "loss": 0.3225, + "step": 7142 + }, + { + "epoch": 2.293466045914272, + "grad_norm": 0.9782087206840515, + "learning_rate": 4.577050955833973e-05, + "loss": 0.3765, + "step": 7143 + }, + { + "epoch": 2.2937871247391235, + "grad_norm": 0.8286393284797668, + "learning_rate": 4.574118476140794e-05, + "loss": 0.3714, + "step": 7144 + }, + { + "epoch": 2.294108203563975, + "grad_norm": 0.7754277586936951, + "learning_rate": 4.5711866575748276e-05, + "loss": 0.3201, + "step": 7145 + }, + { + "epoch": 2.2944292823888266, + "grad_norm": 0.8050145506858826, + "learning_rate": 4.568255500493292e-05, + "loss": 0.3388, + "step": 7146 + }, + { + "epoch": 2.294750361213678, + "grad_norm": 0.9979050755500793, + "learning_rate": 4.565325005253356e-05, + "loss": 0.3467, + "step": 7147 + }, + { + "epoch": 2.2950714400385293, + "grad_norm": 0.6032088398933411, + "learning_rate": 4.5623951722120736e-05, + "loss": 0.2315, + "step": 7148 + }, + { + "epoch": 2.295392518863381, + "grad_norm": 0.8849102258682251, + "learning_rate": 4.559466001726451e-05, + "loss": 0.3303, + "step": 7149 + }, + { + "epoch": 2.2957135976882324, + "grad_norm": 0.9479863047599792, + "learning_rate": 4.5565374941533965e-05, + "loss": 0.3925, + "step": 7150 + }, + { + "epoch": 2.296034676513084, + "grad_norm": 0.8778762817382812, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.3579, + "step": 7151 + }, + { + "epoch": 2.2963557553379355, + "grad_norm": 1.0045291185379028, + "learning_rate": 4.5506824691722126e-05, + "loss": 0.3575, + "step": 7152 + }, + { + "epoch": 2.296676834162787, + "grad_norm": 1.0932892560958862, + "learning_rate": 4.5477559524774994e-05, + "loss": 0.4418, + "step": 7153 + }, + { + "epoch": 2.2969979129876386, + "grad_norm": 1.10843026638031, + "learning_rate": 4.5448301001221895e-05, + "loss": 0.5555, + "step": 7154 + }, + { + "epoch": 2.29731899181249, + "grad_norm": 0.7526745200157166, + "learning_rate": 4.541904912462784e-05, + "loss": 0.3106, + "step": 7155 + }, + { + "epoch": 2.2976400706373417, + "grad_norm": 0.8937078714370728, + "learning_rate": 4.5389803898557106e-05, + "loss": 0.3546, + "step": 7156 + }, + { + "epoch": 2.2979611494621928, + "grad_norm": 0.660315752029419, + "learning_rate": 4.5360565326573104e-05, + "loss": 0.2998, + "step": 7157 + }, + { + "epoch": 2.2982822282870443, + "grad_norm": 1.0045713186264038, + "learning_rate": 4.5331333412238475e-05, + "loss": 0.497, + "step": 7158 + }, + { + "epoch": 2.298603307111896, + "grad_norm": 1.044492244720459, + "learning_rate": 4.530210815911504e-05, + "loss": 0.4079, + "step": 7159 + }, + { + "epoch": 2.2989243859367474, + "grad_norm": 0.894120991230011, + "learning_rate": 4.527288957076382e-05, + "loss": 0.364, + "step": 7160 + }, + { + "epoch": 2.299245464761599, + "grad_norm": 0.7585509419441223, + "learning_rate": 4.524367765074499e-05, + "loss": 0.2742, + "step": 7161 + }, + { + "epoch": 2.2995665435864505, + "grad_norm": 1.1006792783737183, + "learning_rate": 4.5214472402617944e-05, + "loss": 0.4268, + "step": 7162 + }, + { + "epoch": 2.299887622411302, + "grad_norm": 0.7505506873130798, + "learning_rate": 4.518527382994127e-05, + "loss": 0.3131, + "step": 7163 + }, + { + "epoch": 2.3002087012361536, + "grad_norm": 0.645842432975769, + "learning_rate": 4.515608193627265e-05, + "loss": 0.2745, + "step": 7164 + }, + { + "epoch": 2.300529780061005, + "grad_norm": 0.8791124820709229, + "learning_rate": 4.512689672516918e-05, + "loss": 0.4013, + "step": 7165 + }, + { + "epoch": 2.3008508588858563, + "grad_norm": 0.6512730121612549, + "learning_rate": 4.5097718200186814e-05, + "loss": 0.2472, + "step": 7166 + }, + { + "epoch": 2.301171937710708, + "grad_norm": 0.7656790018081665, + "learning_rate": 4.506854636488103e-05, + "loss": 0.3403, + "step": 7167 + }, + { + "epoch": 2.3014930165355594, + "grad_norm": 0.6794445514678955, + "learning_rate": 4.50393812228062e-05, + "loss": 0.3161, + "step": 7168 + }, + { + "epoch": 2.301814095360411, + "grad_norm": 0.6210803389549255, + "learning_rate": 4.501022277751602e-05, + "loss": 0.2855, + "step": 7169 + }, + { + "epoch": 2.3021351741852625, + "grad_norm": 0.748984694480896, + "learning_rate": 4.498107103256346e-05, + "loss": 0.2983, + "step": 7170 + }, + { + "epoch": 2.302456253010114, + "grad_norm": 0.7318181395530701, + "learning_rate": 4.495192599150044e-05, + "loss": 0.2941, + "step": 7171 + }, + { + "epoch": 2.3027773318349656, + "grad_norm": 0.8461513519287109, + "learning_rate": 4.4922787657878294e-05, + "loss": 0.3321, + "step": 7172 + }, + { + "epoch": 2.303098410659817, + "grad_norm": 0.4881146252155304, + "learning_rate": 4.48936560352474e-05, + "loss": 0.2739, + "step": 7173 + }, + { + "epoch": 2.3034194894846687, + "grad_norm": 1.058592677116394, + "learning_rate": 4.4864531127157374e-05, + "loss": 0.381, + "step": 7174 + }, + { + "epoch": 2.30374056830952, + "grad_norm": 0.6712564826011658, + "learning_rate": 4.483541293715698e-05, + "loss": 0.3067, + "step": 7175 + }, + { + "epoch": 2.3040616471343713, + "grad_norm": 0.9052884578704834, + "learning_rate": 4.480630146879419e-05, + "loss": 0.2888, + "step": 7176 + }, + { + "epoch": 2.304382725959223, + "grad_norm": 0.6388865113258362, + "learning_rate": 4.4777196725616146e-05, + "loss": 0.3019, + "step": 7177 + }, + { + "epoch": 2.3047038047840744, + "grad_norm": 1.7161591053009033, + "learning_rate": 4.474809871116916e-05, + "loss": 0.3845, + "step": 7178 + }, + { + "epoch": 2.305024883608926, + "grad_norm": 0.959423840045929, + "learning_rate": 4.471900742899876e-05, + "loss": 0.3427, + "step": 7179 + }, + { + "epoch": 2.3053459624337775, + "grad_norm": 0.47565457224845886, + "learning_rate": 4.4689922882649626e-05, + "loss": 0.6239, + "step": 7180 + }, + { + "epoch": 2.305667041258629, + "grad_norm": 0.5845574140548706, + "learning_rate": 4.46608450756656e-05, + "loss": 0.6796, + "step": 7181 + }, + { + "epoch": 2.3059881200834806, + "grad_norm": 0.6896982192993164, + "learning_rate": 4.463177401158975e-05, + "loss": 0.3241, + "step": 7182 + }, + { + "epoch": 2.306309198908332, + "grad_norm": 0.45037001371383667, + "learning_rate": 4.460270969396429e-05, + "loss": 0.1748, + "step": 7183 + }, + { + "epoch": 2.3066302777331833, + "grad_norm": 0.4934408366680145, + "learning_rate": 4.457365212633058e-05, + "loss": 0.2459, + "step": 7184 + }, + { + "epoch": 2.306951356558035, + "grad_norm": 0.6035980582237244, + "learning_rate": 4.45446013122293e-05, + "loss": 0.2141, + "step": 7185 + }, + { + "epoch": 2.3072724353828864, + "grad_norm": 0.8812675476074219, + "learning_rate": 4.451555725520009e-05, + "loss": 0.4984, + "step": 7186 + }, + { + "epoch": 2.307593514207738, + "grad_norm": 0.7593944072723389, + "learning_rate": 4.44865199587819e-05, + "loss": 0.3683, + "step": 7187 + }, + { + "epoch": 2.3079145930325895, + "grad_norm": 0.6698897480964661, + "learning_rate": 4.4457489426512947e-05, + "loss": 0.2978, + "step": 7188 + }, + { + "epoch": 2.308235671857441, + "grad_norm": 0.8218667507171631, + "learning_rate": 4.4428465661930343e-05, + "loss": 0.3963, + "step": 7189 + }, + { + "epoch": 2.3085567506822926, + "grad_norm": 0.8323404788970947, + "learning_rate": 4.43994486685707e-05, + "loss": 0.3453, + "step": 7190 + }, + { + "epoch": 2.308877829507144, + "grad_norm": 0.8024272322654724, + "learning_rate": 4.437043844996952e-05, + "loss": 0.2979, + "step": 7191 + }, + { + "epoch": 2.3091989083319957, + "grad_norm": 1.1842060089111328, + "learning_rate": 4.43414350096617e-05, + "loss": 0.4108, + "step": 7192 + }, + { + "epoch": 2.309519987156847, + "grad_norm": 0.9761362671852112, + "learning_rate": 4.431243835118124e-05, + "loss": 0.4397, + "step": 7193 + }, + { + "epoch": 2.3098410659816984, + "grad_norm": 0.8918820023536682, + "learning_rate": 4.428344847806116e-05, + "loss": 0.3433, + "step": 7194 + }, + { + "epoch": 2.31016214480655, + "grad_norm": 0.9746502041816711, + "learning_rate": 4.425446539383393e-05, + "loss": 0.404, + "step": 7195 + }, + { + "epoch": 2.3104832236314015, + "grad_norm": 0.7089119553565979, + "learning_rate": 4.4225489102030995e-05, + "loss": 0.241, + "step": 7196 + }, + { + "epoch": 2.310804302456253, + "grad_norm": 0.9124135971069336, + "learning_rate": 4.419651960618302e-05, + "loss": 0.3361, + "step": 7197 + }, + { + "epoch": 2.3111253812811046, + "grad_norm": 0.95118647813797, + "learning_rate": 4.4167556909819874e-05, + "loss": 0.3814, + "step": 7198 + }, + { + "epoch": 2.311446460105956, + "grad_norm": 1.066606044769287, + "learning_rate": 4.413860101647055e-05, + "loss": 0.3397, + "step": 7199 + }, + { + "epoch": 2.3117675389308077, + "grad_norm": 0.8261109590530396, + "learning_rate": 4.4109651929663256e-05, + "loss": 0.2883, + "step": 7200 + }, + { + "epoch": 2.312088617755659, + "grad_norm": 0.9831299185752869, + "learning_rate": 4.4080709652925336e-05, + "loss": 0.387, + "step": 7201 + }, + { + "epoch": 2.3124096965805103, + "grad_norm": 1.2849937677383423, + "learning_rate": 4.4051774189783315e-05, + "loss": 0.4081, + "step": 7202 + }, + { + "epoch": 2.312730775405362, + "grad_norm": 1.1613420248031616, + "learning_rate": 4.4022845543762915e-05, + "loss": 0.3917, + "step": 7203 + }, + { + "epoch": 2.3130518542302134, + "grad_norm": 0.9455772638320923, + "learning_rate": 4.399392371838897e-05, + "loss": 0.334, + "step": 7204 + }, + { + "epoch": 2.313372933055065, + "grad_norm": 0.8039372563362122, + "learning_rate": 4.396500871718555e-05, + "loss": 0.338, + "step": 7205 + }, + { + "epoch": 2.3136940118799165, + "grad_norm": 0.7344679832458496, + "learning_rate": 4.393610054367585e-05, + "loss": 0.3172, + "step": 7206 + }, + { + "epoch": 2.314015090704768, + "grad_norm": 0.6306678056716919, + "learning_rate": 4.39071992013822e-05, + "loss": 0.2823, + "step": 7207 + }, + { + "epoch": 2.3143361695296196, + "grad_norm": 0.9511836171150208, + "learning_rate": 4.387830469382624e-05, + "loss": 0.4142, + "step": 7208 + }, + { + "epoch": 2.314657248354471, + "grad_norm": 0.9060647487640381, + "learning_rate": 4.3849417024528564e-05, + "loss": 0.3342, + "step": 7209 + }, + { + "epoch": 2.3149783271793227, + "grad_norm": 0.821698784828186, + "learning_rate": 4.382053619700912e-05, + "loss": 0.3357, + "step": 7210 + }, + { + "epoch": 2.315299406004174, + "grad_norm": 0.7825961709022522, + "learning_rate": 4.379166221478697e-05, + "loss": 0.3329, + "step": 7211 + }, + { + "epoch": 2.3156204848290254, + "grad_norm": 0.9795680046081543, + "learning_rate": 4.3762795081380215e-05, + "loss": 0.3623, + "step": 7212 + }, + { + "epoch": 2.315941563653877, + "grad_norm": 0.7996529936790466, + "learning_rate": 4.3733934800306366e-05, + "loss": 0.2732, + "step": 7213 + }, + { + "epoch": 2.3162626424787285, + "grad_norm": 0.6189697980880737, + "learning_rate": 4.37050813750818e-05, + "loss": 0.2654, + "step": 7214 + }, + { + "epoch": 2.31658372130358, + "grad_norm": 0.7410408854484558, + "learning_rate": 4.367623480922236e-05, + "loss": 0.3059, + "step": 7215 + }, + { + "epoch": 2.3169048001284316, + "grad_norm": 0.8010258078575134, + "learning_rate": 4.364739510624286e-05, + "loss": 0.3012, + "step": 7216 + }, + { + "epoch": 2.317225878953283, + "grad_norm": 0.9137861728668213, + "learning_rate": 4.361856226965733e-05, + "loss": 0.391, + "step": 7217 + }, + { + "epoch": 2.3175469577781347, + "grad_norm": 0.7650619745254517, + "learning_rate": 4.3589736302978954e-05, + "loss": 0.3483, + "step": 7218 + }, + { + "epoch": 2.3178680366029862, + "grad_norm": 0.6533841490745544, + "learning_rate": 4.356091720972011e-05, + "loss": 0.2859, + "step": 7219 + }, + { + "epoch": 2.3181891154278373, + "grad_norm": 0.5599982738494873, + "learning_rate": 4.3532104993392306e-05, + "loss": 0.2805, + "step": 7220 + }, + { + "epoch": 2.318510194252689, + "grad_norm": 1.0042898654937744, + "learning_rate": 4.350329965750621e-05, + "loss": 0.4129, + "step": 7221 + }, + { + "epoch": 2.3188312730775404, + "grad_norm": 0.5694493055343628, + "learning_rate": 4.347450120557169e-05, + "loss": 0.269, + "step": 7222 + }, + { + "epoch": 2.319152351902392, + "grad_norm": 0.872643232345581, + "learning_rate": 4.3445709641097745e-05, + "loss": 0.3538, + "step": 7223 + }, + { + "epoch": 2.3194734307272435, + "grad_norm": 0.4571438133716583, + "learning_rate": 4.341692496759252e-05, + "loss": 0.2497, + "step": 7224 + }, + { + "epoch": 2.319794509552095, + "grad_norm": 0.6154420375823975, + "learning_rate": 4.3388147188563325e-05, + "loss": 0.2781, + "step": 7225 + }, + { + "epoch": 2.3201155883769466, + "grad_norm": 0.6062678098678589, + "learning_rate": 4.335937630751674e-05, + "loss": 0.2748, + "step": 7226 + }, + { + "epoch": 2.320436667201798, + "grad_norm": 0.8544726371765137, + "learning_rate": 4.333061232795826e-05, + "loss": 0.3392, + "step": 7227 + }, + { + "epoch": 2.3207577460266497, + "grad_norm": 0.8965178728103638, + "learning_rate": 4.3301855253392864e-05, + "loss": 0.3342, + "step": 7228 + }, + { + "epoch": 2.321078824851501, + "grad_norm": 0.7547941207885742, + "learning_rate": 4.327310508732437e-05, + "loss": 0.3099, + "step": 7229 + }, + { + "epoch": 2.3213999036763524, + "grad_norm": 0.43631595373153687, + "learning_rate": 4.324436183325593e-05, + "loss": 0.5266, + "step": 7230 + }, + { + "epoch": 2.321720982501204, + "grad_norm": 0.44915318489074707, + "learning_rate": 4.32156254946899e-05, + "loss": 0.586, + "step": 7231 + }, + { + "epoch": 2.3220420613260555, + "grad_norm": 0.4304579198360443, + "learning_rate": 4.3186896075127595e-05, + "loss": 0.3149, + "step": 7232 + }, + { + "epoch": 2.322363140150907, + "grad_norm": 0.5537540316581726, + "learning_rate": 4.315817357806974e-05, + "loss": 0.4514, + "step": 7233 + }, + { + "epoch": 2.3226842189757586, + "grad_norm": 0.4857487380504608, + "learning_rate": 4.3129458007015946e-05, + "loss": 0.3249, + "step": 7234 + }, + { + "epoch": 2.32300529780061, + "grad_norm": 0.44234228134155273, + "learning_rate": 4.310074936546521e-05, + "loss": 0.2186, + "step": 7235 + }, + { + "epoch": 2.3233263766254617, + "grad_norm": 0.29130613803863525, + "learning_rate": 4.307204765691558e-05, + "loss": 0.0781, + "step": 7236 + }, + { + "epoch": 2.3236474554503133, + "grad_norm": 0.6856487393379211, + "learning_rate": 4.304335288486426e-05, + "loss": 0.3749, + "step": 7237 + }, + { + "epoch": 2.3239685342751644, + "grad_norm": 0.7507144212722778, + "learning_rate": 4.301466505280762e-05, + "loss": 0.3243, + "step": 7238 + }, + { + "epoch": 2.324289613100016, + "grad_norm": 0.8678911328315735, + "learning_rate": 4.29859841642412e-05, + "loss": 0.4114, + "step": 7239 + }, + { + "epoch": 2.3246106919248675, + "grad_norm": 0.8836848139762878, + "learning_rate": 4.295731022265966e-05, + "loss": 0.3738, + "step": 7240 + }, + { + "epoch": 2.324931770749719, + "grad_norm": 0.7541016936302185, + "learning_rate": 4.2928643231556844e-05, + "loss": 0.3071, + "step": 7241 + }, + { + "epoch": 2.3252528495745706, + "grad_norm": 0.8909046053886414, + "learning_rate": 4.289998319442573e-05, + "loss": 0.4172, + "step": 7242 + }, + { + "epoch": 2.325573928399422, + "grad_norm": 0.733267605304718, + "learning_rate": 4.287133011475847e-05, + "loss": 0.3147, + "step": 7243 + }, + { + "epoch": 2.3258950072242737, + "grad_norm": 0.8695718050003052, + "learning_rate": 4.2842683996046327e-05, + "loss": 0.3655, + "step": 7244 + }, + { + "epoch": 2.326216086049125, + "grad_norm": 0.867275059223175, + "learning_rate": 4.2814044841779745e-05, + "loss": 0.4448, + "step": 7245 + }, + { + "epoch": 2.3265371648739768, + "grad_norm": 0.9108153581619263, + "learning_rate": 4.27854126554484e-05, + "loss": 0.3652, + "step": 7246 + }, + { + "epoch": 2.326858243698828, + "grad_norm": 0.5802766680717468, + "learning_rate": 4.2756787440540936e-05, + "loss": 0.2382, + "step": 7247 + }, + { + "epoch": 2.3271793225236794, + "grad_norm": 0.9157735109329224, + "learning_rate": 4.2728169200545286e-05, + "loss": 0.4234, + "step": 7248 + }, + { + "epoch": 2.327500401348531, + "grad_norm": 0.817433774471283, + "learning_rate": 4.26995579389485e-05, + "loss": 0.3119, + "step": 7249 + }, + { + "epoch": 2.3278214801733825, + "grad_norm": 0.793776273727417, + "learning_rate": 4.267095365923672e-05, + "loss": 0.3468, + "step": 7250 + }, + { + "epoch": 2.328142558998234, + "grad_norm": 0.7414565086364746, + "learning_rate": 4.264235636489542e-05, + "loss": 0.3007, + "step": 7251 + }, + { + "epoch": 2.3284636378230856, + "grad_norm": 0.74152672290802, + "learning_rate": 4.261376605940894e-05, + "loss": 0.2868, + "step": 7252 + }, + { + "epoch": 2.328784716647937, + "grad_norm": 0.9075784683227539, + "learning_rate": 4.2585182746261035e-05, + "loss": 0.3053, + "step": 7253 + }, + { + "epoch": 2.3291057954727887, + "grad_norm": 0.8360973000526428, + "learning_rate": 4.2556606428934443e-05, + "loss": 0.3616, + "step": 7254 + }, + { + "epoch": 2.3294268742976403, + "grad_norm": 1.0343592166900635, + "learning_rate": 4.252803711091112e-05, + "loss": 0.2661, + "step": 7255 + }, + { + "epoch": 2.3297479531224914, + "grad_norm": 0.8651792407035828, + "learning_rate": 4.249947479567218e-05, + "loss": 0.398, + "step": 7256 + }, + { + "epoch": 2.330069031947343, + "grad_norm": 0.8300990462303162, + "learning_rate": 4.2470919486697744e-05, + "loss": 0.3371, + "step": 7257 + }, + { + "epoch": 2.3303901107721945, + "grad_norm": 0.7871789932250977, + "learning_rate": 4.244237118746731e-05, + "loss": 0.2825, + "step": 7258 + }, + { + "epoch": 2.330711189597046, + "grad_norm": 0.9724371433258057, + "learning_rate": 4.2413829901459344e-05, + "loss": 0.2868, + "step": 7259 + }, + { + "epoch": 2.3310322684218976, + "grad_norm": 1.072011113166809, + "learning_rate": 4.238529563215153e-05, + "loss": 0.3748, + "step": 7260 + }, + { + "epoch": 2.331353347246749, + "grad_norm": 1.1171770095825195, + "learning_rate": 4.235676838302068e-05, + "loss": 0.5082, + "step": 7261 + }, + { + "epoch": 2.3316744260716007, + "grad_norm": 0.8948763012886047, + "learning_rate": 4.232824815754276e-05, + "loss": 0.3269, + "step": 7262 + }, + { + "epoch": 2.3319955048964522, + "grad_norm": 0.9017735123634338, + "learning_rate": 4.229973495919286e-05, + "loss": 0.3732, + "step": 7263 + }, + { + "epoch": 2.332316583721304, + "grad_norm": 0.8788909316062927, + "learning_rate": 4.227122879144523e-05, + "loss": 0.3456, + "step": 7264 + }, + { + "epoch": 2.332637662546155, + "grad_norm": 0.7098720669746399, + "learning_rate": 4.224272965777326e-05, + "loss": 0.2543, + "step": 7265 + }, + { + "epoch": 2.3329587413710065, + "grad_norm": 1.32795250415802, + "learning_rate": 4.221423756164948e-05, + "loss": 0.3675, + "step": 7266 + }, + { + "epoch": 2.333279820195858, + "grad_norm": 0.7488431930541992, + "learning_rate": 4.2185752506545585e-05, + "loss": 0.3516, + "step": 7267 + }, + { + "epoch": 2.3336008990207096, + "grad_norm": 0.5389450788497925, + "learning_rate": 4.215727449593233e-05, + "loss": 0.2551, + "step": 7268 + }, + { + "epoch": 2.333921977845561, + "grad_norm": 0.5667807459831238, + "learning_rate": 4.212880353327979e-05, + "loss": 0.2823, + "step": 7269 + }, + { + "epoch": 2.3342430566704127, + "grad_norm": 0.6024496555328369, + "learning_rate": 4.210033962205694e-05, + "loss": 0.2439, + "step": 7270 + }, + { + "epoch": 2.334564135495264, + "grad_norm": 0.6424733400344849, + "learning_rate": 4.207188276573214e-05, + "loss": 0.2782, + "step": 7271 + }, + { + "epoch": 2.3348852143201158, + "grad_norm": 1.2001317739486694, + "learning_rate": 4.204343296777265e-05, + "loss": 0.3473, + "step": 7272 + }, + { + "epoch": 2.3352062931449673, + "grad_norm": 0.7400069236755371, + "learning_rate": 4.201499023164508e-05, + "loss": 0.3178, + "step": 7273 + }, + { + "epoch": 2.3355273719698184, + "grad_norm": 0.6566978693008423, + "learning_rate": 4.1986554560815096e-05, + "loss": 0.3215, + "step": 7274 + }, + { + "epoch": 2.33584845079467, + "grad_norm": 0.6812121272087097, + "learning_rate": 4.195812595874739e-05, + "loss": 0.2494, + "step": 7275 + }, + { + "epoch": 2.3361695296195215, + "grad_norm": 0.8875928521156311, + "learning_rate": 4.1929704428906026e-05, + "loss": 0.3022, + "step": 7276 + }, + { + "epoch": 2.336490608444373, + "grad_norm": 0.6352400779724121, + "learning_rate": 4.190128997475402e-05, + "loss": 0.2617, + "step": 7277 + }, + { + "epoch": 2.3368116872692246, + "grad_norm": 0.5926759243011475, + "learning_rate": 4.1872882599753605e-05, + "loss": 0.2664, + "step": 7278 + }, + { + "epoch": 2.337132766094076, + "grad_norm": 0.41694337129592896, + "learning_rate": 4.184448230736613e-05, + "loss": 0.2739, + "step": 7279 + }, + { + "epoch": 2.3374538449189277, + "grad_norm": 0.5080072283744812, + "learning_rate": 4.181608910105207e-05, + "loss": 0.6664, + "step": 7280 + }, + { + "epoch": 2.3377749237437793, + "grad_norm": 0.468008428812027, + "learning_rate": 4.1787702984271074e-05, + "loss": 0.4515, + "step": 7281 + }, + { + "epoch": 2.338096002568631, + "grad_norm": 0.5170953869819641, + "learning_rate": 4.175932396048188e-05, + "loss": 0.2252, + "step": 7282 + }, + { + "epoch": 2.338417081393482, + "grad_norm": 0.46261003613471985, + "learning_rate": 4.173095203314241e-05, + "loss": 0.1327, + "step": 7283 + }, + { + "epoch": 2.3387381602183335, + "grad_norm": 0.5765483975410461, + "learning_rate": 4.170258720570968e-05, + "loss": 0.2548, + "step": 7284 + }, + { + "epoch": 2.339059239043185, + "grad_norm": 0.6683604121208191, + "learning_rate": 4.167422948163986e-05, + "loss": 0.2663, + "step": 7285 + }, + { + "epoch": 2.3393803178680366, + "grad_norm": 0.32332003116607666, + "learning_rate": 4.1645878864388266e-05, + "loss": 0.1391, + "step": 7286 + }, + { + "epoch": 2.339701396692888, + "grad_norm": 1.4612623453140259, + "learning_rate": 4.161753535740932e-05, + "loss": 0.3675, + "step": 7287 + }, + { + "epoch": 2.3400224755177397, + "grad_norm": 0.8074957728385925, + "learning_rate": 4.158919896415656e-05, + "loss": 0.3862, + "step": 7288 + }, + { + "epoch": 2.3403435543425912, + "grad_norm": 0.7950018048286438, + "learning_rate": 4.15608696880828e-05, + "loss": 0.3471, + "step": 7289 + }, + { + "epoch": 2.3406646331674428, + "grad_norm": 0.8618911504745483, + "learning_rate": 4.153254753263974e-05, + "loss": 0.3643, + "step": 7290 + }, + { + "epoch": 2.3409857119922943, + "grad_norm": 0.9194211959838867, + "learning_rate": 4.150423250127845e-05, + "loss": 0.4025, + "step": 7291 + }, + { + "epoch": 2.3413067908171454, + "grad_norm": 0.7834305167198181, + "learning_rate": 4.1475924597449024e-05, + "loss": 0.3126, + "step": 7292 + }, + { + "epoch": 2.341627869641997, + "grad_norm": 0.6242783665657043, + "learning_rate": 4.144762382460059e-05, + "loss": 0.2834, + "step": 7293 + }, + { + "epoch": 2.3419489484668485, + "grad_norm": 0.8568835854530334, + "learning_rate": 4.141933018618165e-05, + "loss": 0.3792, + "step": 7294 + }, + { + "epoch": 2.3422700272917, + "grad_norm": 1.036650538444519, + "learning_rate": 4.1391043685639576e-05, + "loss": 0.4148, + "step": 7295 + }, + { + "epoch": 2.3425911061165516, + "grad_norm": 0.9023113250732422, + "learning_rate": 4.1362764326421064e-05, + "loss": 0.3962, + "step": 7296 + }, + { + "epoch": 2.342912184941403, + "grad_norm": 0.9023537635803223, + "learning_rate": 4.133449211197188e-05, + "loss": 0.3818, + "step": 7297 + }, + { + "epoch": 2.3432332637662547, + "grad_norm": 1.3327304124832153, + "learning_rate": 4.130622704573685e-05, + "loss": 0.2894, + "step": 7298 + }, + { + "epoch": 2.3435543425911063, + "grad_norm": 0.8340547680854797, + "learning_rate": 4.1277969131160045e-05, + "loss": 0.3798, + "step": 7299 + }, + { + "epoch": 2.343875421415958, + "grad_norm": 0.6969391107559204, + "learning_rate": 4.1249718371684564e-05, + "loss": 0.3505, + "step": 7300 + }, + { + "epoch": 2.344196500240809, + "grad_norm": 0.7478737235069275, + "learning_rate": 4.12214747707527e-05, + "loss": 0.3221, + "step": 7301 + }, + { + "epoch": 2.3445175790656605, + "grad_norm": 1.0204554796218872, + "learning_rate": 4.1193238331805826e-05, + "loss": 0.4387, + "step": 7302 + }, + { + "epoch": 2.344838657890512, + "grad_norm": 1.032119870185852, + "learning_rate": 4.11650090582845e-05, + "loss": 0.3519, + "step": 7303 + }, + { + "epoch": 2.3451597367153636, + "grad_norm": 0.8070648908615112, + "learning_rate": 4.1136786953628334e-05, + "loss": 0.3181, + "step": 7304 + }, + { + "epoch": 2.345480815540215, + "grad_norm": 0.9976387023925781, + "learning_rate": 4.110857202127615e-05, + "loss": 0.2975, + "step": 7305 + }, + { + "epoch": 2.3458018943650667, + "grad_norm": 0.7166529297828674, + "learning_rate": 4.1080364264665774e-05, + "loss": 0.2918, + "step": 7306 + }, + { + "epoch": 2.3461229731899182, + "grad_norm": 1.119057297706604, + "learning_rate": 4.1052163687234366e-05, + "loss": 0.3691, + "step": 7307 + }, + { + "epoch": 2.34644405201477, + "grad_norm": 0.8771776556968689, + "learning_rate": 4.1023970292417935e-05, + "loss": 0.3273, + "step": 7308 + }, + { + "epoch": 2.3467651308396213, + "grad_norm": 1.007441759109497, + "learning_rate": 4.099578408365191e-05, + "loss": 0.39, + "step": 7309 + }, + { + "epoch": 2.3470862096644725, + "grad_norm": 0.7695625424385071, + "learning_rate": 4.096760506437057e-05, + "loss": 0.3232, + "step": 7310 + }, + { + "epoch": 2.347407288489324, + "grad_norm": 0.8058918714523315, + "learning_rate": 4.093943323800745e-05, + "loss": 0.279, + "step": 7311 + }, + { + "epoch": 2.3477283673141756, + "grad_norm": 1.1080282926559448, + "learning_rate": 4.0911268607995325e-05, + "loss": 0.4143, + "step": 7312 + }, + { + "epoch": 2.348049446139027, + "grad_norm": 0.9627980589866638, + "learning_rate": 4.08831111777658e-05, + "loss": 0.3703, + "step": 7313 + }, + { + "epoch": 2.3483705249638787, + "grad_norm": 0.8395978808403015, + "learning_rate": 4.08549609507499e-05, + "loss": 0.3836, + "step": 7314 + }, + { + "epoch": 2.34869160378873, + "grad_norm": 0.8003190755844116, + "learning_rate": 4.08268179303776e-05, + "loss": 0.3474, + "step": 7315 + }, + { + "epoch": 2.3490126826135818, + "grad_norm": 0.987792432308197, + "learning_rate": 4.0798682120078044e-05, + "loss": 0.3746, + "step": 7316 + }, + { + "epoch": 2.3493337614384333, + "grad_norm": 0.8142846822738647, + "learning_rate": 4.077055352327953e-05, + "loss": 0.3273, + "step": 7317 + }, + { + "epoch": 2.349654840263285, + "grad_norm": 0.8976936340332031, + "learning_rate": 4.074243214340934e-05, + "loss": 0.3165, + "step": 7318 + }, + { + "epoch": 2.349975919088136, + "grad_norm": 0.8904353380203247, + "learning_rate": 4.071431798389408e-05, + "loss": 0.296, + "step": 7319 + }, + { + "epoch": 2.3502969979129875, + "grad_norm": 1.0113767385482788, + "learning_rate": 4.068621104815934e-05, + "loss": 0.3628, + "step": 7320 + }, + { + "epoch": 2.350618076737839, + "grad_norm": 0.5627405643463135, + "learning_rate": 4.065811133962987e-05, + "loss": 0.2582, + "step": 7321 + }, + { + "epoch": 2.3509391555626906, + "grad_norm": 0.9106473326683044, + "learning_rate": 4.063001886172952e-05, + "loss": 0.3047, + "step": 7322 + }, + { + "epoch": 2.351260234387542, + "grad_norm": 1.275215744972229, + "learning_rate": 4.0601933617881294e-05, + "loss": 0.3613, + "step": 7323 + }, + { + "epoch": 2.3515813132123937, + "grad_norm": 0.7992632985115051, + "learning_rate": 4.057385561150727e-05, + "loss": 0.3085, + "step": 7324 + }, + { + "epoch": 2.3519023920372453, + "grad_norm": 0.7819589972496033, + "learning_rate": 4.05457848460287e-05, + "loss": 0.3735, + "step": 7325 + }, + { + "epoch": 2.352223470862097, + "grad_norm": 1.0428626537322998, + "learning_rate": 4.0517721324865884e-05, + "loss": 0.3611, + "step": 7326 + }, + { + "epoch": 2.3525445496869484, + "grad_norm": 0.7494415640830994, + "learning_rate": 4.048966505143831e-05, + "loss": 0.3112, + "step": 7327 + }, + { + "epoch": 2.3528656285117995, + "grad_norm": 0.7105958461761475, + "learning_rate": 4.0461616029164526e-05, + "loss": 0.3408, + "step": 7328 + }, + { + "epoch": 2.353186707336651, + "grad_norm": 0.5049455165863037, + "learning_rate": 4.0433574261462206e-05, + "loss": 0.3125, + "step": 7329 + }, + { + "epoch": 2.3535077861615026, + "grad_norm": 0.4792654514312744, + "learning_rate": 4.040553975174823e-05, + "loss": 0.6885, + "step": 7330 + }, + { + "epoch": 2.353828864986354, + "grad_norm": 0.450774222612381, + "learning_rate": 4.037751250343841e-05, + "loss": 0.4495, + "step": 7331 + }, + { + "epoch": 2.3541499438112057, + "grad_norm": 0.5843284130096436, + "learning_rate": 4.0349492519947904e-05, + "loss": 0.5532, + "step": 7332 + }, + { + "epoch": 2.3544710226360572, + "grad_norm": 0.535630464553833, + "learning_rate": 4.032147980469072e-05, + "loss": 0.316, + "step": 7333 + }, + { + "epoch": 2.354792101460909, + "grad_norm": 0.4793131649494171, + "learning_rate": 4.0293474361080244e-05, + "loss": 0.1362, + "step": 7334 + }, + { + "epoch": 2.3551131802857603, + "grad_norm": 0.4620800316333771, + "learning_rate": 4.026547619252883e-05, + "loss": 0.1559, + "step": 7335 + }, + { + "epoch": 2.355434259110612, + "grad_norm": 0.44343894720077515, + "learning_rate": 4.023748530244789e-05, + "loss": 0.162, + "step": 7336 + }, + { + "epoch": 2.355755337935463, + "grad_norm": 0.5997874140739441, + "learning_rate": 4.020950169424815e-05, + "loss": 0.2213, + "step": 7337 + }, + { + "epoch": 2.3560764167603145, + "grad_norm": 0.9611859917640686, + "learning_rate": 4.018152537133919e-05, + "loss": 0.5503, + "step": 7338 + }, + { + "epoch": 2.356397495585166, + "grad_norm": 0.9722205996513367, + "learning_rate": 4.015355633712996e-05, + "loss": 0.4235, + "step": 7339 + }, + { + "epoch": 2.3567185744100176, + "grad_norm": 0.6975258588790894, + "learning_rate": 4.012559459502835e-05, + "loss": 0.3188, + "step": 7340 + }, + { + "epoch": 2.357039653234869, + "grad_norm": 0.7221542000770569, + "learning_rate": 4.009764014844143e-05, + "loss": 0.3202, + "step": 7341 + }, + { + "epoch": 2.3573607320597207, + "grad_norm": 0.7738749980926514, + "learning_rate": 4.006969300077534e-05, + "loss": 0.2977, + "step": 7342 + }, + { + "epoch": 2.3576818108845723, + "grad_norm": 0.7268139123916626, + "learning_rate": 4.004175315543538e-05, + "loss": 0.2917, + "step": 7343 + }, + { + "epoch": 2.358002889709424, + "grad_norm": 0.9446465373039246, + "learning_rate": 4.001382061582593e-05, + "loss": 0.3598, + "step": 7344 + }, + { + "epoch": 2.3583239685342754, + "grad_norm": 1.0309242010116577, + "learning_rate": 3.9985895385350456e-05, + "loss": 0.349, + "step": 7345 + }, + { + "epoch": 2.3586450473591265, + "grad_norm": 0.9522349834442139, + "learning_rate": 3.9957977467411615e-05, + "loss": 0.3701, + "step": 7346 + }, + { + "epoch": 2.358966126183978, + "grad_norm": 0.9686524271965027, + "learning_rate": 3.9930066865411075e-05, + "loss": 0.4144, + "step": 7347 + }, + { + "epoch": 2.3592872050088296, + "grad_norm": 0.9719577431678772, + "learning_rate": 3.990216358274969e-05, + "loss": 0.3456, + "step": 7348 + }, + { + "epoch": 2.359608283833681, + "grad_norm": 0.8357756733894348, + "learning_rate": 3.987426762282733e-05, + "loss": 0.3546, + "step": 7349 + }, + { + "epoch": 2.3599293626585327, + "grad_norm": 0.7465749979019165, + "learning_rate": 3.9846378989043156e-05, + "loss": 0.3632, + "step": 7350 + }, + { + "epoch": 2.3602504414833843, + "grad_norm": 1.1760458946228027, + "learning_rate": 3.981849768479517e-05, + "loss": 0.3777, + "step": 7351 + }, + { + "epoch": 2.360571520308236, + "grad_norm": 0.82599937915802, + "learning_rate": 3.979062371348075e-05, + "loss": 0.3254, + "step": 7352 + }, + { + "epoch": 2.3608925991330874, + "grad_norm": 0.9712439179420471, + "learning_rate": 3.976275707849616e-05, + "loss": 0.4092, + "step": 7353 + }, + { + "epoch": 2.361213677957939, + "grad_norm": 1.3784904479980469, + "learning_rate": 3.973489778323688e-05, + "loss": 0.433, + "step": 7354 + }, + { + "epoch": 2.36153475678279, + "grad_norm": 0.9245650172233582, + "learning_rate": 3.9707045831097555e-05, + "loss": 0.343, + "step": 7355 + }, + { + "epoch": 2.3618558356076416, + "grad_norm": 0.994295060634613, + "learning_rate": 3.967920122547175e-05, + "loss": 0.3945, + "step": 7356 + }, + { + "epoch": 2.362176914432493, + "grad_norm": 0.8298449516296387, + "learning_rate": 3.9651363969752344e-05, + "loss": 0.3194, + "step": 7357 + }, + { + "epoch": 2.3624979932573447, + "grad_norm": 1.0529513359069824, + "learning_rate": 3.962353406733117e-05, + "loss": 0.3887, + "step": 7358 + }, + { + "epoch": 2.362819072082196, + "grad_norm": 1.0410268306732178, + "learning_rate": 3.9595711521599224e-05, + "loss": 0.4083, + "step": 7359 + }, + { + "epoch": 2.3631401509070478, + "grad_norm": 0.7109187245368958, + "learning_rate": 3.956789633594661e-05, + "loss": 0.2836, + "step": 7360 + }, + { + "epoch": 2.3634612297318993, + "grad_norm": 1.154443383216858, + "learning_rate": 3.954008851376252e-05, + "loss": 0.3007, + "step": 7361 + }, + { + "epoch": 2.363782308556751, + "grad_norm": 0.7189384698867798, + "learning_rate": 3.9512288058435256e-05, + "loss": 0.2745, + "step": 7362 + }, + { + "epoch": 2.3641033873816024, + "grad_norm": 0.5539619326591492, + "learning_rate": 3.948449497335219e-05, + "loss": 0.2625, + "step": 7363 + }, + { + "epoch": 2.3644244662064535, + "grad_norm": 0.5823628306388855, + "learning_rate": 3.945670926189987e-05, + "loss": 0.2553, + "step": 7364 + }, + { + "epoch": 2.364745545031305, + "grad_norm": 0.793720006942749, + "learning_rate": 3.942893092746387e-05, + "loss": 0.315, + "step": 7365 + }, + { + "epoch": 2.3650666238561566, + "grad_norm": 0.7939404845237732, + "learning_rate": 3.940115997342891e-05, + "loss": 0.3487, + "step": 7366 + }, + { + "epoch": 2.365387702681008, + "grad_norm": 0.80466628074646, + "learning_rate": 3.9373396403178786e-05, + "loss": 0.3018, + "step": 7367 + }, + { + "epoch": 2.3657087815058597, + "grad_norm": 1.0654207468032837, + "learning_rate": 3.9345640220096417e-05, + "loss": 0.4041, + "step": 7368 + }, + { + "epoch": 2.3660298603307113, + "grad_norm": 1.0553652048110962, + "learning_rate": 3.931789142756377e-05, + "loss": 0.3632, + "step": 7369 + }, + { + "epoch": 2.366350939155563, + "grad_norm": 0.7207555770874023, + "learning_rate": 3.9290150028962044e-05, + "loss": 0.3082, + "step": 7370 + }, + { + "epoch": 2.3666720179804144, + "grad_norm": 1.4064568281173706, + "learning_rate": 3.9262416027671356e-05, + "loss": 0.4244, + "step": 7371 + }, + { + "epoch": 2.366993096805266, + "grad_norm": 0.7677832841873169, + "learning_rate": 3.9234689427071006e-05, + "loss": 0.2716, + "step": 7372 + }, + { + "epoch": 2.367314175630117, + "grad_norm": 0.6761558651924133, + "learning_rate": 3.920697023053949e-05, + "loss": 0.2968, + "step": 7373 + }, + { + "epoch": 2.3676352544549686, + "grad_norm": 0.7641769051551819, + "learning_rate": 3.917925844145418e-05, + "loss": 0.3176, + "step": 7374 + }, + { + "epoch": 2.36795633327982, + "grad_norm": 0.5635340213775635, + "learning_rate": 3.915155406319181e-05, + "loss": 0.2882, + "step": 7375 + }, + { + "epoch": 2.3682774121046717, + "grad_norm": 0.689179539680481, + "learning_rate": 3.9123857099127936e-05, + "loss": 0.2785, + "step": 7376 + }, + { + "epoch": 2.3685984909295232, + "grad_norm": 0.6026139259338379, + "learning_rate": 3.9096167552637454e-05, + "loss": 0.2739, + "step": 7377 + }, + { + "epoch": 2.368919569754375, + "grad_norm": 0.6535028219223022, + "learning_rate": 3.9068485427094205e-05, + "loss": 0.2989, + "step": 7378 + }, + { + "epoch": 2.3692406485792263, + "grad_norm": 1.0952959060668945, + "learning_rate": 3.904081072587119e-05, + "loss": 0.3505, + "step": 7379 + }, + { + "epoch": 2.369561727404078, + "grad_norm": 0.41570642590522766, + "learning_rate": 3.9013143452340475e-05, + "loss": 0.3985, + "step": 7380 + }, + { + "epoch": 2.3698828062289294, + "grad_norm": 0.5073022246360779, + "learning_rate": 3.8985483609873244e-05, + "loss": 0.4265, + "step": 7381 + }, + { + "epoch": 2.3702038850537805, + "grad_norm": 0.5192015171051025, + "learning_rate": 3.895783120183976e-05, + "loss": 0.1128, + "step": 7382 + }, + { + "epoch": 2.370524963878632, + "grad_norm": 0.46728768944740295, + "learning_rate": 3.893018623160938e-05, + "loss": 0.1821, + "step": 7383 + }, + { + "epoch": 2.3708460427034836, + "grad_norm": 0.4381241202354431, + "learning_rate": 3.890254870255055e-05, + "loss": 0.1916, + "step": 7384 + }, + { + "epoch": 2.371167121528335, + "grad_norm": 0.6580232977867126, + "learning_rate": 3.887491861803085e-05, + "loss": 0.3314, + "step": 7385 + }, + { + "epoch": 2.3714882003531867, + "grad_norm": 1.0685169696807861, + "learning_rate": 3.8847295981416896e-05, + "loss": 0.5294, + "step": 7386 + }, + { + "epoch": 2.3718092791780383, + "grad_norm": 1.036335825920105, + "learning_rate": 3.88196807960744e-05, + "loss": 0.4178, + "step": 7387 + }, + { + "epoch": 2.37213035800289, + "grad_norm": 0.7811726331710815, + "learning_rate": 3.879207306536829e-05, + "loss": 0.3555, + "step": 7388 + }, + { + "epoch": 2.3724514368277414, + "grad_norm": 0.7611583471298218, + "learning_rate": 3.876447279266238e-05, + "loss": 0.2854, + "step": 7389 + }, + { + "epoch": 2.372772515652593, + "grad_norm": 0.8770664930343628, + "learning_rate": 3.8736879981319695e-05, + "loss": 0.398, + "step": 7390 + }, + { + "epoch": 2.373093594477444, + "grad_norm": 1.128092885017395, + "learning_rate": 3.8709294634702376e-05, + "loss": 0.3486, + "step": 7391 + }, + { + "epoch": 2.3734146733022956, + "grad_norm": 1.1329503059387207, + "learning_rate": 3.868171675617155e-05, + "loss": 0.2897, + "step": 7392 + }, + { + "epoch": 2.373735752127147, + "grad_norm": 0.9214721322059631, + "learning_rate": 3.8654146349087606e-05, + "loss": 0.3414, + "step": 7393 + }, + { + "epoch": 2.3740568309519987, + "grad_norm": 0.9622142910957336, + "learning_rate": 3.862658341680977e-05, + "loss": 0.2992, + "step": 7394 + }, + { + "epoch": 2.3743779097768503, + "grad_norm": 0.94224613904953, + "learning_rate": 3.859902796269663e-05, + "loss": 0.2816, + "step": 7395 + }, + { + "epoch": 2.374698988601702, + "grad_norm": 0.7693343758583069, + "learning_rate": 3.857147999010567e-05, + "loss": 0.2936, + "step": 7396 + }, + { + "epoch": 2.3750200674265534, + "grad_norm": 0.846794068813324, + "learning_rate": 3.854393950239355e-05, + "loss": 0.2686, + "step": 7397 + }, + { + "epoch": 2.375341146251405, + "grad_norm": 0.8755759000778198, + "learning_rate": 3.851640650291603e-05, + "loss": 0.3316, + "step": 7398 + }, + { + "epoch": 2.3756622250762565, + "grad_norm": 0.867134153842926, + "learning_rate": 3.8488880995027786e-05, + "loss": 0.2939, + "step": 7399 + }, + { + "epoch": 2.3759833039011076, + "grad_norm": 0.9087311029434204, + "learning_rate": 3.846136298208285e-05, + "loss": 0.4108, + "step": 7400 + }, + { + "epoch": 2.376304382725959, + "grad_norm": 1.0739927291870117, + "learning_rate": 3.843385246743417e-05, + "loss": 0.5237, + "step": 7401 + }, + { + "epoch": 2.3766254615508107, + "grad_norm": 0.8434580564498901, + "learning_rate": 3.840634945443382e-05, + "loss": 0.3213, + "step": 7402 + }, + { + "epoch": 2.376946540375662, + "grad_norm": 0.887002170085907, + "learning_rate": 3.837885394643296e-05, + "loss": 0.3309, + "step": 7403 + }, + { + "epoch": 2.3772676192005138, + "grad_norm": 0.9817523956298828, + "learning_rate": 3.835136594678183e-05, + "loss": 0.3857, + "step": 7404 + }, + { + "epoch": 2.3775886980253653, + "grad_norm": 0.9014565944671631, + "learning_rate": 3.832388545882976e-05, + "loss": 0.3774, + "step": 7405 + }, + { + "epoch": 2.377909776850217, + "grad_norm": 0.9643536806106567, + "learning_rate": 3.829641248592515e-05, + "loss": 0.3548, + "step": 7406 + }, + { + "epoch": 2.3782308556750684, + "grad_norm": 0.7037240266799927, + "learning_rate": 3.8268947031415514e-05, + "loss": 0.2919, + "step": 7407 + }, + { + "epoch": 2.37855193449992, + "grad_norm": 0.9688251614570618, + "learning_rate": 3.824148909864744e-05, + "loss": 0.3215, + "step": 7408 + }, + { + "epoch": 2.378873013324771, + "grad_norm": 0.6717649698257446, + "learning_rate": 3.821403869096658e-05, + "loss": 0.3176, + "step": 7409 + }, + { + "epoch": 2.3791940921496226, + "grad_norm": 0.7562946081161499, + "learning_rate": 3.818659581171766e-05, + "loss": 0.3263, + "step": 7410 + }, + { + "epoch": 2.379515170974474, + "grad_norm": 0.9467628598213196, + "learning_rate": 3.8159160464244606e-05, + "loss": 0.3804, + "step": 7411 + }, + { + "epoch": 2.3798362497993257, + "grad_norm": 0.824565052986145, + "learning_rate": 3.81317326518902e-05, + "loss": 0.3667, + "step": 7412 + }, + { + "epoch": 2.3801573286241773, + "grad_norm": 0.9573568105697632, + "learning_rate": 3.8104312377996564e-05, + "loss": 0.4441, + "step": 7413 + }, + { + "epoch": 2.380478407449029, + "grad_norm": 0.6948115229606628, + "learning_rate": 3.807689964590466e-05, + "loss": 0.2781, + "step": 7414 + }, + { + "epoch": 2.3807994862738804, + "grad_norm": 0.6470552682876587, + "learning_rate": 3.8049494458954725e-05, + "loss": 0.3053, + "step": 7415 + }, + { + "epoch": 2.3811205650987315, + "grad_norm": 0.6895164847373962, + "learning_rate": 3.802209682048602e-05, + "loss": 0.3258, + "step": 7416 + }, + { + "epoch": 2.3814416439235835, + "grad_norm": 1.2019386291503906, + "learning_rate": 3.799470673383674e-05, + "loss": 0.3939, + "step": 7417 + }, + { + "epoch": 2.3817627227484346, + "grad_norm": 0.884876012802124, + "learning_rate": 3.796732420234443e-05, + "loss": 0.3349, + "step": 7418 + }, + { + "epoch": 2.382083801573286, + "grad_norm": 0.8389701247215271, + "learning_rate": 3.793994922934544e-05, + "loss": 0.3409, + "step": 7419 + }, + { + "epoch": 2.3824048803981377, + "grad_norm": 0.9509716629981995, + "learning_rate": 3.791258181817542e-05, + "loss": 0.3532, + "step": 7420 + }, + { + "epoch": 2.3827259592229892, + "grad_norm": 0.8582257628440857, + "learning_rate": 3.788522197216897e-05, + "loss": 0.3035, + "step": 7421 + }, + { + "epoch": 2.383047038047841, + "grad_norm": 0.9152477979660034, + "learning_rate": 3.785786969465981e-05, + "loss": 0.3324, + "step": 7422 + }, + { + "epoch": 2.3833681168726923, + "grad_norm": 0.8005543947219849, + "learning_rate": 3.783052498898073e-05, + "loss": 0.3398, + "step": 7423 + }, + { + "epoch": 2.383689195697544, + "grad_norm": 0.5987676382064819, + "learning_rate": 3.7803187858463605e-05, + "loss": 0.287, + "step": 7424 + }, + { + "epoch": 2.384010274522395, + "grad_norm": 0.4976361393928528, + "learning_rate": 3.777585830643937e-05, + "loss": 0.2578, + "step": 7425 + }, + { + "epoch": 2.384331353347247, + "grad_norm": 0.7815603017807007, + "learning_rate": 3.774853633623806e-05, + "loss": 0.3316, + "step": 7426 + }, + { + "epoch": 2.384652432172098, + "grad_norm": 0.36369022727012634, + "learning_rate": 3.7721221951188765e-05, + "loss": 0.2671, + "step": 7427 + }, + { + "epoch": 2.3849735109969497, + "grad_norm": 0.5498353242874146, + "learning_rate": 3.769391515461966e-05, + "loss": 0.2742, + "step": 7428 + }, + { + "epoch": 2.385294589821801, + "grad_norm": 0.7687121629714966, + "learning_rate": 3.766661594985801e-05, + "loss": 0.2926, + "step": 7429 + }, + { + "epoch": 2.3856156686466528, + "grad_norm": 0.5641812086105347, + "learning_rate": 3.7639324340230085e-05, + "loss": 0.5458, + "step": 7430 + }, + { + "epoch": 2.3859367474715043, + "grad_norm": 0.49390679597854614, + "learning_rate": 3.7612040329061405e-05, + "loss": 0.5351, + "step": 7431 + }, + { + "epoch": 2.386257826296356, + "grad_norm": 0.48149552941322327, + "learning_rate": 3.7584763919676294e-05, + "loss": 0.2797, + "step": 7432 + }, + { + "epoch": 2.3865789051212074, + "grad_norm": 0.45125648379325867, + "learning_rate": 3.755749511539845e-05, + "loss": 0.2214, + "step": 7433 + }, + { + "epoch": 2.3868999839460585, + "grad_norm": 0.28079113364219666, + "learning_rate": 3.753023391955037e-05, + "loss": 0.0733, + "step": 7434 + }, + { + "epoch": 2.3872210627709105, + "grad_norm": 0.6530325412750244, + "learning_rate": 3.7502980335453774e-05, + "loss": 0.3432, + "step": 7435 + }, + { + "epoch": 2.3875421415957616, + "grad_norm": 0.9349000453948975, + "learning_rate": 3.747573436642951e-05, + "loss": 0.4311, + "step": 7436 + }, + { + "epoch": 2.387863220420613, + "grad_norm": 0.8858532905578613, + "learning_rate": 3.7448496015797295e-05, + "loss": 0.4246, + "step": 7437 + }, + { + "epoch": 2.3881842992454647, + "grad_norm": 1.0124034881591797, + "learning_rate": 3.742126528687614e-05, + "loss": 0.3771, + "step": 7438 + }, + { + "epoch": 2.3885053780703163, + "grad_norm": 0.7207339406013489, + "learning_rate": 3.739404218298398e-05, + "loss": 0.2938, + "step": 7439 + }, + { + "epoch": 2.388826456895168, + "grad_norm": 0.8159089684486389, + "learning_rate": 3.736682670743787e-05, + "loss": 0.3231, + "step": 7440 + }, + { + "epoch": 2.3891475357200194, + "grad_norm": 0.8773292303085327, + "learning_rate": 3.733961886355398e-05, + "loss": 0.3656, + "step": 7441 + }, + { + "epoch": 2.389468614544871, + "grad_norm": 0.7147423624992371, + "learning_rate": 3.7312418654647406e-05, + "loss": 0.311, + "step": 7442 + }, + { + "epoch": 2.389789693369722, + "grad_norm": 0.8349779844284058, + "learning_rate": 3.728522608403249e-05, + "loss": 0.3847, + "step": 7443 + }, + { + "epoch": 2.3901107721945736, + "grad_norm": 0.7083280682563782, + "learning_rate": 3.725804115502254e-05, + "loss": 0.265, + "step": 7444 + }, + { + "epoch": 2.390431851019425, + "grad_norm": 0.8586387634277344, + "learning_rate": 3.7230863870929964e-05, + "loss": 0.3667, + "step": 7445 + }, + { + "epoch": 2.3907529298442767, + "grad_norm": 0.7868812680244446, + "learning_rate": 3.720369423506622e-05, + "loss": 0.335, + "step": 7446 + }, + { + "epoch": 2.3910740086691282, + "grad_norm": 0.919206976890564, + "learning_rate": 3.717653225074186e-05, + "loss": 0.4075, + "step": 7447 + }, + { + "epoch": 2.3913950874939798, + "grad_norm": 0.9069111347198486, + "learning_rate": 3.714937792126647e-05, + "loss": 0.3928, + "step": 7448 + }, + { + "epoch": 2.3917161663188313, + "grad_norm": 0.7450005412101746, + "learning_rate": 3.712223124994875e-05, + "loss": 0.3179, + "step": 7449 + }, + { + "epoch": 2.392037245143683, + "grad_norm": 0.90874844789505, + "learning_rate": 3.709509224009641e-05, + "loss": 0.3934, + "step": 7450 + }, + { + "epoch": 2.3923583239685344, + "grad_norm": 0.797484815120697, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.3171, + "step": 7451 + }, + { + "epoch": 2.3926794027933855, + "grad_norm": 1.188254475593567, + "learning_rate": 3.704083721801421e-05, + "loss": 0.4681, + "step": 7452 + }, + { + "epoch": 2.393000481618237, + "grad_norm": 0.8540223836898804, + "learning_rate": 3.701372121239512e-05, + "loss": 0.2897, + "step": 7453 + }, + { + "epoch": 2.3933215604430886, + "grad_norm": 0.6741155385971069, + "learning_rate": 3.698661288146311e-05, + "loss": 0.265, + "step": 7454 + }, + { + "epoch": 2.39364263926794, + "grad_norm": 0.8153589367866516, + "learning_rate": 3.695951222852112e-05, + "loss": 0.3477, + "step": 7455 + }, + { + "epoch": 2.3939637180927917, + "grad_norm": 0.7138598561286926, + "learning_rate": 3.6932419256871406e-05, + "loss": 0.3001, + "step": 7456 + }, + { + "epoch": 2.3942847969176433, + "grad_norm": 1.1851181983947754, + "learning_rate": 3.690533396981504e-05, + "loss": 0.4506, + "step": 7457 + }, + { + "epoch": 2.394605875742495, + "grad_norm": 0.9050045013427734, + "learning_rate": 3.687825637065236e-05, + "loss": 0.3637, + "step": 7458 + }, + { + "epoch": 2.3949269545673464, + "grad_norm": 0.7797844409942627, + "learning_rate": 3.685118646268272e-05, + "loss": 0.336, + "step": 7459 + }, + { + "epoch": 2.395248033392198, + "grad_norm": 1.0224764347076416, + "learning_rate": 3.682412424920438e-05, + "loss": 0.3581, + "step": 7460 + }, + { + "epoch": 2.395569112217049, + "grad_norm": 0.8182712197303772, + "learning_rate": 3.679706973351491e-05, + "loss": 0.307, + "step": 7461 + }, + { + "epoch": 2.3958901910419006, + "grad_norm": 0.8494866490364075, + "learning_rate": 3.677002291891078e-05, + "loss": 0.3239, + "step": 7462 + }, + { + "epoch": 2.396211269866752, + "grad_norm": 0.9537984728813171, + "learning_rate": 3.674298380868756e-05, + "loss": 0.3285, + "step": 7463 + }, + { + "epoch": 2.3965323486916037, + "grad_norm": 0.9284290075302124, + "learning_rate": 3.6715952406139885e-05, + "loss": 0.3925, + "step": 7464 + }, + { + "epoch": 2.3968534275164552, + "grad_norm": 1.1413391828536987, + "learning_rate": 3.668892871456144e-05, + "loss": 0.3875, + "step": 7465 + }, + { + "epoch": 2.397174506341307, + "grad_norm": 0.987825870513916, + "learning_rate": 3.6661912737245e-05, + "loss": 0.3362, + "step": 7466 + }, + { + "epoch": 2.3974955851661583, + "grad_norm": 0.8978787660598755, + "learning_rate": 3.6634904477482354e-05, + "loss": 0.3855, + "step": 7467 + }, + { + "epoch": 2.39781666399101, + "grad_norm": 0.7444339990615845, + "learning_rate": 3.6607903938564405e-05, + "loss": 0.2704, + "step": 7468 + }, + { + "epoch": 2.3981377428158615, + "grad_norm": 1.1076825857162476, + "learning_rate": 3.6580911123781056e-05, + "loss": 0.4605, + "step": 7469 + }, + { + "epoch": 2.3984588216407126, + "grad_norm": 0.8118841052055359, + "learning_rate": 3.6553926036421326e-05, + "loss": 0.2976, + "step": 7470 + }, + { + "epoch": 2.398779900465564, + "grad_norm": 0.8877241611480713, + "learning_rate": 3.6526948679773257e-05, + "loss": 0.3332, + "step": 7471 + }, + { + "epoch": 2.3991009792904157, + "grad_norm": 0.7601701617240906, + "learning_rate": 3.649997905712396e-05, + "loss": 0.3221, + "step": 7472 + }, + { + "epoch": 2.399422058115267, + "grad_norm": 1.4962290525436401, + "learning_rate": 3.647301717175956e-05, + "loss": 0.4457, + "step": 7473 + }, + { + "epoch": 2.3997431369401188, + "grad_norm": 0.8516422510147095, + "learning_rate": 3.6446063026965384e-05, + "loss": 0.3285, + "step": 7474 + }, + { + "epoch": 2.4000642157649703, + "grad_norm": 0.5712471008300781, + "learning_rate": 3.641911662602559e-05, + "loss": 0.2645, + "step": 7475 + }, + { + "epoch": 2.400385294589822, + "grad_norm": 0.7448149919509888, + "learning_rate": 3.6392177972223594e-05, + "loss": 0.3071, + "step": 7476 + }, + { + "epoch": 2.4007063734146734, + "grad_norm": 0.5864403247833252, + "learning_rate": 3.6365247068841814e-05, + "loss": 0.2586, + "step": 7477 + }, + { + "epoch": 2.401027452239525, + "grad_norm": 0.6279107928276062, + "learning_rate": 3.633832391916159e-05, + "loss": 0.2896, + "step": 7478 + }, + { + "epoch": 2.401348531064376, + "grad_norm": 0.7097012996673584, + "learning_rate": 3.631140852646355e-05, + "loss": 0.3143, + "step": 7479 + }, + { + "epoch": 2.4016696098892276, + "grad_norm": 0.42605191469192505, + "learning_rate": 3.628450089402713e-05, + "loss": 0.4756, + "step": 7480 + }, + { + "epoch": 2.401990688714079, + "grad_norm": 0.4604340195655823, + "learning_rate": 3.6257601025131026e-05, + "loss": 0.7031, + "step": 7481 + }, + { + "epoch": 2.4023117675389307, + "grad_norm": 0.3688209652900696, + "learning_rate": 3.62307089230529e-05, + "loss": 0.247, + "step": 7482 + }, + { + "epoch": 2.4026328463637823, + "grad_norm": 0.48397693037986755, + "learning_rate": 3.6203824591069456e-05, + "loss": 0.2488, + "step": 7483 + }, + { + "epoch": 2.402953925188634, + "grad_norm": 0.6211444735527039, + "learning_rate": 3.6176948032456473e-05, + "loss": 0.3253, + "step": 7484 + }, + { + "epoch": 2.4032750040134854, + "grad_norm": 0.5152127146720886, + "learning_rate": 3.615007925048878e-05, + "loss": 0.209, + "step": 7485 + }, + { + "epoch": 2.403596082838337, + "grad_norm": 0.44418853521347046, + "learning_rate": 3.612321824844024e-05, + "loss": 0.0993, + "step": 7486 + }, + { + "epoch": 2.4039171616631885, + "grad_norm": 0.5577342510223389, + "learning_rate": 3.60963650295838e-05, + "loss": 0.2749, + "step": 7487 + }, + { + "epoch": 2.4042382404880396, + "grad_norm": 0.7818149328231812, + "learning_rate": 3.606951959719145e-05, + "loss": 0.4772, + "step": 7488 + }, + { + "epoch": 2.404559319312891, + "grad_norm": 0.7911422252655029, + "learning_rate": 3.604268195453421e-05, + "loss": 0.3716, + "step": 7489 + }, + { + "epoch": 2.4048803981377427, + "grad_norm": 0.8051799535751343, + "learning_rate": 3.601585210488218e-05, + "loss": 0.4383, + "step": 7490 + }, + { + "epoch": 2.4052014769625942, + "grad_norm": 0.8039785623550415, + "learning_rate": 3.5989030051504434e-05, + "loss": 0.3691, + "step": 7491 + }, + { + "epoch": 2.405522555787446, + "grad_norm": 0.6632509827613831, + "learning_rate": 3.59622157976693e-05, + "loss": 0.2926, + "step": 7492 + }, + { + "epoch": 2.4058436346122973, + "grad_norm": 0.8523510694503784, + "learning_rate": 3.5935409346643835e-05, + "loss": 0.3532, + "step": 7493 + }, + { + "epoch": 2.406164713437149, + "grad_norm": 0.9791433811187744, + "learning_rate": 3.590861070169449e-05, + "loss": 0.3427, + "step": 7494 + }, + { + "epoch": 2.4064857922620004, + "grad_norm": 0.6575258374214172, + "learning_rate": 3.5881819866086484e-05, + "loss": 0.3172, + "step": 7495 + }, + { + "epoch": 2.406806871086852, + "grad_norm": 0.833772599697113, + "learning_rate": 3.585503684308421e-05, + "loss": 0.414, + "step": 7496 + }, + { + "epoch": 2.407127949911703, + "grad_norm": 0.9059905409812927, + "learning_rate": 3.582826163595119e-05, + "loss": 0.3809, + "step": 7497 + }, + { + "epoch": 2.4074490287365546, + "grad_norm": 1.0018295049667358, + "learning_rate": 3.580149424794976e-05, + "loss": 0.4041, + "step": 7498 + }, + { + "epoch": 2.407770107561406, + "grad_norm": 0.6466946005821228, + "learning_rate": 3.577473468234156e-05, + "loss": 0.2834, + "step": 7499 + }, + { + "epoch": 2.4080911863862577, + "grad_norm": 0.7386956214904785, + "learning_rate": 3.574798294238713e-05, + "loss": 0.3142, + "step": 7500 + }, + { + "epoch": 2.4084122652111093, + "grad_norm": 0.9483809471130371, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.3326, + "step": 7501 + }, + { + "epoch": 2.408733344035961, + "grad_norm": 0.9816690683364868, + "learning_rate": 3.569450295247706e-05, + "loss": 0.3653, + "step": 7502 + }, + { + "epoch": 2.4090544228608124, + "grad_norm": 0.8358184099197388, + "learning_rate": 3.56677747090378e-05, + "loss": 0.3264, + "step": 7503 + }, + { + "epoch": 2.409375501685664, + "grad_norm": 0.8334686160087585, + "learning_rate": 3.564105430428506e-05, + "loss": 0.3277, + "step": 7504 + }, + { + "epoch": 2.4096965805105155, + "grad_norm": 0.8004488945007324, + "learning_rate": 3.561434174147463e-05, + "loss": 0.2976, + "step": 7505 + }, + { + "epoch": 2.4100176593353666, + "grad_norm": 0.6906147599220276, + "learning_rate": 3.558763702386135e-05, + "loss": 0.2825, + "step": 7506 + }, + { + "epoch": 2.410338738160218, + "grad_norm": 0.8044391870498657, + "learning_rate": 3.556094015469913e-05, + "loss": 0.3592, + "step": 7507 + }, + { + "epoch": 2.4106598169850697, + "grad_norm": 0.8112503290176392, + "learning_rate": 3.553425113724088e-05, + "loss": 0.36, + "step": 7508 + }, + { + "epoch": 2.4109808958099213, + "grad_norm": 0.9423220753669739, + "learning_rate": 3.5507569974738574e-05, + "loss": 0.3293, + "step": 7509 + }, + { + "epoch": 2.411301974634773, + "grad_norm": 0.973623514175415, + "learning_rate": 3.548089667044325e-05, + "loss": 0.4033, + "step": 7510 + }, + { + "epoch": 2.4116230534596244, + "grad_norm": 0.937961995601654, + "learning_rate": 3.545423122760493e-05, + "loss": 0.3519, + "step": 7511 + }, + { + "epoch": 2.411944132284476, + "grad_norm": 0.8615179061889648, + "learning_rate": 3.542757364947281e-05, + "loss": 0.4388, + "step": 7512 + }, + { + "epoch": 2.4122652111093275, + "grad_norm": 0.753441572189331, + "learning_rate": 3.540092393929494e-05, + "loss": 0.3198, + "step": 7513 + }, + { + "epoch": 2.412586289934179, + "grad_norm": 0.9324014186859131, + "learning_rate": 3.537428210031849e-05, + "loss": 0.3705, + "step": 7514 + }, + { + "epoch": 2.41290736875903, + "grad_norm": 0.7881529927253723, + "learning_rate": 3.534764813578982e-05, + "loss": 0.2409, + "step": 7515 + }, + { + "epoch": 2.4132284475838817, + "grad_norm": 0.6942733526229858, + "learning_rate": 3.5321022048954035e-05, + "loss": 0.2954, + "step": 7516 + }, + { + "epoch": 2.413549526408733, + "grad_norm": 0.7293675541877747, + "learning_rate": 3.52944038430556e-05, + "loss": 0.3183, + "step": 7517 + }, + { + "epoch": 2.4138706052335848, + "grad_norm": 1.0365976095199585, + "learning_rate": 3.52677935213377e-05, + "loss": 0.3541, + "step": 7518 + }, + { + "epoch": 2.4141916840584363, + "grad_norm": 0.6401132345199585, + "learning_rate": 3.524119108704286e-05, + "loss": 0.2692, + "step": 7519 + }, + { + "epoch": 2.414512762883288, + "grad_norm": 0.7052464485168457, + "learning_rate": 3.521459654341244e-05, + "loss": 0.3227, + "step": 7520 + }, + { + "epoch": 2.4148338417081394, + "grad_norm": 0.4697404205799103, + "learning_rate": 3.518800989368691e-05, + "loss": 0.2497, + "step": 7521 + }, + { + "epoch": 2.415154920532991, + "grad_norm": 0.8850452303886414, + "learning_rate": 3.516143114110582e-05, + "loss": 0.3252, + "step": 7522 + }, + { + "epoch": 2.4154759993578425, + "grad_norm": 0.8042798638343811, + "learning_rate": 3.51348602889076e-05, + "loss": 0.3285, + "step": 7523 + }, + { + "epoch": 2.4157970781826936, + "grad_norm": 0.9007611274719238, + "learning_rate": 3.510829734032993e-05, + "loss": 0.3187, + "step": 7524 + }, + { + "epoch": 2.416118157007545, + "grad_norm": 1.6435385942459106, + "learning_rate": 3.50817422986094e-05, + "loss": 0.2828, + "step": 7525 + }, + { + "epoch": 2.4164392358323967, + "grad_norm": 0.31267601251602173, + "learning_rate": 3.5055195166981645e-05, + "loss": 0.2312, + "step": 7526 + }, + { + "epoch": 2.4167603146572483, + "grad_norm": 0.6988046169281006, + "learning_rate": 3.5028655948681355e-05, + "loss": 0.314, + "step": 7527 + }, + { + "epoch": 2.4170813934821, + "grad_norm": 0.6505711078643799, + "learning_rate": 3.500212464694227e-05, + "loss": 0.289, + "step": 7528 + }, + { + "epoch": 2.4174024723069514, + "grad_norm": 0.6511385440826416, + "learning_rate": 3.497560126499709e-05, + "loss": 0.298, + "step": 7529 + }, + { + "epoch": 2.417723551131803, + "grad_norm": 0.5870752930641174, + "learning_rate": 3.494908580607774e-05, + "loss": 0.7781, + "step": 7530 + }, + { + "epoch": 2.4180446299566545, + "grad_norm": 0.5827964544296265, + "learning_rate": 3.492257827341492e-05, + "loss": 0.5468, + "step": 7531 + }, + { + "epoch": 2.418365708781506, + "grad_norm": 0.3877847194671631, + "learning_rate": 3.489607867023854e-05, + "loss": 0.2175, + "step": 7532 + }, + { + "epoch": 2.418686787606357, + "grad_norm": 0.42042702436447144, + "learning_rate": 3.4869586999777495e-05, + "loss": 0.1292, + "step": 7533 + }, + { + "epoch": 2.4190078664312087, + "grad_norm": 0.42274191975593567, + "learning_rate": 3.484310326525967e-05, + "loss": 0.0746, + "step": 7534 + }, + { + "epoch": 2.4193289452560602, + "grad_norm": 0.24631276726722717, + "learning_rate": 3.481662746991214e-05, + "loss": 0.0749, + "step": 7535 + }, + { + "epoch": 2.419650024080912, + "grad_norm": 0.48242461681365967, + "learning_rate": 3.479015961696077e-05, + "loss": 0.1554, + "step": 7536 + }, + { + "epoch": 2.4199711029057633, + "grad_norm": 0.8864598870277405, + "learning_rate": 3.4763699709630716e-05, + "loss": 0.4802, + "step": 7537 + }, + { + "epoch": 2.420292181730615, + "grad_norm": 1.016950249671936, + "learning_rate": 3.4737247751145896e-05, + "loss": 0.5052, + "step": 7538 + }, + { + "epoch": 2.4206132605554664, + "grad_norm": 0.9051849842071533, + "learning_rate": 3.4710803744729515e-05, + "loss": 0.381, + "step": 7539 + }, + { + "epoch": 2.420934339380318, + "grad_norm": 0.9121940732002258, + "learning_rate": 3.468436769360368e-05, + "loss": 0.3998, + "step": 7540 + }, + { + "epoch": 2.4212554182051695, + "grad_norm": 0.698441743850708, + "learning_rate": 3.465793960098945e-05, + "loss": 0.2916, + "step": 7541 + }, + { + "epoch": 2.4215764970300206, + "grad_norm": 0.7434770464897156, + "learning_rate": 3.463151947010712e-05, + "loss": 0.3105, + "step": 7542 + }, + { + "epoch": 2.421897575854872, + "grad_norm": 0.6803128123283386, + "learning_rate": 3.460510730417585e-05, + "loss": 0.332, + "step": 7543 + }, + { + "epoch": 2.4222186546797237, + "grad_norm": 0.7351831197738647, + "learning_rate": 3.4578703106413904e-05, + "loss": 0.2964, + "step": 7544 + }, + { + "epoch": 2.4225397335045753, + "grad_norm": 0.8372403383255005, + "learning_rate": 3.455230688003852e-05, + "loss": 0.3135, + "step": 7545 + }, + { + "epoch": 2.422860812329427, + "grad_norm": 0.7963913679122925, + "learning_rate": 3.452591862826603e-05, + "loss": 0.3311, + "step": 7546 + }, + { + "epoch": 2.4231818911542784, + "grad_norm": 0.9390442967414856, + "learning_rate": 3.4499538354311755e-05, + "loss": 0.4368, + "step": 7547 + }, + { + "epoch": 2.42350296997913, + "grad_norm": 0.8431958556175232, + "learning_rate": 3.447316606139004e-05, + "loss": 0.3649, + "step": 7548 + }, + { + "epoch": 2.4238240488039815, + "grad_norm": 1.0568405389785767, + "learning_rate": 3.444680175271428e-05, + "loss": 0.369, + "step": 7549 + }, + { + "epoch": 2.424145127628833, + "grad_norm": 0.8188615441322327, + "learning_rate": 3.442044543149688e-05, + "loss": 0.348, + "step": 7550 + }, + { + "epoch": 2.424466206453684, + "grad_norm": 1.063718557357788, + "learning_rate": 3.439409710094929e-05, + "loss": 0.4778, + "step": 7551 + }, + { + "epoch": 2.4247872852785357, + "grad_norm": 0.9398375153541565, + "learning_rate": 3.4367756764281955e-05, + "loss": 0.3605, + "step": 7552 + }, + { + "epoch": 2.4251083641033873, + "grad_norm": 0.9994428157806396, + "learning_rate": 3.4341424424704375e-05, + "loss": 0.3958, + "step": 7553 + }, + { + "epoch": 2.425429442928239, + "grad_norm": 0.8988476395606995, + "learning_rate": 3.4315100085425034e-05, + "loss": 0.3561, + "step": 7554 + }, + { + "epoch": 2.4257505217530904, + "grad_norm": 1.0344244241714478, + "learning_rate": 3.4288783749651564e-05, + "loss": 0.402, + "step": 7555 + }, + { + "epoch": 2.426071600577942, + "grad_norm": 1.1983534097671509, + "learning_rate": 3.426247542059041e-05, + "loss": 0.403, + "step": 7556 + }, + { + "epoch": 2.4263926794027935, + "grad_norm": 0.7842648029327393, + "learning_rate": 3.423617510144727e-05, + "loss": 0.3219, + "step": 7557 + }, + { + "epoch": 2.426713758227645, + "grad_norm": 0.8381021022796631, + "learning_rate": 3.4209882795426716e-05, + "loss": 0.3593, + "step": 7558 + }, + { + "epoch": 2.4270348370524966, + "grad_norm": 1.0330874919891357, + "learning_rate": 3.418359850573234e-05, + "loss": 0.3792, + "step": 7559 + }, + { + "epoch": 2.4273559158773477, + "grad_norm": 0.9594546556472778, + "learning_rate": 3.415732223556689e-05, + "loss": 0.3862, + "step": 7560 + }, + { + "epoch": 2.427676994702199, + "grad_norm": 0.9516655206680298, + "learning_rate": 3.413105398813195e-05, + "loss": 0.297, + "step": 7561 + }, + { + "epoch": 2.4279980735270508, + "grad_norm": 0.8917472958564758, + "learning_rate": 3.4104793766628304e-05, + "loss": 0.2836, + "step": 7562 + }, + { + "epoch": 2.4283191523519023, + "grad_norm": 0.5682129263877869, + "learning_rate": 3.4078541574255664e-05, + "loss": 0.2329, + "step": 7563 + }, + { + "epoch": 2.428640231176754, + "grad_norm": 0.8117547631263733, + "learning_rate": 3.4052297414212777e-05, + "loss": 0.3157, + "step": 7564 + }, + { + "epoch": 2.4289613100016054, + "grad_norm": 0.8948675394058228, + "learning_rate": 3.40260612896974e-05, + "loss": 0.2925, + "step": 7565 + }, + { + "epoch": 2.429282388826457, + "grad_norm": 0.8941418528556824, + "learning_rate": 3.3999833203906326e-05, + "loss": 0.2747, + "step": 7566 + }, + { + "epoch": 2.4296034676513085, + "grad_norm": 0.6833550333976746, + "learning_rate": 3.397361316003539e-05, + "loss": 0.2783, + "step": 7567 + }, + { + "epoch": 2.42992454647616, + "grad_norm": 0.571975588798523, + "learning_rate": 3.394740116127941e-05, + "loss": 0.2373, + "step": 7568 + }, + { + "epoch": 2.430245625301011, + "grad_norm": 0.6630203723907471, + "learning_rate": 3.3921197210832235e-05, + "loss": 0.2824, + "step": 7569 + }, + { + "epoch": 2.4305667041258627, + "grad_norm": 0.7997381091117859, + "learning_rate": 3.389500131188674e-05, + "loss": 0.3572, + "step": 7570 + }, + { + "epoch": 2.4308877829507143, + "grad_norm": 0.9815787076950073, + "learning_rate": 3.386881346763483e-05, + "loss": 0.4214, + "step": 7571 + }, + { + "epoch": 2.431208861775566, + "grad_norm": 0.8200189471244812, + "learning_rate": 3.3842633681267356e-05, + "loss": 0.3118, + "step": 7572 + }, + { + "epoch": 2.4315299406004174, + "grad_norm": 0.8650208115577698, + "learning_rate": 3.3816461955974365e-05, + "loss": 0.3753, + "step": 7573 + }, + { + "epoch": 2.431851019425269, + "grad_norm": 0.7072727084159851, + "learning_rate": 3.379029829494469e-05, + "loss": 0.2879, + "step": 7574 + }, + { + "epoch": 2.4321720982501205, + "grad_norm": 0.8465923070907593, + "learning_rate": 3.376414270136633e-05, + "loss": 0.3194, + "step": 7575 + }, + { + "epoch": 2.432493177074972, + "grad_norm": 0.7302094101905823, + "learning_rate": 3.373799517842627e-05, + "loss": 0.307, + "step": 7576 + }, + { + "epoch": 2.4328142558998236, + "grad_norm": 0.8451035022735596, + "learning_rate": 3.371185572931048e-05, + "loss": 0.3004, + "step": 7577 + }, + { + "epoch": 2.4331353347246747, + "grad_norm": 0.870238184928894, + "learning_rate": 3.3685724357204054e-05, + "loss": 0.2926, + "step": 7578 + }, + { + "epoch": 2.4334564135495262, + "grad_norm": 0.525023877620697, + "learning_rate": 3.3659601065290893e-05, + "loss": 0.282, + "step": 7579 + }, + { + "epoch": 2.433777492374378, + "grad_norm": 0.5372437834739685, + "learning_rate": 3.363348585675414e-05, + "loss": 0.5397, + "step": 7580 + }, + { + "epoch": 2.4340985711992293, + "grad_norm": 0.5941777229309082, + "learning_rate": 3.360737873477584e-05, + "loss": 0.6689, + "step": 7581 + }, + { + "epoch": 2.434419650024081, + "grad_norm": 0.35295045375823975, + "learning_rate": 3.358127970253704e-05, + "loss": 0.2129, + "step": 7582 + }, + { + "epoch": 2.4347407288489324, + "grad_norm": 0.43210405111312866, + "learning_rate": 3.355518876321787e-05, + "loss": 0.1644, + "step": 7583 + }, + { + "epoch": 2.435061807673784, + "grad_norm": 0.26534727215766907, + "learning_rate": 3.352910591999734e-05, + "loss": 0.076, + "step": 7584 + }, + { + "epoch": 2.4353828864986355, + "grad_norm": 0.1900699883699417, + "learning_rate": 3.3503031176053656e-05, + "loss": 0.072, + "step": 7585 + }, + { + "epoch": 2.435703965323487, + "grad_norm": 0.45870304107666016, + "learning_rate": 3.347696453456393e-05, + "loss": 0.1865, + "step": 7586 + }, + { + "epoch": 2.436025044148338, + "grad_norm": 0.7371842861175537, + "learning_rate": 3.3450905998704275e-05, + "loss": 0.341, + "step": 7587 + }, + { + "epoch": 2.4363461229731898, + "grad_norm": 0.7692381143569946, + "learning_rate": 3.342485557164986e-05, + "loss": 0.4095, + "step": 7588 + }, + { + "epoch": 2.4366672017980413, + "grad_norm": 0.7908117175102234, + "learning_rate": 3.339881325657484e-05, + "loss": 0.3606, + "step": 7589 + }, + { + "epoch": 2.436988280622893, + "grad_norm": 0.7690086364746094, + "learning_rate": 3.3372779056652426e-05, + "loss": 0.3458, + "step": 7590 + }, + { + "epoch": 2.4373093594477444, + "grad_norm": 0.8492259383201599, + "learning_rate": 3.334675297505476e-05, + "loss": 0.3107, + "step": 7591 + }, + { + "epoch": 2.437630438272596, + "grad_norm": 0.6811227202415466, + "learning_rate": 3.3320735014953076e-05, + "loss": 0.2832, + "step": 7592 + }, + { + "epoch": 2.4379515170974475, + "grad_norm": 0.8364322781562805, + "learning_rate": 3.3294725179517574e-05, + "loss": 0.322, + "step": 7593 + }, + { + "epoch": 2.438272595922299, + "grad_norm": 0.9228085279464722, + "learning_rate": 3.326872347191746e-05, + "loss": 0.4433, + "step": 7594 + }, + { + "epoch": 2.4385936747471506, + "grad_norm": 0.5413333773612976, + "learning_rate": 3.3242729895320946e-05, + "loss": 0.2079, + "step": 7595 + }, + { + "epoch": 2.4389147535720017, + "grad_norm": 0.7873682975769043, + "learning_rate": 3.3216744452895354e-05, + "loss": 0.3379, + "step": 7596 + }, + { + "epoch": 2.4392358323968533, + "grad_norm": 0.885543167591095, + "learning_rate": 3.319076714780682e-05, + "loss": 0.3296, + "step": 7597 + }, + { + "epoch": 2.439556911221705, + "grad_norm": 0.9024078845977783, + "learning_rate": 3.316479798322072e-05, + "loss": 0.3597, + "step": 7598 + }, + { + "epoch": 2.4398779900465564, + "grad_norm": 0.9780003428459167, + "learning_rate": 3.313883696230119e-05, + "loss": 0.4125, + "step": 7599 + }, + { + "epoch": 2.440199068871408, + "grad_norm": 2.2751457691192627, + "learning_rate": 3.311288408821159e-05, + "loss": 0.402, + "step": 7600 + }, + { + "epoch": 2.4405201476962595, + "grad_norm": 1.0487151145935059, + "learning_rate": 3.308693936411421e-05, + "loss": 0.4024, + "step": 7601 + }, + { + "epoch": 2.440841226521111, + "grad_norm": 1.0639290809631348, + "learning_rate": 3.306100279317024e-05, + "loss": 0.4246, + "step": 7602 + }, + { + "epoch": 2.4411623053459626, + "grad_norm": 0.9267032146453857, + "learning_rate": 3.303507437854009e-05, + "loss": 0.3029, + "step": 7603 + }, + { + "epoch": 2.441483384170814, + "grad_norm": 0.8686235547065735, + "learning_rate": 3.3009154123382936e-05, + "loss": 0.3351, + "step": 7604 + }, + { + "epoch": 2.4418044629956652, + "grad_norm": 0.9154815077781677, + "learning_rate": 3.2983242030857174e-05, + "loss": 0.2886, + "step": 7605 + }, + { + "epoch": 2.4421255418205168, + "grad_norm": 0.6667101383209229, + "learning_rate": 3.2957338104120096e-05, + "loss": 0.312, + "step": 7606 + }, + { + "epoch": 2.4424466206453683, + "grad_norm": 0.8355054259300232, + "learning_rate": 3.2931442346328004e-05, + "loss": 0.384, + "step": 7607 + }, + { + "epoch": 2.44276769947022, + "grad_norm": 0.8191713094711304, + "learning_rate": 3.290555476063622e-05, + "loss": 0.3421, + "step": 7608 + }, + { + "epoch": 2.4430887782950714, + "grad_norm": 0.9118049740791321, + "learning_rate": 3.287967535019908e-05, + "loss": 0.3099, + "step": 7609 + }, + { + "epoch": 2.443409857119923, + "grad_norm": 1.0478988885879517, + "learning_rate": 3.285380411816988e-05, + "loss": 0.4101, + "step": 7610 + }, + { + "epoch": 2.4437309359447745, + "grad_norm": 1.4113534688949585, + "learning_rate": 3.2827941067700996e-05, + "loss": 0.3687, + "step": 7611 + }, + { + "epoch": 2.444052014769626, + "grad_norm": 0.7927390336990356, + "learning_rate": 3.2802086201943724e-05, + "loss": 0.2911, + "step": 7612 + }, + { + "epoch": 2.4443730935944776, + "grad_norm": 0.8057661056518555, + "learning_rate": 3.277623952404842e-05, + "loss": 0.3085, + "step": 7613 + }, + { + "epoch": 2.4446941724193287, + "grad_norm": 0.8335930705070496, + "learning_rate": 3.275040103716441e-05, + "loss": 0.3725, + "step": 7614 + }, + { + "epoch": 2.4450152512441803, + "grad_norm": 0.7692309617996216, + "learning_rate": 3.272457074444003e-05, + "loss": 0.3191, + "step": 7615 + }, + { + "epoch": 2.445336330069032, + "grad_norm": 0.9450210928916931, + "learning_rate": 3.269874864902269e-05, + "loss": 0.3662, + "step": 7616 + }, + { + "epoch": 2.4456574088938834, + "grad_norm": 0.6600220203399658, + "learning_rate": 3.2672934754058616e-05, + "loss": 0.3059, + "step": 7617 + }, + { + "epoch": 2.445978487718735, + "grad_norm": 0.8459724187850952, + "learning_rate": 3.264712906269328e-05, + "loss": 0.3184, + "step": 7618 + }, + { + "epoch": 2.4462995665435865, + "grad_norm": 1.1846095323562622, + "learning_rate": 3.2621331578070934e-05, + "loss": 0.2196, + "step": 7619 + }, + { + "epoch": 2.446620645368438, + "grad_norm": 0.5867680311203003, + "learning_rate": 3.2595542303334924e-05, + "loss": 0.2611, + "step": 7620 + }, + { + "epoch": 2.4469417241932896, + "grad_norm": 0.8343913555145264, + "learning_rate": 3.2569761241627696e-05, + "loss": 0.3502, + "step": 7621 + }, + { + "epoch": 2.447262803018141, + "grad_norm": 0.803591251373291, + "learning_rate": 3.254398839609044e-05, + "loss": 0.2839, + "step": 7622 + }, + { + "epoch": 2.4475838818429922, + "grad_norm": 0.7186933755874634, + "learning_rate": 3.251822376986363e-05, + "loss": 0.3149, + "step": 7623 + }, + { + "epoch": 2.447904960667844, + "grad_norm": 0.8497990369796753, + "learning_rate": 3.249246736608655e-05, + "loss": 0.3579, + "step": 7624 + }, + { + "epoch": 2.4482260394926953, + "grad_norm": 0.5227159857749939, + "learning_rate": 3.246671918789755e-05, + "loss": 0.2571, + "step": 7625 + }, + { + "epoch": 2.448547118317547, + "grad_norm": 0.8302345275878906, + "learning_rate": 3.244097923843398e-05, + "loss": 0.3503, + "step": 7626 + }, + { + "epoch": 2.4488681971423985, + "grad_norm": 0.5355742573738098, + "learning_rate": 3.2415247520832146e-05, + "loss": 0.2695, + "step": 7627 + }, + { + "epoch": 2.44918927596725, + "grad_norm": 0.7447862029075623, + "learning_rate": 3.23895240382274e-05, + "loss": 0.2746, + "step": 7628 + }, + { + "epoch": 2.4495103547921016, + "grad_norm": 0.46801793575286865, + "learning_rate": 3.236380879375408e-05, + "loss": 0.3045, + "step": 7629 + }, + { + "epoch": 2.449831433616953, + "grad_norm": 0.5388785004615784, + "learning_rate": 3.233810179054548e-05, + "loss": 0.9426, + "step": 7630 + }, + { + "epoch": 2.4501525124418047, + "grad_norm": 0.4183870553970337, + "learning_rate": 3.231240303173394e-05, + "loss": 0.399, + "step": 7631 + }, + { + "epoch": 2.4504735912666558, + "grad_norm": 0.9477866888046265, + "learning_rate": 3.2286712520450765e-05, + "loss": 0.2928, + "step": 7632 + }, + { + "epoch": 2.4507946700915073, + "grad_norm": 0.34258604049682617, + "learning_rate": 3.226103025982628e-05, + "loss": 0.0871, + "step": 7633 + }, + { + "epoch": 2.451115748916359, + "grad_norm": 0.6076464056968689, + "learning_rate": 3.223535625298979e-05, + "loss": 0.2203, + "step": 7634 + }, + { + "epoch": 2.4514368277412104, + "grad_norm": 0.3136209547519684, + "learning_rate": 3.220969050306955e-05, + "loss": 0.1264, + "step": 7635 + }, + { + "epoch": 2.451757906566062, + "grad_norm": 0.8506674766540527, + "learning_rate": 3.218403301319296e-05, + "loss": 0.4329, + "step": 7636 + }, + { + "epoch": 2.4520789853909135, + "grad_norm": 0.8831260800361633, + "learning_rate": 3.21583837864862e-05, + "loss": 0.4926, + "step": 7637 + }, + { + "epoch": 2.452400064215765, + "grad_norm": 0.8367322683334351, + "learning_rate": 3.213274282607457e-05, + "loss": 0.3901, + "step": 7638 + }, + { + "epoch": 2.4527211430406166, + "grad_norm": 0.824145495891571, + "learning_rate": 3.210711013508242e-05, + "loss": 0.3007, + "step": 7639 + }, + { + "epoch": 2.453042221865468, + "grad_norm": 0.8876867890357971, + "learning_rate": 3.208148571663289e-05, + "loss": 0.3398, + "step": 7640 + }, + { + "epoch": 2.4533633006903193, + "grad_norm": 0.6797581315040588, + "learning_rate": 3.205586957384838e-05, + "loss": 0.2937, + "step": 7641 + }, + { + "epoch": 2.453684379515171, + "grad_norm": 0.7675800919532776, + "learning_rate": 3.2030261709849996e-05, + "loss": 0.3564, + "step": 7642 + }, + { + "epoch": 2.4540054583400224, + "grad_norm": 0.7745330929756165, + "learning_rate": 3.200466212775808e-05, + "loss": 0.3317, + "step": 7643 + }, + { + "epoch": 2.454326537164874, + "grad_norm": 0.7452265024185181, + "learning_rate": 3.197907083069184e-05, + "loss": 0.3462, + "step": 7644 + }, + { + "epoch": 2.4546476159897255, + "grad_norm": 0.8958961963653564, + "learning_rate": 3.195348782176948e-05, + "loss": 0.3256, + "step": 7645 + }, + { + "epoch": 2.454968694814577, + "grad_norm": 0.9540597200393677, + "learning_rate": 3.192791310410822e-05, + "loss": 0.377, + "step": 7646 + }, + { + "epoch": 2.4552897736394286, + "grad_norm": 1.0042481422424316, + "learning_rate": 3.190234668082427e-05, + "loss": 0.3533, + "step": 7647 + }, + { + "epoch": 2.45561085246428, + "grad_norm": 1.0041635036468506, + "learning_rate": 3.187678855503282e-05, + "loss": 0.3521, + "step": 7648 + }, + { + "epoch": 2.4559319312891317, + "grad_norm": 0.8743497133255005, + "learning_rate": 3.1851238729848034e-05, + "loss": 0.333, + "step": 7649 + }, + { + "epoch": 2.456253010113983, + "grad_norm": 0.8867911696434021, + "learning_rate": 3.1825697208383096e-05, + "loss": 0.3217, + "step": 7650 + }, + { + "epoch": 2.4565740889388343, + "grad_norm": 0.6222565770149231, + "learning_rate": 3.1800163993750166e-05, + "loss": 0.2614, + "step": 7651 + }, + { + "epoch": 2.456895167763686, + "grad_norm": 1.0342620611190796, + "learning_rate": 3.1774639089060363e-05, + "loss": 0.3954, + "step": 7652 + }, + { + "epoch": 2.4572162465885374, + "grad_norm": 0.7965120077133179, + "learning_rate": 3.174912249742382e-05, + "loss": 0.2804, + "step": 7653 + }, + { + "epoch": 2.457537325413389, + "grad_norm": 0.7983464002609253, + "learning_rate": 3.172361422194974e-05, + "loss": 0.2956, + "step": 7654 + }, + { + "epoch": 2.4578584042382405, + "grad_norm": 0.8190217018127441, + "learning_rate": 3.1698114265746124e-05, + "loss": 0.3554, + "step": 7655 + }, + { + "epoch": 2.458179483063092, + "grad_norm": 0.7282566428184509, + "learning_rate": 3.16726226319201e-05, + "loss": 0.2899, + "step": 7656 + }, + { + "epoch": 2.4585005618879436, + "grad_norm": 0.8753344416618347, + "learning_rate": 3.164713932357776e-05, + "loss": 0.347, + "step": 7657 + }, + { + "epoch": 2.458821640712795, + "grad_norm": 0.8597754836082458, + "learning_rate": 3.162166434382412e-05, + "loss": 0.2803, + "step": 7658 + }, + { + "epoch": 2.4591427195376463, + "grad_norm": 0.7598649263381958, + "learning_rate": 3.159619769576333e-05, + "loss": 0.2793, + "step": 7659 + }, + { + "epoch": 2.459463798362498, + "grad_norm": 1.1359360218048096, + "learning_rate": 3.157073938249829e-05, + "loss": 0.3773, + "step": 7660 + }, + { + "epoch": 2.4597848771873494, + "grad_norm": 1.1523734331130981, + "learning_rate": 3.154528940713113e-05, + "loss": 0.3819, + "step": 7661 + }, + { + "epoch": 2.460105956012201, + "grad_norm": 0.9038000106811523, + "learning_rate": 3.15198477727628e-05, + "loss": 0.3285, + "step": 7662 + }, + { + "epoch": 2.4604270348370525, + "grad_norm": 0.7829211354255676, + "learning_rate": 3.1494414482493304e-05, + "loss": 0.3033, + "step": 7663 + }, + { + "epoch": 2.460748113661904, + "grad_norm": 0.8830636739730835, + "learning_rate": 3.146898953942163e-05, + "loss": 0.3084, + "step": 7664 + }, + { + "epoch": 2.4610691924867556, + "grad_norm": 0.527309775352478, + "learning_rate": 3.144357294664565e-05, + "loss": 0.2242, + "step": 7665 + }, + { + "epoch": 2.461390271311607, + "grad_norm": 1.0569745302200317, + "learning_rate": 3.141816470726238e-05, + "loss": 0.3431, + "step": 7666 + }, + { + "epoch": 2.4617113501364587, + "grad_norm": 0.7063748240470886, + "learning_rate": 3.1392764824367704e-05, + "loss": 0.2735, + "step": 7667 + }, + { + "epoch": 2.46203242896131, + "grad_norm": 0.5551251769065857, + "learning_rate": 3.1367373301056536e-05, + "loss": 0.2571, + "step": 7668 + }, + { + "epoch": 2.4623535077861614, + "grad_norm": 0.8918856978416443, + "learning_rate": 3.134199014042274e-05, + "loss": 0.305, + "step": 7669 + }, + { + "epoch": 2.462674586611013, + "grad_norm": 0.7437170743942261, + "learning_rate": 3.1316615345559185e-05, + "loss": 0.2788, + "step": 7670 + }, + { + "epoch": 2.4629956654358645, + "grad_norm": 1.070360779762268, + "learning_rate": 3.129124891955771e-05, + "loss": 0.4669, + "step": 7671 + }, + { + "epoch": 2.463316744260716, + "grad_norm": 0.8715602159500122, + "learning_rate": 3.126589086550914e-05, + "loss": 0.3444, + "step": 7672 + }, + { + "epoch": 2.4636378230855676, + "grad_norm": 0.7617392539978027, + "learning_rate": 3.124054118650327e-05, + "loss": 0.2923, + "step": 7673 + }, + { + "epoch": 2.463958901910419, + "grad_norm": 0.5233041644096375, + "learning_rate": 3.12151998856289e-05, + "loss": 0.2442, + "step": 7674 + }, + { + "epoch": 2.4642799807352707, + "grad_norm": 0.776607871055603, + "learning_rate": 3.1189866965973766e-05, + "loss": 0.3125, + "step": 7675 + }, + { + "epoch": 2.464601059560122, + "grad_norm": 0.5157756209373474, + "learning_rate": 3.116454243062459e-05, + "loss": 0.2563, + "step": 7676 + }, + { + "epoch": 2.4649221383849733, + "grad_norm": 0.7321842312812805, + "learning_rate": 3.113922628266718e-05, + "loss": 0.3024, + "step": 7677 + }, + { + "epoch": 2.465243217209825, + "grad_norm": 0.6504801511764526, + "learning_rate": 3.111391852518611e-05, + "loss": 0.3101, + "step": 7678 + }, + { + "epoch": 2.4655642960346764, + "grad_norm": 0.39098212122917175, + "learning_rate": 3.108861916126518e-05, + "loss": 0.2716, + "step": 7679 + }, + { + "epoch": 2.465885374859528, + "grad_norm": 0.5527970194816589, + "learning_rate": 3.1063328193986904e-05, + "loss": 0.8426, + "step": 7680 + }, + { + "epoch": 2.4662064536843795, + "grad_norm": 0.3802039623260498, + "learning_rate": 3.103804562643302e-05, + "loss": 0.4868, + "step": 7681 + }, + { + "epoch": 2.466527532509231, + "grad_norm": 0.40410691499710083, + "learning_rate": 3.101277146168412e-05, + "loss": 0.2456, + "step": 7682 + }, + { + "epoch": 2.4668486113340826, + "grad_norm": 0.2699540853500366, + "learning_rate": 3.098750570281969e-05, + "loss": 0.0755, + "step": 7683 + }, + { + "epoch": 2.467169690158934, + "grad_norm": 0.47554266452789307, + "learning_rate": 3.096224835291839e-05, + "loss": 0.1443, + "step": 7684 + }, + { + "epoch": 2.4674907689837857, + "grad_norm": 0.6371931433677673, + "learning_rate": 3.093699941505771e-05, + "loss": 0.2333, + "step": 7685 + }, + { + "epoch": 2.467811847808637, + "grad_norm": 0.8317315578460693, + "learning_rate": 3.0911758892314166e-05, + "loss": 0.4051, + "step": 7686 + }, + { + "epoch": 2.4681329266334884, + "grad_norm": 0.8877337574958801, + "learning_rate": 3.0886526787763234e-05, + "loss": 0.4092, + "step": 7687 + }, + { + "epoch": 2.46845400545834, + "grad_norm": 0.8633880019187927, + "learning_rate": 3.086130310447937e-05, + "loss": 0.4231, + "step": 7688 + }, + { + "epoch": 2.4687750842831915, + "grad_norm": 0.7211251854896545, + "learning_rate": 3.0836087845536e-05, + "loss": 0.2923, + "step": 7689 + }, + { + "epoch": 2.469096163108043, + "grad_norm": 0.9071685671806335, + "learning_rate": 3.081088101400552e-05, + "loss": 0.3378, + "step": 7690 + }, + { + "epoch": 2.4694172419328946, + "grad_norm": 0.9691846370697021, + "learning_rate": 3.078568261295933e-05, + "loss": 0.3977, + "step": 7691 + }, + { + "epoch": 2.469738320757746, + "grad_norm": 0.9273397326469421, + "learning_rate": 3.0760492645467765e-05, + "loss": 0.3731, + "step": 7692 + }, + { + "epoch": 2.4700593995825977, + "grad_norm": 0.8951248526573181, + "learning_rate": 3.073531111460013e-05, + "loss": 0.4538, + "step": 7693 + }, + { + "epoch": 2.4703804784074492, + "grad_norm": 0.868762731552124, + "learning_rate": 3.071013802342475e-05, + "loss": 0.374, + "step": 7694 + }, + { + "epoch": 2.4707015572323003, + "grad_norm": 0.614288330078125, + "learning_rate": 3.068497337500886e-05, + "loss": 0.2634, + "step": 7695 + }, + { + "epoch": 2.471022636057152, + "grad_norm": 1.016533374786377, + "learning_rate": 3.0659817172418693e-05, + "loss": 0.3648, + "step": 7696 + }, + { + "epoch": 2.4713437148820034, + "grad_norm": 1.118348240852356, + "learning_rate": 3.063466941871952e-05, + "loss": 0.4014, + "step": 7697 + }, + { + "epoch": 2.471664793706855, + "grad_norm": 0.9089675545692444, + "learning_rate": 3.060953011697545e-05, + "loss": 0.3507, + "step": 7698 + }, + { + "epoch": 2.4719858725317065, + "grad_norm": 0.9801891446113586, + "learning_rate": 3.058439927024962e-05, + "loss": 0.388, + "step": 7699 + }, + { + "epoch": 2.472306951356558, + "grad_norm": 1.0153359174728394, + "learning_rate": 3.0559276881604236e-05, + "loss": 0.4152, + "step": 7700 + }, + { + "epoch": 2.4726280301814096, + "grad_norm": 0.6754663586616516, + "learning_rate": 3.053416295410026e-05, + "loss": 0.279, + "step": 7701 + }, + { + "epoch": 2.472949109006261, + "grad_norm": 0.8051523566246033, + "learning_rate": 3.0509057490797888e-05, + "loss": 0.3163, + "step": 7702 + }, + { + "epoch": 2.4732701878311127, + "grad_norm": 0.8289381861686707, + "learning_rate": 3.0483960494756016e-05, + "loss": 0.281, + "step": 7703 + }, + { + "epoch": 2.473591266655964, + "grad_norm": 0.7152822613716125, + "learning_rate": 3.045887196903271e-05, + "loss": 0.3059, + "step": 7704 + }, + { + "epoch": 2.4739123454808154, + "grad_norm": 1.1248416900634766, + "learning_rate": 3.0433791916684916e-05, + "loss": 0.4753, + "step": 7705 + }, + { + "epoch": 2.474233424305667, + "grad_norm": 1.4356248378753662, + "learning_rate": 3.0408720340768572e-05, + "loss": 0.5532, + "step": 7706 + }, + { + "epoch": 2.4745545031305185, + "grad_norm": 0.6347241401672363, + "learning_rate": 3.038365724433858e-05, + "loss": 0.2673, + "step": 7707 + }, + { + "epoch": 2.47487558195537, + "grad_norm": 0.8377917408943176, + "learning_rate": 3.035860263044873e-05, + "loss": 0.3614, + "step": 7708 + }, + { + "epoch": 2.4751966607802216, + "grad_norm": 0.866231381893158, + "learning_rate": 3.0333556502151926e-05, + "loss": 0.318, + "step": 7709 + }, + { + "epoch": 2.475517739605073, + "grad_norm": 0.899551272392273, + "learning_rate": 3.0308518862499957e-05, + "loss": 0.4259, + "step": 7710 + }, + { + "epoch": 2.4758388184299247, + "grad_norm": 0.8847672343254089, + "learning_rate": 3.0283489714543556e-05, + "loss": 0.3208, + "step": 7711 + }, + { + "epoch": 2.4761598972547763, + "grad_norm": 1.1146432161331177, + "learning_rate": 3.0258469061332463e-05, + "loss": 0.4248, + "step": 7712 + }, + { + "epoch": 2.4764809760796274, + "grad_norm": 0.6833528876304626, + "learning_rate": 3.023345690591537e-05, + "loss": 0.2858, + "step": 7713 + }, + { + "epoch": 2.476802054904479, + "grad_norm": 0.6902426481246948, + "learning_rate": 3.0208453251339885e-05, + "loss": 0.3027, + "step": 7714 + }, + { + "epoch": 2.4771231337293305, + "grad_norm": 0.7399451732635498, + "learning_rate": 3.018345810065275e-05, + "loss": 0.2832, + "step": 7715 + }, + { + "epoch": 2.477444212554182, + "grad_norm": 0.6635550856590271, + "learning_rate": 3.0158471456899428e-05, + "loss": 0.2934, + "step": 7716 + }, + { + "epoch": 2.4777652913790336, + "grad_norm": 0.7805300354957581, + "learning_rate": 3.0133493323124505e-05, + "loss": 0.3349, + "step": 7717 + }, + { + "epoch": 2.478086370203885, + "grad_norm": 1.5365228652954102, + "learning_rate": 3.0108523702371505e-05, + "loss": 0.3909, + "step": 7718 + }, + { + "epoch": 2.4784074490287367, + "grad_norm": 0.8297216892242432, + "learning_rate": 3.0083562597682847e-05, + "loss": 0.2861, + "step": 7719 + }, + { + "epoch": 2.478728527853588, + "grad_norm": 0.5565605163574219, + "learning_rate": 3.0058610012100074e-05, + "loss": 0.2976, + "step": 7720 + }, + { + "epoch": 2.4790496066784398, + "grad_norm": 1.5360130071640015, + "learning_rate": 3.0033665948663448e-05, + "loss": 0.3868, + "step": 7721 + }, + { + "epoch": 2.479370685503291, + "grad_norm": 0.8052739500999451, + "learning_rate": 3.0008730410412466e-05, + "loss": 0.3711, + "step": 7722 + }, + { + "epoch": 2.4796917643281424, + "grad_norm": 0.7085245847702026, + "learning_rate": 2.9983803400385312e-05, + "loss": 0.3142, + "step": 7723 + }, + { + "epoch": 2.480012843152994, + "grad_norm": 0.939500629901886, + "learning_rate": 2.9958884921619367e-05, + "loss": 0.3342, + "step": 7724 + }, + { + "epoch": 2.4803339219778455, + "grad_norm": 0.8791356682777405, + "learning_rate": 2.993397497715086e-05, + "loss": 0.2739, + "step": 7725 + }, + { + "epoch": 2.480655000802697, + "grad_norm": 0.8108116984367371, + "learning_rate": 2.9909073570014912e-05, + "loss": 0.301, + "step": 7726 + }, + { + "epoch": 2.4809760796275486, + "grad_norm": 0.6944962739944458, + "learning_rate": 2.9884180703245767e-05, + "loss": 0.3222, + "step": 7727 + }, + { + "epoch": 2.4812971584524, + "grad_norm": 0.3254513144493103, + "learning_rate": 2.9859296379876523e-05, + "loss": 0.2407, + "step": 7728 + }, + { + "epoch": 2.4816182372772517, + "grad_norm": 0.7786895632743835, + "learning_rate": 2.9834420602939263e-05, + "loss": 0.3052, + "step": 7729 + }, + { + "epoch": 2.4819393161021033, + "grad_norm": 0.7794302701950073, + "learning_rate": 2.9809553375465004e-05, + "loss": 0.7187, + "step": 7730 + }, + { + "epoch": 2.4822603949269544, + "grad_norm": 0.33989983797073364, + "learning_rate": 2.9784694700483762e-05, + "loss": 0.2142, + "step": 7731 + }, + { + "epoch": 2.482581473751806, + "grad_norm": 0.5084500908851624, + "learning_rate": 2.9759844581024486e-05, + "loss": 0.2271, + "step": 7732 + }, + { + "epoch": 2.4829025525766575, + "grad_norm": 0.4874570667743683, + "learning_rate": 2.9735003020115092e-05, + "loss": 0.0856, + "step": 7733 + }, + { + "epoch": 2.483223631401509, + "grad_norm": 0.3979865610599518, + "learning_rate": 2.9710170020782435e-05, + "loss": 0.188, + "step": 7734 + }, + { + "epoch": 2.4835447102263606, + "grad_norm": 0.5099960565567017, + "learning_rate": 2.968534558605236e-05, + "loss": 0.3362, + "step": 7735 + }, + { + "epoch": 2.483865789051212, + "grad_norm": 0.5169116854667664, + "learning_rate": 2.9660529718949627e-05, + "loss": 0.3097, + "step": 7736 + }, + { + "epoch": 2.4841868678760637, + "grad_norm": 0.9039494395256042, + "learning_rate": 2.9635722422497993e-05, + "loss": 0.4115, + "step": 7737 + }, + { + "epoch": 2.4845079467009152, + "grad_norm": 0.8627937436103821, + "learning_rate": 2.961092369972014e-05, + "loss": 0.4387, + "step": 7738 + }, + { + "epoch": 2.484829025525767, + "grad_norm": 0.8400534391403198, + "learning_rate": 2.9586133553637683e-05, + "loss": 0.4016, + "step": 7739 + }, + { + "epoch": 2.485150104350618, + "grad_norm": 0.7134156823158264, + "learning_rate": 2.9561351987271334e-05, + "loss": 0.3185, + "step": 7740 + }, + { + "epoch": 2.4854711831754694, + "grad_norm": 0.8354820609092712, + "learning_rate": 2.953657900364053e-05, + "loss": 0.4111, + "step": 7741 + }, + { + "epoch": 2.485792262000321, + "grad_norm": 0.66590815782547, + "learning_rate": 2.9511814605763855e-05, + "loss": 0.2975, + "step": 7742 + }, + { + "epoch": 2.4861133408251725, + "grad_norm": 0.7249523997306824, + "learning_rate": 2.9487058796658783e-05, + "loss": 0.3311, + "step": 7743 + }, + { + "epoch": 2.486434419650024, + "grad_norm": 0.6020844578742981, + "learning_rate": 2.9462311579341663e-05, + "loss": 0.252, + "step": 7744 + }, + { + "epoch": 2.4867554984748756, + "grad_norm": 0.8182882070541382, + "learning_rate": 2.9437572956827964e-05, + "loss": 0.3021, + "step": 7745 + }, + { + "epoch": 2.487076577299727, + "grad_norm": 0.6254271268844604, + "learning_rate": 2.94128429321319e-05, + "loss": 0.2792, + "step": 7746 + }, + { + "epoch": 2.4873976561245787, + "grad_norm": 0.83849036693573, + "learning_rate": 2.938812150826684e-05, + "loss": 0.3111, + "step": 7747 + }, + { + "epoch": 2.4877187349494303, + "grad_norm": 1.0279197692871094, + "learning_rate": 2.9363408688245e-05, + "loss": 0.3787, + "step": 7748 + }, + { + "epoch": 2.4880398137742814, + "grad_norm": 0.6685528755187988, + "learning_rate": 2.933870447507753e-05, + "loss": 0.3001, + "step": 7749 + }, + { + "epoch": 2.488360892599133, + "grad_norm": 0.9342240691184998, + "learning_rate": 2.931400887177459e-05, + "loss": 0.3955, + "step": 7750 + }, + { + "epoch": 2.4886819714239845, + "grad_norm": 0.6949548125267029, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.3251, + "step": 7751 + }, + { + "epoch": 2.489003050248836, + "grad_norm": 1.0305014848709106, + "learning_rate": 2.926464350679756e-05, + "loss": 0.3924, + "step": 7752 + }, + { + "epoch": 2.4893241290736876, + "grad_norm": 0.8009200692176819, + "learning_rate": 2.9239973751138495e-05, + "loss": 0.3076, + "step": 7753 + }, + { + "epoch": 2.489645207898539, + "grad_norm": 0.944476306438446, + "learning_rate": 2.921531261737398e-05, + "loss": 0.3573, + "step": 7754 + }, + { + "epoch": 2.4899662867233907, + "grad_norm": 1.1194026470184326, + "learning_rate": 2.9190660108508917e-05, + "loss": 0.419, + "step": 7755 + }, + { + "epoch": 2.4902873655482423, + "grad_norm": 0.9375110864639282, + "learning_rate": 2.9166016227547133e-05, + "loss": 0.3467, + "step": 7756 + }, + { + "epoch": 2.490608444373094, + "grad_norm": 0.7370654344558716, + "learning_rate": 2.9141380977491373e-05, + "loss": 0.3359, + "step": 7757 + }, + { + "epoch": 2.490929523197945, + "grad_norm": 0.9551532864570618, + "learning_rate": 2.911675436134347e-05, + "loss": 0.3477, + "step": 7758 + }, + { + "epoch": 2.4912506020227965, + "grad_norm": 0.8018618226051331, + "learning_rate": 2.9092136382103973e-05, + "loss": 0.2881, + "step": 7759 + }, + { + "epoch": 2.491571680847648, + "grad_norm": 0.8496661186218262, + "learning_rate": 2.9067527042772636e-05, + "loss": 0.3263, + "step": 7760 + }, + { + "epoch": 2.4918927596724996, + "grad_norm": 0.5800387859344482, + "learning_rate": 2.904292634634793e-05, + "loss": 0.2534, + "step": 7761 + }, + { + "epoch": 2.492213838497351, + "grad_norm": 0.9090917706489563, + "learning_rate": 2.9018334295827388e-05, + "loss": 0.3376, + "step": 7762 + }, + { + "epoch": 2.4925349173222027, + "grad_norm": 0.84961998462677, + "learning_rate": 2.899375089420756e-05, + "loss": 0.306, + "step": 7763 + }, + { + "epoch": 2.492855996147054, + "grad_norm": 0.7612555623054504, + "learning_rate": 2.8969176144483744e-05, + "loss": 0.2584, + "step": 7764 + }, + { + "epoch": 2.4931770749719058, + "grad_norm": 0.7672126293182373, + "learning_rate": 2.894461004965038e-05, + "loss": 0.2945, + "step": 7765 + }, + { + "epoch": 2.4934981537967573, + "grad_norm": 1.0515177249908447, + "learning_rate": 2.8920052612700754e-05, + "loss": 0.3602, + "step": 7766 + }, + { + "epoch": 2.4938192326216084, + "grad_norm": 0.710355818271637, + "learning_rate": 2.8895503836627103e-05, + "loss": 0.2777, + "step": 7767 + }, + { + "epoch": 2.49414031144646, + "grad_norm": 0.9773350954055786, + "learning_rate": 2.887096372442063e-05, + "loss": 0.3712, + "step": 7768 + }, + { + "epoch": 2.4944613902713115, + "grad_norm": 0.9016016125679016, + "learning_rate": 2.8846432279071467e-05, + "loss": 0.3312, + "step": 7769 + }, + { + "epoch": 2.494782469096163, + "grad_norm": 0.8454534411430359, + "learning_rate": 2.88219095035687e-05, + "loss": 0.3164, + "step": 7770 + }, + { + "epoch": 2.4951035479210146, + "grad_norm": 0.6082969307899475, + "learning_rate": 2.879739540090036e-05, + "loss": 0.2435, + "step": 7771 + }, + { + "epoch": 2.495424626745866, + "grad_norm": 1.3337501287460327, + "learning_rate": 2.877288997405341e-05, + "loss": 0.315, + "step": 7772 + }, + { + "epoch": 2.4957457055707177, + "grad_norm": 0.6798154711723328, + "learning_rate": 2.874839322601375e-05, + "loss": 0.2806, + "step": 7773 + }, + { + "epoch": 2.4960667843955693, + "grad_norm": 1.1664059162139893, + "learning_rate": 2.872390515976625e-05, + "loss": 0.3415, + "step": 7774 + }, + { + "epoch": 2.496387863220421, + "grad_norm": 0.40442001819610596, + "learning_rate": 2.869942577829471e-05, + "loss": 0.2215, + "step": 7775 + }, + { + "epoch": 2.496708942045272, + "grad_norm": 0.6790067553520203, + "learning_rate": 2.8674955084581857e-05, + "loss": 0.2721, + "step": 7776 + }, + { + "epoch": 2.4970300208701235, + "grad_norm": 0.6047253608703613, + "learning_rate": 2.865049308160934e-05, + "loss": 0.2712, + "step": 7777 + }, + { + "epoch": 2.497351099694975, + "grad_norm": 0.2929036617279053, + "learning_rate": 2.8626039772357882e-05, + "loss": 0.2281, + "step": 7778 + }, + { + "epoch": 2.4976721785198266, + "grad_norm": 0.517440140247345, + "learning_rate": 2.860159515980695e-05, + "loss": 0.2949, + "step": 7779 + }, + { + "epoch": 2.497993257344678, + "grad_norm": 0.5559537410736084, + "learning_rate": 2.8577159246935037e-05, + "loss": 0.6561, + "step": 7780 + }, + { + "epoch": 2.4983143361695297, + "grad_norm": 0.4277403652667999, + "learning_rate": 2.8552732036719687e-05, + "loss": 0.5281, + "step": 7781 + }, + { + "epoch": 2.4986354149943812, + "grad_norm": 0.5072173476219177, + "learning_rate": 2.852831353213715e-05, + "loss": 0.5995, + "step": 7782 + }, + { + "epoch": 2.498956493819233, + "grad_norm": 0.4317438006401062, + "learning_rate": 2.8503903736162875e-05, + "loss": 0.1693, + "step": 7783 + }, + { + "epoch": 2.4992775726440843, + "grad_norm": 0.39404723048210144, + "learning_rate": 2.8479502651770995e-05, + "loss": 0.1665, + "step": 7784 + }, + { + "epoch": 2.4995986514689355, + "grad_norm": 0.3772273659706116, + "learning_rate": 2.8455110281934803e-05, + "loss": 0.1733, + "step": 7785 + }, + { + "epoch": 2.499919730293787, + "grad_norm": 0.2818797528743744, + "learning_rate": 2.8430726629626413e-05, + "loss": 0.0803, + "step": 7786 + }, + { + "epoch": 2.5002408091186386, + "grad_norm": 0.6350283026695251, + "learning_rate": 2.840635169781688e-05, + "loss": 0.3251, + "step": 7787 + }, + { + "epoch": 2.50056188794349, + "grad_norm": 0.8361577987670898, + "learning_rate": 2.838198548947627e-05, + "loss": 0.4189, + "step": 7788 + }, + { + "epoch": 2.5008829667683417, + "grad_norm": 0.7367003560066223, + "learning_rate": 2.835762800757341e-05, + "loss": 0.2988, + "step": 7789 + }, + { + "epoch": 2.501204045593193, + "grad_norm": 0.8890562057495117, + "learning_rate": 2.8333279255076306e-05, + "loss": 0.4109, + "step": 7790 + }, + { + "epoch": 2.5015251244180448, + "grad_norm": 0.8579180836677551, + "learning_rate": 2.8308939234951726e-05, + "loss": 0.3457, + "step": 7791 + }, + { + "epoch": 2.501846203242896, + "grad_norm": 0.8420111536979675, + "learning_rate": 2.8284607950165442e-05, + "loss": 0.3082, + "step": 7792 + }, + { + "epoch": 2.502167282067748, + "grad_norm": 0.78440922498703, + "learning_rate": 2.826028540368215e-05, + "loss": 0.2953, + "step": 7793 + }, + { + "epoch": 2.502488360892599, + "grad_norm": 0.5694653391838074, + "learning_rate": 2.823597159846547e-05, + "loss": 0.2637, + "step": 7794 + }, + { + "epoch": 2.5028094397174505, + "grad_norm": 0.77173912525177, + "learning_rate": 2.8211666537477933e-05, + "loss": 0.3009, + "step": 7795 + }, + { + "epoch": 2.503130518542302, + "grad_norm": 0.9021281599998474, + "learning_rate": 2.8187370223681132e-05, + "loss": 0.3543, + "step": 7796 + }, + { + "epoch": 2.5034515973671536, + "grad_norm": 0.8287353515625, + "learning_rate": 2.816308266003541e-05, + "loss": 0.307, + "step": 7797 + }, + { + "epoch": 2.503772676192005, + "grad_norm": 1.0291224718093872, + "learning_rate": 2.813880384950016e-05, + "loss": 0.419, + "step": 7798 + }, + { + "epoch": 2.5040937550168567, + "grad_norm": 0.7518448233604431, + "learning_rate": 2.8114533795033683e-05, + "loss": 0.3259, + "step": 7799 + }, + { + "epoch": 2.5044148338417083, + "grad_norm": 0.9459495544433594, + "learning_rate": 2.8090272499593173e-05, + "loss": 0.3573, + "step": 7800 + }, + { + "epoch": 2.5047359126665594, + "grad_norm": 0.5885202884674072, + "learning_rate": 2.8066019966134904e-05, + "loss": 0.2741, + "step": 7801 + }, + { + "epoch": 2.5050569914914114, + "grad_norm": 0.9306980967521667, + "learning_rate": 2.8041776197613844e-05, + "loss": 0.3992, + "step": 7802 + }, + { + "epoch": 2.5053780703162625, + "grad_norm": 0.9558607339859009, + "learning_rate": 2.8017541196984142e-05, + "loss": 0.3941, + "step": 7803 + }, + { + "epoch": 2.505699149141114, + "grad_norm": 0.9242830276489258, + "learning_rate": 2.7993314967198635e-05, + "loss": 0.3423, + "step": 7804 + }, + { + "epoch": 2.5060202279659656, + "grad_norm": 0.6770075559616089, + "learning_rate": 2.7969097511209308e-05, + "loss": 0.273, + "step": 7805 + }, + { + "epoch": 2.506341306790817, + "grad_norm": 0.7561593055725098, + "learning_rate": 2.7944888831966987e-05, + "loss": 0.2808, + "step": 7806 + }, + { + "epoch": 2.5066623856156687, + "grad_norm": 0.8181188702583313, + "learning_rate": 2.7920688932421335e-05, + "loss": 0.3705, + "step": 7807 + }, + { + "epoch": 2.5069834644405202, + "grad_norm": 0.762374758720398, + "learning_rate": 2.7896497815521128e-05, + "loss": 0.3188, + "step": 7808 + }, + { + "epoch": 2.5073045432653718, + "grad_norm": 1.0830848217010498, + "learning_rate": 2.7872315484213952e-05, + "loss": 0.4268, + "step": 7809 + }, + { + "epoch": 2.507625622090223, + "grad_norm": 1.1398093700408936, + "learning_rate": 2.7848141941446347e-05, + "loss": 0.4128, + "step": 7810 + }, + { + "epoch": 2.507946700915075, + "grad_norm": 0.8498218059539795, + "learning_rate": 2.7823977190163786e-05, + "loss": 0.2717, + "step": 7811 + }, + { + "epoch": 2.508267779739926, + "grad_norm": 1.4102894067764282, + "learning_rate": 2.7799821233310674e-05, + "loss": 0.4987, + "step": 7812 + }, + { + "epoch": 2.5085888585647775, + "grad_norm": 1.8042616844177246, + "learning_rate": 2.7775674073830337e-05, + "loss": 0.3982, + "step": 7813 + }, + { + "epoch": 2.508909937389629, + "grad_norm": 0.7573620080947876, + "learning_rate": 2.775153571466502e-05, + "loss": 0.3681, + "step": 7814 + }, + { + "epoch": 2.5092310162144806, + "grad_norm": 1.1396507024765015, + "learning_rate": 2.772740615875594e-05, + "loss": 0.2719, + "step": 7815 + }, + { + "epoch": 2.509552095039332, + "grad_norm": 0.5033369064331055, + "learning_rate": 2.770328540904319e-05, + "loss": 0.2497, + "step": 7816 + }, + { + "epoch": 2.5098731738641837, + "grad_norm": 0.7801833152770996, + "learning_rate": 2.7679173468465812e-05, + "loss": 0.2883, + "step": 7817 + }, + { + "epoch": 2.5101942526890353, + "grad_norm": 0.8423357009887695, + "learning_rate": 2.7655070339961776e-05, + "loss": 0.3138, + "step": 7818 + }, + { + "epoch": 2.5105153315138864, + "grad_norm": 0.672028124332428, + "learning_rate": 2.7630976026467968e-05, + "loss": 0.2912, + "step": 7819 + }, + { + "epoch": 2.5108364103387384, + "grad_norm": 0.8141403198242188, + "learning_rate": 2.7606890530920195e-05, + "loss": 0.3421, + "step": 7820 + }, + { + "epoch": 2.5111574891635895, + "grad_norm": 0.9063519239425659, + "learning_rate": 2.7582813856253275e-05, + "loss": 0.3336, + "step": 7821 + }, + { + "epoch": 2.511478567988441, + "grad_norm": 0.767846941947937, + "learning_rate": 2.755874600540078e-05, + "loss": 0.3033, + "step": 7822 + }, + { + "epoch": 2.5117996468132926, + "grad_norm": 2.2708170413970947, + "learning_rate": 2.753468698129533e-05, + "loss": 0.4001, + "step": 7823 + }, + { + "epoch": 2.512120725638144, + "grad_norm": 1.0831410884857178, + "learning_rate": 2.7510636786868514e-05, + "loss": 0.3136, + "step": 7824 + }, + { + "epoch": 2.5124418044629957, + "grad_norm": 0.8670080900192261, + "learning_rate": 2.7486595425050665e-05, + "loss": 0.3511, + "step": 7825 + }, + { + "epoch": 2.5127628832878472, + "grad_norm": 0.7800338268280029, + "learning_rate": 2.746256289877126e-05, + "loss": 0.3466, + "step": 7826 + }, + { + "epoch": 2.513083962112699, + "grad_norm": 0.871461033821106, + "learning_rate": 2.743853921095848e-05, + "loss": 0.2794, + "step": 7827 + }, + { + "epoch": 2.51340504093755, + "grad_norm": 0.7801231741905212, + "learning_rate": 2.741452436453963e-05, + "loss": 0.3329, + "step": 7828 + }, + { + "epoch": 2.513726119762402, + "grad_norm": 0.9049973487854004, + "learning_rate": 2.7390518362440808e-05, + "loss": 0.3623, + "step": 7829 + }, + { + "epoch": 2.514047198587253, + "grad_norm": 0.6395286321640015, + "learning_rate": 2.736652120758708e-05, + "loss": 0.7761, + "step": 7830 + }, + { + "epoch": 2.5143682774121046, + "grad_norm": 0.4424824118614197, + "learning_rate": 2.734253290290242e-05, + "loss": 0.6206, + "step": 7831 + }, + { + "epoch": 2.514689356236956, + "grad_norm": 0.5469640493392944, + "learning_rate": 2.7318553451309726e-05, + "loss": 0.3887, + "step": 7832 + }, + { + "epoch": 2.5150104350618077, + "grad_norm": 0.3367752134799957, + "learning_rate": 2.7294582855730832e-05, + "loss": 0.0776, + "step": 7833 + }, + { + "epoch": 2.515331513886659, + "grad_norm": 0.5025734305381775, + "learning_rate": 2.727062111908647e-05, + "loss": 0.2786, + "step": 7834 + }, + { + "epoch": 2.5156525927115108, + "grad_norm": 0.4243975281715393, + "learning_rate": 2.7246668244296323e-05, + "loss": 0.2066, + "step": 7835 + }, + { + "epoch": 2.5159736715363623, + "grad_norm": 0.6841888427734375, + "learning_rate": 2.722272423427896e-05, + "loss": 0.3834, + "step": 7836 + }, + { + "epoch": 2.5162947503612134, + "grad_norm": 0.6961179375648499, + "learning_rate": 2.7198789091951902e-05, + "loss": 0.4068, + "step": 7837 + }, + { + "epoch": 2.5166158291860654, + "grad_norm": 0.9804593920707703, + "learning_rate": 2.717486282023153e-05, + "loss": 0.3961, + "step": 7838 + }, + { + "epoch": 2.5169369080109165, + "grad_norm": 0.7120915651321411, + "learning_rate": 2.715094542203327e-05, + "loss": 0.3416, + "step": 7839 + }, + { + "epoch": 2.517257986835768, + "grad_norm": 0.7814658284187317, + "learning_rate": 2.7127036900271317e-05, + "loss": 0.3016, + "step": 7840 + }, + { + "epoch": 2.5175790656606196, + "grad_norm": 0.5942254662513733, + "learning_rate": 2.7103137257858868e-05, + "loss": 0.281, + "step": 7841 + }, + { + "epoch": 2.517900144485471, + "grad_norm": 0.8657404780387878, + "learning_rate": 2.707924649770802e-05, + "loss": 0.3926, + "step": 7842 + }, + { + "epoch": 2.5182212233103227, + "grad_norm": 0.7496175765991211, + "learning_rate": 2.7055364622729773e-05, + "loss": 0.3238, + "step": 7843 + }, + { + "epoch": 2.5185423021351743, + "grad_norm": 0.9622612595558167, + "learning_rate": 2.7031491635834137e-05, + "loss": 0.3779, + "step": 7844 + }, + { + "epoch": 2.518863380960026, + "grad_norm": 1.0655999183654785, + "learning_rate": 2.700762753992985e-05, + "loss": 0.2991, + "step": 7845 + }, + { + "epoch": 2.519184459784877, + "grad_norm": 0.988420844078064, + "learning_rate": 2.698377233792476e-05, + "loss": 0.4061, + "step": 7846 + }, + { + "epoch": 2.519505538609729, + "grad_norm": 0.6470503211021423, + "learning_rate": 2.6959926032725535e-05, + "loss": 0.2876, + "step": 7847 + }, + { + "epoch": 2.51982661743458, + "grad_norm": 0.7641369104385376, + "learning_rate": 2.6936088627237765e-05, + "loss": 0.2941, + "step": 7848 + }, + { + "epoch": 2.5201476962594316, + "grad_norm": 0.9086601138114929, + "learning_rate": 2.6912260124366006e-05, + "loss": 0.3384, + "step": 7849 + }, + { + "epoch": 2.520468775084283, + "grad_norm": 0.7186222672462463, + "learning_rate": 2.688844052701359e-05, + "loss": 0.2905, + "step": 7850 + }, + { + "epoch": 2.5207898539091347, + "grad_norm": 0.9355291724205017, + "learning_rate": 2.6864629838082956e-05, + "loss": 0.4228, + "step": 7851 + }, + { + "epoch": 2.5211109327339862, + "grad_norm": 0.6973735690116882, + "learning_rate": 2.6840828060475332e-05, + "loss": 0.2988, + "step": 7852 + }, + { + "epoch": 2.521432011558838, + "grad_norm": 0.8942835330963135, + "learning_rate": 2.681703519709089e-05, + "loss": 0.3534, + "step": 7853 + }, + { + "epoch": 2.5217530903836893, + "grad_norm": 0.9520301222801208, + "learning_rate": 2.679325125082872e-05, + "loss": 0.3538, + "step": 7854 + }, + { + "epoch": 2.5220741692085404, + "grad_norm": 0.8248050212860107, + "learning_rate": 2.676947622458683e-05, + "loss": 0.332, + "step": 7855 + }, + { + "epoch": 2.5223952480333924, + "grad_norm": 1.0398519039154053, + "learning_rate": 2.6745710121262136e-05, + "loss": 0.4481, + "step": 7856 + }, + { + "epoch": 2.5227163268582435, + "grad_norm": 0.9661288857460022, + "learning_rate": 2.672195294375045e-05, + "loss": 0.3479, + "step": 7857 + }, + { + "epoch": 2.523037405683095, + "grad_norm": 0.7361083030700684, + "learning_rate": 2.6698204694946527e-05, + "loss": 0.2805, + "step": 7858 + }, + { + "epoch": 2.5233584845079466, + "grad_norm": 0.7523530125617981, + "learning_rate": 2.6674465377744017e-05, + "loss": 0.2939, + "step": 7859 + }, + { + "epoch": 2.523679563332798, + "grad_norm": 1.0147556066513062, + "learning_rate": 2.6650734995035477e-05, + "loss": 0.3913, + "step": 7860 + }, + { + "epoch": 2.5240006421576497, + "grad_norm": 0.7158978581428528, + "learning_rate": 2.6627013549712355e-05, + "loss": 0.2834, + "step": 7861 + }, + { + "epoch": 2.5243217209825013, + "grad_norm": 0.8579987287521362, + "learning_rate": 2.660330104466513e-05, + "loss": 0.3325, + "step": 7862 + }, + { + "epoch": 2.524642799807353, + "grad_norm": 0.8115719556808472, + "learning_rate": 2.657959748278297e-05, + "loss": 0.3175, + "step": 7863 + }, + { + "epoch": 2.524963878632204, + "grad_norm": 0.7678377032279968, + "learning_rate": 2.655590286695422e-05, + "loss": 0.3265, + "step": 7864 + }, + { + "epoch": 2.525284957457056, + "grad_norm": 0.8553791642189026, + "learning_rate": 2.6532217200065858e-05, + "loss": 0.3043, + "step": 7865 + }, + { + "epoch": 2.525606036281907, + "grad_norm": 0.8253560662269592, + "learning_rate": 2.6508540485004006e-05, + "loss": 0.2808, + "step": 7866 + }, + { + "epoch": 2.5259271151067586, + "grad_norm": 0.8303504586219788, + "learning_rate": 2.6484872724653608e-05, + "loss": 0.3263, + "step": 7867 + }, + { + "epoch": 2.52624819393161, + "grad_norm": 0.8876441717147827, + "learning_rate": 2.646121392189841e-05, + "loss": 0.3131, + "step": 7868 + }, + { + "epoch": 2.5265692727564617, + "grad_norm": 0.9586053490638733, + "learning_rate": 2.6437564079621267e-05, + "loss": 0.3133, + "step": 7869 + }, + { + "epoch": 2.5268903515813133, + "grad_norm": 1.3426408767700195, + "learning_rate": 2.6413923200703794e-05, + "loss": 0.3845, + "step": 7870 + }, + { + "epoch": 2.527211430406165, + "grad_norm": 0.7227518558502197, + "learning_rate": 2.639029128802657e-05, + "loss": 0.2797, + "step": 7871 + }, + { + "epoch": 2.5275325092310164, + "grad_norm": 0.6822044253349304, + "learning_rate": 2.636666834446907e-05, + "loss": 0.2739, + "step": 7872 + }, + { + "epoch": 2.5278535880558675, + "grad_norm": 0.6981026530265808, + "learning_rate": 2.634305437290968e-05, + "loss": 0.2966, + "step": 7873 + }, + { + "epoch": 2.5281746668807195, + "grad_norm": 0.6758238673210144, + "learning_rate": 2.631944937622569e-05, + "loss": 0.2845, + "step": 7874 + }, + { + "epoch": 2.5284957457055706, + "grad_norm": 0.8002867698669434, + "learning_rate": 2.6295853357293298e-05, + "loss": 0.3453, + "step": 7875 + }, + { + "epoch": 2.528816824530422, + "grad_norm": 0.7498476505279541, + "learning_rate": 2.6272266318987603e-05, + "loss": 0.3061, + "step": 7876 + }, + { + "epoch": 2.5291379033552737, + "grad_norm": 0.633752703666687, + "learning_rate": 2.624868826418262e-05, + "loss": 0.2688, + "step": 7877 + }, + { + "epoch": 2.529458982180125, + "grad_norm": 0.6049948334693909, + "learning_rate": 2.6225119195751258e-05, + "loss": 0.2902, + "step": 7878 + }, + { + "epoch": 2.5297800610049768, + "grad_norm": 0.6882827877998352, + "learning_rate": 2.6201559116565345e-05, + "loss": 0.3093, + "step": 7879 + }, + { + "epoch": 2.5301011398298283, + "grad_norm": 0.4292110800743103, + "learning_rate": 2.6178008029495592e-05, + "loss": 0.7076, + "step": 7880 + }, + { + "epoch": 2.53042221865468, + "grad_norm": 0.42428478598594666, + "learning_rate": 2.615446593741161e-05, + "loss": 0.3113, + "step": 7881 + }, + { + "epoch": 2.530743297479531, + "grad_norm": 0.4815612733364105, + "learning_rate": 2.613093284318201e-05, + "loss": 0.3371, + "step": 7882 + }, + { + "epoch": 2.531064376304383, + "grad_norm": 0.5496947765350342, + "learning_rate": 2.6107408749674122e-05, + "loss": 0.2977, + "step": 7883 + }, + { + "epoch": 2.531385455129234, + "grad_norm": 0.33660855889320374, + "learning_rate": 2.6083893659754356e-05, + "loss": 0.0772, + "step": 7884 + }, + { + "epoch": 2.5317065339540856, + "grad_norm": 0.40665629506111145, + "learning_rate": 2.606038757628798e-05, + "loss": 0.1418, + "step": 7885 + }, + { + "epoch": 2.532027612778937, + "grad_norm": 0.2342071682214737, + "learning_rate": 2.603689050213902e-05, + "loss": 0.0695, + "step": 7886 + }, + { + "epoch": 2.5323486916037887, + "grad_norm": 0.29153773188591003, + "learning_rate": 2.6013402440170676e-05, + "loss": 0.116, + "step": 7887 + }, + { + "epoch": 2.5326697704286403, + "grad_norm": 1.029974341392517, + "learning_rate": 2.5989923393244742e-05, + "loss": 0.4608, + "step": 7888 + }, + { + "epoch": 2.532990849253492, + "grad_norm": 0.7505784034729004, + "learning_rate": 2.5966453364222186e-05, + "loss": 0.3698, + "step": 7889 + }, + { + "epoch": 2.5333119280783434, + "grad_norm": 0.8639885783195496, + "learning_rate": 2.5942992355962727e-05, + "loss": 0.313, + "step": 7890 + }, + { + "epoch": 2.5336330069031945, + "grad_norm": 0.9737856388092041, + "learning_rate": 2.5919540371325e-05, + "loss": 0.431, + "step": 7891 + }, + { + "epoch": 2.5339540857280465, + "grad_norm": 0.6815305352210999, + "learning_rate": 2.5896097413166564e-05, + "loss": 0.2643, + "step": 7892 + }, + { + "epoch": 2.5342751645528976, + "grad_norm": 0.9808143377304077, + "learning_rate": 2.5872663484343884e-05, + "loss": 0.4601, + "step": 7893 + }, + { + "epoch": 2.534596243377749, + "grad_norm": 0.9797862768173218, + "learning_rate": 2.584923858771231e-05, + "loss": 0.3746, + "step": 7894 + }, + { + "epoch": 2.5349173222026007, + "grad_norm": 0.7613912224769592, + "learning_rate": 2.582582272612609e-05, + "loss": 0.3253, + "step": 7895 + }, + { + "epoch": 2.5352384010274522, + "grad_norm": 0.8609127402305603, + "learning_rate": 2.580241590243837e-05, + "loss": 0.3091, + "step": 7896 + }, + { + "epoch": 2.535559479852304, + "grad_norm": 0.8038222789764404, + "learning_rate": 2.5779018119501208e-05, + "loss": 0.3423, + "step": 7897 + }, + { + "epoch": 2.5358805586771553, + "grad_norm": 0.8532456159591675, + "learning_rate": 2.575562938016556e-05, + "loss": 0.3167, + "step": 7898 + }, + { + "epoch": 2.536201637502007, + "grad_norm": 0.8715474009513855, + "learning_rate": 2.573224968728123e-05, + "loss": 0.3521, + "step": 7899 + }, + { + "epoch": 2.536522716326858, + "grad_norm": 0.95112144947052, + "learning_rate": 2.5708879043697054e-05, + "loss": 0.3424, + "step": 7900 + }, + { + "epoch": 2.53684379515171, + "grad_norm": 0.8659119009971619, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.3874, + "step": 7901 + }, + { + "epoch": 2.537164873976561, + "grad_norm": 0.9276009798049927, + "learning_rate": 2.566216491581841e-05, + "loss": 0.3019, + "step": 7902 + }, + { + "epoch": 2.5374859528014126, + "grad_norm": 0.8854154944419861, + "learning_rate": 2.5638821437215944e-05, + "loss": 0.3359, + "step": 7903 + }, + { + "epoch": 2.537807031626264, + "grad_norm": 0.9696468114852905, + "learning_rate": 2.561548701929749e-05, + "loss": 0.4029, + "step": 7904 + }, + { + "epoch": 2.5381281104511157, + "grad_norm": 0.8522638082504272, + "learning_rate": 2.5592161664906368e-05, + "loss": 0.3034, + "step": 7905 + }, + { + "epoch": 2.5384491892759673, + "grad_norm": 0.8860786557197571, + "learning_rate": 2.5568845376884587e-05, + "loss": 0.2844, + "step": 7906 + }, + { + "epoch": 2.538770268100819, + "grad_norm": 0.919465184211731, + "learning_rate": 2.554553815807328e-05, + "loss": 0.3504, + "step": 7907 + }, + { + "epoch": 2.5390913469256704, + "grad_norm": 0.8986783623695374, + "learning_rate": 2.5522240011312247e-05, + "loss": 0.3556, + "step": 7908 + }, + { + "epoch": 2.5394124257505215, + "grad_norm": 0.5819917321205139, + "learning_rate": 2.549895093944039e-05, + "loss": 0.2474, + "step": 7909 + }, + { + "epoch": 2.5397335045753735, + "grad_norm": 0.701481282711029, + "learning_rate": 2.547567094529537e-05, + "loss": 0.2874, + "step": 7910 + }, + { + "epoch": 2.5400545834002246, + "grad_norm": 0.9206550121307373, + "learning_rate": 2.5452400031713785e-05, + "loss": 0.327, + "step": 7911 + }, + { + "epoch": 2.540375662225076, + "grad_norm": 0.8680040836334229, + "learning_rate": 2.542913820153113e-05, + "loss": 0.3246, + "step": 7912 + }, + { + "epoch": 2.5406967410499277, + "grad_norm": 0.9277855157852173, + "learning_rate": 2.540588545758179e-05, + "loss": 0.2907, + "step": 7913 + }, + { + "epoch": 2.5410178198747793, + "grad_norm": 0.8211881518363953, + "learning_rate": 2.5382641802699035e-05, + "loss": 0.2917, + "step": 7914 + }, + { + "epoch": 2.541338898699631, + "grad_norm": 1.1107295751571655, + "learning_rate": 2.535940723971505e-05, + "loss": 0.4215, + "step": 7915 + }, + { + "epoch": 2.5416599775244824, + "grad_norm": 0.9217866659164429, + "learning_rate": 2.5336181771460876e-05, + "loss": 0.3113, + "step": 7916 + }, + { + "epoch": 2.541981056349334, + "grad_norm": 0.8937177062034607, + "learning_rate": 2.5312965400766474e-05, + "loss": 0.2755, + "step": 7917 + }, + { + "epoch": 2.542302135174185, + "grad_norm": 0.5329973697662354, + "learning_rate": 2.5289758130460683e-05, + "loss": 0.2237, + "step": 7918 + }, + { + "epoch": 2.542623213999037, + "grad_norm": 0.9037802815437317, + "learning_rate": 2.5266559963371216e-05, + "loss": 0.3482, + "step": 7919 + }, + { + "epoch": 2.542944292823888, + "grad_norm": 1.0273380279541016, + "learning_rate": 2.5243370902324792e-05, + "loss": 0.3545, + "step": 7920 + }, + { + "epoch": 2.5432653716487397, + "grad_norm": 0.8402252197265625, + "learning_rate": 2.5220190950146827e-05, + "loss": 0.3273, + "step": 7921 + }, + { + "epoch": 2.543586450473591, + "grad_norm": 1.0652894973754883, + "learning_rate": 2.5197020109661772e-05, + "loss": 0.348, + "step": 7922 + }, + { + "epoch": 2.5439075292984428, + "grad_norm": 1.2122594118118286, + "learning_rate": 2.5173858383692906e-05, + "loss": 0.4131, + "step": 7923 + }, + { + "epoch": 2.5442286081232943, + "grad_norm": 1.0098888874053955, + "learning_rate": 2.51507057750624e-05, + "loss": 0.3763, + "step": 7924 + }, + { + "epoch": 2.544549686948146, + "grad_norm": 0.7478073835372925, + "learning_rate": 2.512756228659141e-05, + "loss": 0.2882, + "step": 7925 + }, + { + "epoch": 2.5448707657729974, + "grad_norm": 0.486265629529953, + "learning_rate": 2.5104427921099782e-05, + "loss": 0.2455, + "step": 7926 + }, + { + "epoch": 2.5451918445978485, + "grad_norm": 0.5730571746826172, + "learning_rate": 2.508130268140646e-05, + "loss": 0.2751, + "step": 7927 + }, + { + "epoch": 2.5455129234227005, + "grad_norm": 0.6584112048149109, + "learning_rate": 2.5058186570329156e-05, + "loss": 0.2707, + "step": 7928 + }, + { + "epoch": 2.5458340022475516, + "grad_norm": 0.6145785450935364, + "learning_rate": 2.5035079590684497e-05, + "loss": 0.323, + "step": 7929 + }, + { + "epoch": 2.546155081072403, + "grad_norm": 0.4932810366153717, + "learning_rate": 2.5011981745288015e-05, + "loss": 0.5853, + "step": 7930 + }, + { + "epoch": 2.5464761598972547, + "grad_norm": 0.5000682473182678, + "learning_rate": 2.4988893036954043e-05, + "loss": 0.3042, + "step": 7931 + }, + { + "epoch": 2.5467972387221063, + "grad_norm": 0.4661361873149872, + "learning_rate": 2.496581346849596e-05, + "loss": 0.2213, + "step": 7932 + }, + { + "epoch": 2.547118317546958, + "grad_norm": 0.6509041786193848, + "learning_rate": 2.4942743042725892e-05, + "loss": 0.3622, + "step": 7933 + }, + { + "epoch": 2.5474393963718094, + "grad_norm": 0.3958999216556549, + "learning_rate": 2.4919681762454918e-05, + "loss": 0.2316, + "step": 7934 + }, + { + "epoch": 2.547760475196661, + "grad_norm": 0.45946624875068665, + "learning_rate": 2.4896629630492973e-05, + "loss": 0.2446, + "step": 7935 + }, + { + "epoch": 2.548081554021512, + "grad_norm": 0.3704213500022888, + "learning_rate": 2.4873586649648894e-05, + "loss": 0.0943, + "step": 7936 + }, + { + "epoch": 2.548402632846364, + "grad_norm": 0.6541774272918701, + "learning_rate": 2.48505528227304e-05, + "loss": 0.2996, + "step": 7937 + }, + { + "epoch": 2.548723711671215, + "grad_norm": 0.9613190293312073, + "learning_rate": 2.48275281525441e-05, + "loss": 0.564, + "step": 7938 + }, + { + "epoch": 2.5490447904960667, + "grad_norm": 0.8291346430778503, + "learning_rate": 2.480451264189546e-05, + "loss": 0.4429, + "step": 7939 + }, + { + "epoch": 2.5493658693209182, + "grad_norm": 0.6980516910552979, + "learning_rate": 2.4781506293588873e-05, + "loss": 0.3154, + "step": 7940 + }, + { + "epoch": 2.54968694814577, + "grad_norm": 0.7115036249160767, + "learning_rate": 2.4758509110427575e-05, + "loss": 0.2834, + "step": 7941 + }, + { + "epoch": 2.5500080269706213, + "grad_norm": 0.7645947933197021, + "learning_rate": 2.4735521095213687e-05, + "loss": 0.3213, + "step": 7942 + }, + { + "epoch": 2.550329105795473, + "grad_norm": 0.8161126971244812, + "learning_rate": 2.4712542250748304e-05, + "loss": 0.3376, + "step": 7943 + }, + { + "epoch": 2.5506501846203244, + "grad_norm": 0.9567040801048279, + "learning_rate": 2.4689572579831222e-05, + "loss": 0.379, + "step": 7944 + }, + { + "epoch": 2.5509712634451756, + "grad_norm": 0.878835916519165, + "learning_rate": 2.4666612085261342e-05, + "loss": 0.3387, + "step": 7945 + }, + { + "epoch": 2.5512923422700275, + "grad_norm": 0.9040188193321228, + "learning_rate": 2.464366076983623e-05, + "loss": 0.3488, + "step": 7946 + }, + { + "epoch": 2.5516134210948787, + "grad_norm": 0.9474675059318542, + "learning_rate": 2.4620718636352457e-05, + "loss": 0.3582, + "step": 7947 + }, + { + "epoch": 2.55193449991973, + "grad_norm": 0.6305925250053406, + "learning_rate": 2.4597785687605513e-05, + "loss": 0.256, + "step": 7948 + }, + { + "epoch": 2.5522555787445818, + "grad_norm": 0.7935699820518494, + "learning_rate": 2.4574861926389615e-05, + "loss": 0.3362, + "step": 7949 + }, + { + "epoch": 2.5525766575694333, + "grad_norm": 0.9189661741256714, + "learning_rate": 2.4551947355498027e-05, + "loss": 0.3227, + "step": 7950 + }, + { + "epoch": 2.552897736394285, + "grad_norm": 1.0050448179244995, + "learning_rate": 2.45290419777228e-05, + "loss": 0.4139, + "step": 7951 + }, + { + "epoch": 2.5532188152191364, + "grad_norm": 1.0046634674072266, + "learning_rate": 2.4506145795854873e-05, + "loss": 0.4224, + "step": 7952 + }, + { + "epoch": 2.553539894043988, + "grad_norm": 0.7242197394371033, + "learning_rate": 2.4483258812684096e-05, + "loss": 0.3028, + "step": 7953 + }, + { + "epoch": 2.553860972868839, + "grad_norm": 0.6570572257041931, + "learning_rate": 2.4460381030999158e-05, + "loss": 0.2526, + "step": 7954 + }, + { + "epoch": 2.554182051693691, + "grad_norm": 0.7426683902740479, + "learning_rate": 2.443751245358765e-05, + "loss": 0.2802, + "step": 7955 + }, + { + "epoch": 2.554503130518542, + "grad_norm": 0.7006813883781433, + "learning_rate": 2.441465308323605e-05, + "loss": 0.3058, + "step": 7956 + }, + { + "epoch": 2.5548242093433937, + "grad_norm": 0.7258092164993286, + "learning_rate": 2.43918029227297e-05, + "loss": 0.2643, + "step": 7957 + }, + { + "epoch": 2.5551452881682453, + "grad_norm": 1.4597491025924683, + "learning_rate": 2.436896197485282e-05, + "loss": 0.3756, + "step": 7958 + }, + { + "epoch": 2.555466366993097, + "grad_norm": 0.7586961984634399, + "learning_rate": 2.43461302423885e-05, + "loss": 0.3493, + "step": 7959 + }, + { + "epoch": 2.5557874458179484, + "grad_norm": 1.1637358665466309, + "learning_rate": 2.4323307728118738e-05, + "loss": 0.4117, + "step": 7960 + }, + { + "epoch": 2.5561085246428, + "grad_norm": 0.764044463634491, + "learning_rate": 2.4300494434824373e-05, + "loss": 0.2827, + "step": 7961 + }, + { + "epoch": 2.5564296034676515, + "grad_norm": 0.7120959162712097, + "learning_rate": 2.4277690365285112e-05, + "loss": 0.2924, + "step": 7962 + }, + { + "epoch": 2.5567506822925026, + "grad_norm": 0.6114968061447144, + "learning_rate": 2.425489552227964e-05, + "loss": 0.2665, + "step": 7963 + }, + { + "epoch": 2.5570717611173546, + "grad_norm": 0.710640013217926, + "learning_rate": 2.4232109908585377e-05, + "loss": 0.3022, + "step": 7964 + }, + { + "epoch": 2.5573928399422057, + "grad_norm": 0.6761636137962341, + "learning_rate": 2.420933352697865e-05, + "loss": 0.2818, + "step": 7965 + }, + { + "epoch": 2.5577139187670572, + "grad_norm": 0.927751898765564, + "learning_rate": 2.4186566380234798e-05, + "loss": 0.363, + "step": 7966 + }, + { + "epoch": 2.5580349975919088, + "grad_norm": 0.8839023113250732, + "learning_rate": 2.4163808471127812e-05, + "loss": 0.3743, + "step": 7967 + }, + { + "epoch": 2.5583560764167603, + "grad_norm": 0.7871977090835571, + "learning_rate": 2.4141059802430777e-05, + "loss": 0.3433, + "step": 7968 + }, + { + "epoch": 2.558677155241612, + "grad_norm": 0.7037361860275269, + "learning_rate": 2.411832037691545e-05, + "loss": 0.2797, + "step": 7969 + }, + { + "epoch": 2.5589982340664634, + "grad_norm": 0.6723461151123047, + "learning_rate": 2.4095590197352635e-05, + "loss": 0.2862, + "step": 7970 + }, + { + "epoch": 2.559319312891315, + "grad_norm": 1.6565648317337036, + "learning_rate": 2.407286926651192e-05, + "loss": 0.3784, + "step": 7971 + }, + { + "epoch": 2.559640391716166, + "grad_norm": 0.8423799276351929, + "learning_rate": 2.405015758716177e-05, + "loss": 0.3592, + "step": 7972 + }, + { + "epoch": 2.559961470541018, + "grad_norm": 1.355826735496521, + "learning_rate": 2.4027455162069567e-05, + "loss": 0.3569, + "step": 7973 + }, + { + "epoch": 2.560282549365869, + "grad_norm": 0.8471829891204834, + "learning_rate": 2.4004761994001435e-05, + "loss": 0.2898, + "step": 7974 + }, + { + "epoch": 2.5606036281907207, + "grad_norm": 0.7122255563735962, + "learning_rate": 2.3982078085722582e-05, + "loss": 0.3251, + "step": 7975 + }, + { + "epoch": 2.5609247070155723, + "grad_norm": 0.792010486125946, + "learning_rate": 2.3959403439996907e-05, + "loss": 0.2844, + "step": 7976 + }, + { + "epoch": 2.561245785840424, + "grad_norm": 0.43959325551986694, + "learning_rate": 2.3936738059587282e-05, + "loss": 0.2479, + "step": 7977 + }, + { + "epoch": 2.5615668646652754, + "grad_norm": 0.33454373478889465, + "learning_rate": 2.3914081947255397e-05, + "loss": 0.2251, + "step": 7978 + }, + { + "epoch": 2.561887943490127, + "grad_norm": 0.6790142059326172, + "learning_rate": 2.389143510576184e-05, + "loss": 0.2833, + "step": 7979 + }, + { + "epoch": 2.5622090223149785, + "grad_norm": 0.5404451489448547, + "learning_rate": 2.3868797537866016e-05, + "loss": 0.6905, + "step": 7980 + }, + { + "epoch": 2.5625301011398296, + "grad_norm": 0.36907678842544556, + "learning_rate": 2.3846169246326343e-05, + "loss": 0.3373, + "step": 7981 + }, + { + "epoch": 2.5628511799646816, + "grad_norm": 0.5592865347862244, + "learning_rate": 2.3823550233899915e-05, + "loss": 0.4475, + "step": 7982 + }, + { + "epoch": 2.5631722587895327, + "grad_norm": 0.4797590374946594, + "learning_rate": 2.3800940503342828e-05, + "loss": 0.1677, + "step": 7983 + }, + { + "epoch": 2.5634933376143842, + "grad_norm": 0.6905731558799744, + "learning_rate": 2.3778340057409998e-05, + "loss": 0.2087, + "step": 7984 + }, + { + "epoch": 2.563814416439236, + "grad_norm": 0.22037728130817413, + "learning_rate": 2.37557488988552e-05, + "loss": 0.0671, + "step": 7985 + }, + { + "epoch": 2.5641354952640873, + "grad_norm": 0.41251930594444275, + "learning_rate": 2.373316703043119e-05, + "loss": 0.1551, + "step": 7986 + }, + { + "epoch": 2.564456574088939, + "grad_norm": 0.3948359191417694, + "learning_rate": 2.3710594454889378e-05, + "loss": 0.1627, + "step": 7987 + }, + { + "epoch": 2.5647776529137905, + "grad_norm": 0.5292665362358093, + "learning_rate": 2.3688031174980275e-05, + "loss": 0.2721, + "step": 7988 + }, + { + "epoch": 2.565098731738642, + "grad_norm": 0.8082164525985718, + "learning_rate": 2.3665477193453034e-05, + "loss": 0.3934, + "step": 7989 + }, + { + "epoch": 2.565419810563493, + "grad_norm": 0.9454381465911865, + "learning_rate": 2.3642932513055884e-05, + "loss": 0.4163, + "step": 7990 + }, + { + "epoch": 2.565740889388345, + "grad_norm": 0.889689564704895, + "learning_rate": 2.362039713653581e-05, + "loss": 0.4446, + "step": 7991 + }, + { + "epoch": 2.566061968213196, + "grad_norm": 0.7542763948440552, + "learning_rate": 2.359787106663861e-05, + "loss": 0.3018, + "step": 7992 + }, + { + "epoch": 2.5663830470380478, + "grad_norm": 0.7268791198730469, + "learning_rate": 2.35753543061091e-05, + "loss": 0.3345, + "step": 7993 + }, + { + "epoch": 2.5667041258628993, + "grad_norm": 0.6492055654525757, + "learning_rate": 2.3552846857690846e-05, + "loss": 0.3103, + "step": 7994 + }, + { + "epoch": 2.567025204687751, + "grad_norm": 0.7166942358016968, + "learning_rate": 2.3530348724126307e-05, + "loss": 0.2754, + "step": 7995 + }, + { + "epoch": 2.5673462835126024, + "grad_norm": 0.9371656179428101, + "learning_rate": 2.3507859908156827e-05, + "loss": 0.5017, + "step": 7996 + }, + { + "epoch": 2.567667362337454, + "grad_norm": 1.0456211566925049, + "learning_rate": 2.3485380412522585e-05, + "loss": 0.3592, + "step": 7997 + }, + { + "epoch": 2.5679884411623055, + "grad_norm": 0.6862274408340454, + "learning_rate": 2.3462910239962654e-05, + "loss": 0.2818, + "step": 7998 + }, + { + "epoch": 2.5683095199871566, + "grad_norm": 1.1700456142425537, + "learning_rate": 2.3440449393214948e-05, + "loss": 0.3444, + "step": 7999 + }, + { + "epoch": 2.5686305988120086, + "grad_norm": 0.9991983771324158, + "learning_rate": 2.341799787501625e-05, + "loss": 0.4639, + "step": 8000 + }, + { + "epoch": 2.5689516776368597, + "grad_norm": 0.9430015683174133, + "learning_rate": 2.339555568810221e-05, + "loss": 0.4327, + "step": 8001 + }, + { + "epoch": 2.5692727564617113, + "grad_norm": 0.7065159678459167, + "learning_rate": 2.337312283520735e-05, + "loss": 0.2867, + "step": 8002 + }, + { + "epoch": 2.569593835286563, + "grad_norm": 0.9828401803970337, + "learning_rate": 2.3350699319065026e-05, + "loss": 0.3369, + "step": 8003 + }, + { + "epoch": 2.5699149141114144, + "grad_norm": 2.36687970161438, + "learning_rate": 2.3328285142407503e-05, + "loss": 0.3983, + "step": 8004 + }, + { + "epoch": 2.570235992936266, + "grad_norm": 0.9053291082382202, + "learning_rate": 2.3305880307965833e-05, + "loss": 0.3354, + "step": 8005 + }, + { + "epoch": 2.5705570717611175, + "grad_norm": 1.2159992456436157, + "learning_rate": 2.328348481847006e-05, + "loss": 0.5254, + "step": 8006 + }, + { + "epoch": 2.570878150585969, + "grad_norm": 0.8332436680793762, + "learning_rate": 2.3261098676648907e-05, + "loss": 0.324, + "step": 8007 + }, + { + "epoch": 2.57119922941082, + "grad_norm": 0.9708757400512695, + "learning_rate": 2.323872188523013e-05, + "loss": 0.3851, + "step": 8008 + }, + { + "epoch": 2.571520308235672, + "grad_norm": 1.2236950397491455, + "learning_rate": 2.321635444694028e-05, + "loss": 0.3511, + "step": 8009 + }, + { + "epoch": 2.5718413870605232, + "grad_norm": 0.8926087617874146, + "learning_rate": 2.319399636450468e-05, + "loss": 0.3669, + "step": 8010 + }, + { + "epoch": 2.572162465885375, + "grad_norm": 0.9284133315086365, + "learning_rate": 2.3171647640647687e-05, + "loss": 0.348, + "step": 8011 + }, + { + "epoch": 2.5724835447102263, + "grad_norm": 0.856762170791626, + "learning_rate": 2.3149308278092342e-05, + "loss": 0.3501, + "step": 8012 + }, + { + "epoch": 2.572804623535078, + "grad_norm": 0.7136096358299255, + "learning_rate": 2.3126978279560684e-05, + "loss": 0.3205, + "step": 8013 + }, + { + "epoch": 2.5731257023599294, + "grad_norm": 0.570107102394104, + "learning_rate": 2.3104657647773554e-05, + "loss": 0.2764, + "step": 8014 + }, + { + "epoch": 2.573446781184781, + "grad_norm": 0.7570350766181946, + "learning_rate": 2.3082346385450637e-05, + "loss": 0.3118, + "step": 8015 + }, + { + "epoch": 2.5737678600096325, + "grad_norm": 0.9262259006500244, + "learning_rate": 2.3060044495310505e-05, + "loss": 0.4234, + "step": 8016 + }, + { + "epoch": 2.5740889388344836, + "grad_norm": 0.3663389980792999, + "learning_rate": 2.3037751980070555e-05, + "loss": 0.2028, + "step": 8017 + }, + { + "epoch": 2.5744100176593356, + "grad_norm": 0.8820987343788147, + "learning_rate": 2.3015468842447086e-05, + "loss": 0.3276, + "step": 8018 + }, + { + "epoch": 2.5747310964841867, + "grad_norm": 0.7441263198852539, + "learning_rate": 2.2993195085155205e-05, + "loss": 0.2819, + "step": 8019 + }, + { + "epoch": 2.5750521753090383, + "grad_norm": 0.5265788435935974, + "learning_rate": 2.2970930710908935e-05, + "loss": 0.2229, + "step": 8020 + }, + { + "epoch": 2.57537325413389, + "grad_norm": 0.9287590980529785, + "learning_rate": 2.2948675722421086e-05, + "loss": 0.3445, + "step": 8021 + }, + { + "epoch": 2.5756943329587414, + "grad_norm": 0.8301019072532654, + "learning_rate": 2.2926430122403386e-05, + "loss": 0.3109, + "step": 8022 + }, + { + "epoch": 2.576015411783593, + "grad_norm": 0.7834892868995667, + "learning_rate": 2.2904193913566363e-05, + "loss": 0.35, + "step": 8023 + }, + { + "epoch": 2.5763364906084445, + "grad_norm": 0.5827646851539612, + "learning_rate": 2.2881967098619506e-05, + "loss": 0.256, + "step": 8024 + }, + { + "epoch": 2.576657569433296, + "grad_norm": 0.6390456557273865, + "learning_rate": 2.2859749680270982e-05, + "loss": 0.272, + "step": 8025 + }, + { + "epoch": 2.576978648258147, + "grad_norm": 0.9293005466461182, + "learning_rate": 2.2837541661228025e-05, + "loss": 0.3369, + "step": 8026 + }, + { + "epoch": 2.577299727082999, + "grad_norm": 0.8350579738616943, + "learning_rate": 2.2815343044196525e-05, + "loss": 0.3416, + "step": 8027 + }, + { + "epoch": 2.5776208059078503, + "grad_norm": 0.4396490156650543, + "learning_rate": 2.279315383188132e-05, + "loss": 0.2917, + "step": 8028 + }, + { + "epoch": 2.577941884732702, + "grad_norm": 0.3990170359611511, + "learning_rate": 2.277097402698619e-05, + "loss": 0.2969, + "step": 8029 + }, + { + "epoch": 2.5782629635575534, + "grad_norm": 0.5404382348060608, + "learning_rate": 2.2748803632213557e-05, + "loss": 0.8456, + "step": 8030 + }, + { + "epoch": 2.578584042382405, + "grad_norm": 0.5150883793830872, + "learning_rate": 2.2726642650264895e-05, + "loss": 0.8014, + "step": 8031 + }, + { + "epoch": 2.5789051212072565, + "grad_norm": 0.4833203852176666, + "learning_rate": 2.270449108384044e-05, + "loss": 0.5567, + "step": 8032 + }, + { + "epoch": 2.579226200032108, + "grad_norm": 0.5803343057632446, + "learning_rate": 2.2682348935639274e-05, + "loss": 0.1951, + "step": 8033 + }, + { + "epoch": 2.5795472788569596, + "grad_norm": 0.43384525179862976, + "learning_rate": 2.2660216208359365e-05, + "loss": 0.2521, + "step": 8034 + }, + { + "epoch": 2.5798683576818107, + "grad_norm": 0.3641275465488434, + "learning_rate": 2.2638092904697518e-05, + "loss": 0.1943, + "step": 8035 + }, + { + "epoch": 2.5801894365066627, + "grad_norm": 0.3974734842777252, + "learning_rate": 2.2615979027349387e-05, + "loss": 0.1705, + "step": 8036 + }, + { + "epoch": 2.5805105153315138, + "grad_norm": 0.3340223431587219, + "learning_rate": 2.259387457900948e-05, + "loss": 0.1161, + "step": 8037 + }, + { + "epoch": 2.5808315941563653, + "grad_norm": 0.750033974647522, + "learning_rate": 2.2571779562371153e-05, + "loss": 0.3098, + "step": 8038 + }, + { + "epoch": 2.581152672981217, + "grad_norm": 0.7684246301651001, + "learning_rate": 2.254969398012663e-05, + "loss": 0.3886, + "step": 8039 + }, + { + "epoch": 2.5814737518060684, + "grad_norm": 0.8656273484230042, + "learning_rate": 2.2527617834966954e-05, + "loss": 0.3571, + "step": 8040 + }, + { + "epoch": 2.58179483063092, + "grad_norm": 0.7932442426681519, + "learning_rate": 2.2505551129582047e-05, + "loss": 0.3677, + "step": 8041 + }, + { + "epoch": 2.5821159094557715, + "grad_norm": 0.7484654188156128, + "learning_rate": 2.2483493866660676e-05, + "loss": 0.3368, + "step": 8042 + }, + { + "epoch": 2.582436988280623, + "grad_norm": 0.9018601179122925, + "learning_rate": 2.246144604889042e-05, + "loss": 0.4161, + "step": 8043 + }, + { + "epoch": 2.582758067105474, + "grad_norm": 0.7390075325965881, + "learning_rate": 2.2439407678957812e-05, + "loss": 0.2773, + "step": 8044 + }, + { + "epoch": 2.583079145930326, + "grad_norm": 0.7586901783943176, + "learning_rate": 2.241737875954808e-05, + "loss": 0.2567, + "step": 8045 + }, + { + "epoch": 2.5834002247551773, + "grad_norm": 1.0482172966003418, + "learning_rate": 2.2395359293345396e-05, + "loss": 0.3444, + "step": 8046 + }, + { + "epoch": 2.583721303580029, + "grad_norm": 0.7774665951728821, + "learning_rate": 2.237334928303283e-05, + "loss": 0.2983, + "step": 8047 + }, + { + "epoch": 2.5840423824048804, + "grad_norm": 0.8551390767097473, + "learning_rate": 2.235134873129213e-05, + "loss": 0.325, + "step": 8048 + }, + { + "epoch": 2.584363461229732, + "grad_norm": 0.8655200600624084, + "learning_rate": 2.2329357640804117e-05, + "loss": 0.3297, + "step": 8049 + }, + { + "epoch": 2.5846845400545835, + "grad_norm": 0.7637379765510559, + "learning_rate": 2.2307376014248216e-05, + "loss": 0.2911, + "step": 8050 + }, + { + "epoch": 2.585005618879435, + "grad_norm": 0.9776454567909241, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.3879, + "step": 8051 + }, + { + "epoch": 2.5853266977042866, + "grad_norm": 1.0411674976348877, + "learning_rate": 2.2263441163645403e-05, + "loss": 0.4912, + "step": 8052 + }, + { + "epoch": 2.5856477765291377, + "grad_norm": 1.0440319776535034, + "learning_rate": 2.22414879449518e-05, + "loss": 0.4109, + "step": 8053 + }, + { + "epoch": 2.5859688553539897, + "grad_norm": 0.7270945310592651, + "learning_rate": 2.2219544200897025e-05, + "loss": 0.3516, + "step": 8054 + }, + { + "epoch": 2.586289934178841, + "grad_norm": 0.867685079574585, + "learning_rate": 2.219760993415485e-05, + "loss": 0.3819, + "step": 8055 + }, + { + "epoch": 2.5866110130036923, + "grad_norm": 0.6216245293617249, + "learning_rate": 2.2175685147397906e-05, + "loss": 0.2304, + "step": 8056 + }, + { + "epoch": 2.586932091828544, + "grad_norm": 0.9045018553733826, + "learning_rate": 2.2153769843297667e-05, + "loss": 0.3466, + "step": 8057 + }, + { + "epoch": 2.5872531706533954, + "grad_norm": 1.0591224431991577, + "learning_rate": 2.213186402452443e-05, + "loss": 0.4561, + "step": 8058 + }, + { + "epoch": 2.587574249478247, + "grad_norm": 0.8752357959747314, + "learning_rate": 2.210996769374737e-05, + "loss": 0.3874, + "step": 8059 + }, + { + "epoch": 2.5878953283030985, + "grad_norm": 0.8251414895057678, + "learning_rate": 2.2088080853634473e-05, + "loss": 0.3303, + "step": 8060 + }, + { + "epoch": 2.58821640712795, + "grad_norm": 1.0302598476409912, + "learning_rate": 2.2066203506852566e-05, + "loss": 0.3676, + "step": 8061 + }, + { + "epoch": 2.588537485952801, + "grad_norm": 0.9010094404220581, + "learning_rate": 2.204433565606743e-05, + "loss": 0.3819, + "step": 8062 + }, + { + "epoch": 2.5888585647776527, + "grad_norm": 0.7407838702201843, + "learning_rate": 2.202247730394349e-05, + "loss": 0.2689, + "step": 8063 + }, + { + "epoch": 2.5891796436025043, + "grad_norm": 1.1048128604888916, + "learning_rate": 2.200062845314417e-05, + "loss": 0.3919, + "step": 8064 + }, + { + "epoch": 2.589500722427356, + "grad_norm": 0.9192834496498108, + "learning_rate": 2.1978789106331665e-05, + "loss": 0.2973, + "step": 8065 + }, + { + "epoch": 2.5898218012522074, + "grad_norm": 0.882653534412384, + "learning_rate": 2.195695926616702e-05, + "loss": 0.279, + "step": 8066 + }, + { + "epoch": 2.590142880077059, + "grad_norm": 0.9665889143943787, + "learning_rate": 2.1935138935310206e-05, + "loss": 0.3333, + "step": 8067 + }, + { + "epoch": 2.5904639589019105, + "grad_norm": 0.8618683218955994, + "learning_rate": 2.1913328116419873e-05, + "loss": 0.3014, + "step": 8068 + }, + { + "epoch": 2.590785037726762, + "grad_norm": 0.8841969966888428, + "learning_rate": 2.1891526812153672e-05, + "loss": 0.3502, + "step": 8069 + }, + { + "epoch": 2.5911061165516136, + "grad_norm": 1.0003149509429932, + "learning_rate": 2.1869735025168026e-05, + "loss": 0.333, + "step": 8070 + }, + { + "epoch": 2.5914271953764647, + "grad_norm": 0.9237295985221863, + "learning_rate": 2.1847952758118117e-05, + "loss": 0.3748, + "step": 8071 + }, + { + "epoch": 2.5917482742013163, + "grad_norm": 0.9068896770477295, + "learning_rate": 2.1826180013658172e-05, + "loss": 0.2793, + "step": 8072 + }, + { + "epoch": 2.592069353026168, + "grad_norm": 1.0544371604919434, + "learning_rate": 2.1804416794440995e-05, + "loss": 0.3318, + "step": 8073 + }, + { + "epoch": 2.5923904318510194, + "grad_norm": 1.1038298606872559, + "learning_rate": 2.178266310311847e-05, + "loss": 0.4288, + "step": 8074 + }, + { + "epoch": 2.592711510675871, + "grad_norm": 0.530205488204956, + "learning_rate": 2.1760918942341192e-05, + "loss": 0.2411, + "step": 8075 + }, + { + "epoch": 2.5930325895007225, + "grad_norm": 0.5433615446090698, + "learning_rate": 2.173918431475861e-05, + "loss": 0.2607, + "step": 8076 + }, + { + "epoch": 2.593353668325574, + "grad_norm": 0.8718168139457703, + "learning_rate": 2.1717459223019033e-05, + "loss": 0.3014, + "step": 8077 + }, + { + "epoch": 2.5936747471504256, + "grad_norm": 0.506506085395813, + "learning_rate": 2.1695743669769596e-05, + "loss": 0.268, + "step": 8078 + }, + { + "epoch": 2.593995825975277, + "grad_norm": 0.8781798481941223, + "learning_rate": 2.1674037657656266e-05, + "loss": 0.3089, + "step": 8079 + }, + { + "epoch": 2.594316904800128, + "grad_norm": 0.5046657919883728, + "learning_rate": 2.1652341189323866e-05, + "loss": 0.7222, + "step": 8080 + }, + { + "epoch": 2.5946379836249798, + "grad_norm": 0.40744638442993164, + "learning_rate": 2.163065426741603e-05, + "loss": 0.4146, + "step": 8081 + }, + { + "epoch": 2.5949590624498313, + "grad_norm": 0.49407830834388733, + "learning_rate": 2.160897689457526e-05, + "loss": 0.3484, + "step": 8082 + }, + { + "epoch": 2.595280141274683, + "grad_norm": 0.34928953647613525, + "learning_rate": 2.1587309073442863e-05, + "loss": 0.0671, + "step": 8083 + }, + { + "epoch": 2.5956012200995344, + "grad_norm": 0.22757402062416077, + "learning_rate": 2.1565650806658975e-05, + "loss": 0.0641, + "step": 8084 + }, + { + "epoch": 2.595922298924386, + "grad_norm": 0.3532765805721283, + "learning_rate": 2.154400209686268e-05, + "loss": 0.1308, + "step": 8085 + }, + { + "epoch": 2.5962433777492375, + "grad_norm": 0.7194594144821167, + "learning_rate": 2.1522362946691698e-05, + "loss": 0.347, + "step": 8086 + }, + { + "epoch": 2.596564456574089, + "grad_norm": 0.8896145224571228, + "learning_rate": 2.1500733358782786e-05, + "loss": 0.4766, + "step": 8087 + }, + { + "epoch": 2.5968855353989406, + "grad_norm": 1.069779872894287, + "learning_rate": 2.1479113335771383e-05, + "loss": 0.4661, + "step": 8088 + }, + { + "epoch": 2.5972066142237917, + "grad_norm": 0.7583945989608765, + "learning_rate": 2.1457502880291812e-05, + "loss": 0.3911, + "step": 8089 + }, + { + "epoch": 2.5975276930486433, + "grad_norm": 0.6904090046882629, + "learning_rate": 2.1435901994977326e-05, + "loss": 0.3297, + "step": 8090 + }, + { + "epoch": 2.597848771873495, + "grad_norm": 0.7934258580207825, + "learning_rate": 2.1414310682459802e-05, + "loss": 0.3471, + "step": 8091 + }, + { + "epoch": 2.5981698506983464, + "grad_norm": 1.1645605564117432, + "learning_rate": 2.1392728945370222e-05, + "loss": 0.365, + "step": 8092 + }, + { + "epoch": 2.598490929523198, + "grad_norm": 0.57047039270401, + "learning_rate": 2.137115678633811e-05, + "loss": 0.2229, + "step": 8093 + }, + { + "epoch": 2.5988120083480495, + "grad_norm": 0.8101687431335449, + "learning_rate": 2.1349594207992064e-05, + "loss": 0.3051, + "step": 8094 + }, + { + "epoch": 2.599133087172901, + "grad_norm": 0.785053551197052, + "learning_rate": 2.1328041212959403e-05, + "loss": 0.3114, + "step": 8095 + }, + { + "epoch": 2.5994541659977526, + "grad_norm": 0.7656204104423523, + "learning_rate": 2.1306497803866277e-05, + "loss": 0.3462, + "step": 8096 + }, + { + "epoch": 2.599775244822604, + "grad_norm": 0.861936092376709, + "learning_rate": 2.128496398333768e-05, + "loss": 0.356, + "step": 8097 + }, + { + "epoch": 2.6000963236474552, + "grad_norm": 0.8918510675430298, + "learning_rate": 2.126343975399747e-05, + "loss": 0.3629, + "step": 8098 + }, + { + "epoch": 2.600417402472307, + "grad_norm": 0.8870481848716736, + "learning_rate": 2.1241925118468287e-05, + "loss": 0.3867, + "step": 8099 + }, + { + "epoch": 2.6007384812971583, + "grad_norm": 1.0247498750686646, + "learning_rate": 2.1220420079371628e-05, + "loss": 0.3675, + "step": 8100 + }, + { + "epoch": 2.60105956012201, + "grad_norm": 0.9953125715255737, + "learning_rate": 2.119892463932781e-05, + "loss": 0.3568, + "step": 8101 + }, + { + "epoch": 2.6013806389468614, + "grad_norm": 0.9158168435096741, + "learning_rate": 2.1177438800956007e-05, + "loss": 0.3369, + "step": 8102 + }, + { + "epoch": 2.601701717771713, + "grad_norm": 0.8674771785736084, + "learning_rate": 2.115596256687419e-05, + "loss": 0.3141, + "step": 8103 + }, + { + "epoch": 2.6020227965965645, + "grad_norm": 0.6369979381561279, + "learning_rate": 2.113449593969915e-05, + "loss": 0.2762, + "step": 8104 + }, + { + "epoch": 2.602343875421416, + "grad_norm": 0.8605027794837952, + "learning_rate": 2.1113038922046602e-05, + "loss": 0.3994, + "step": 8105 + }, + { + "epoch": 2.6026649542462676, + "grad_norm": 1.133817434310913, + "learning_rate": 2.1091591516530952e-05, + "loss": 0.3732, + "step": 8106 + }, + { + "epoch": 2.6029860330711188, + "grad_norm": 0.7040675282478333, + "learning_rate": 2.107015372576552e-05, + "loss": 0.3186, + "step": 8107 + }, + { + "epoch": 2.6033071118959703, + "grad_norm": 0.7996187210083008, + "learning_rate": 2.1048725552362435e-05, + "loss": 0.3545, + "step": 8108 + }, + { + "epoch": 2.603628190720822, + "grad_norm": 1.0517889261245728, + "learning_rate": 2.1027306998932627e-05, + "loss": 0.3864, + "step": 8109 + }, + { + "epoch": 2.6039492695456734, + "grad_norm": 0.5958194136619568, + "learning_rate": 2.100589806808597e-05, + "loss": 0.2543, + "step": 8110 + }, + { + "epoch": 2.604270348370525, + "grad_norm": 0.7594183683395386, + "learning_rate": 2.098449876243096e-05, + "loss": 0.328, + "step": 8111 + }, + { + "epoch": 2.6045914271953765, + "grad_norm": 1.2153469324111938, + "learning_rate": 2.096310908457513e-05, + "loss": 0.3756, + "step": 8112 + }, + { + "epoch": 2.604912506020228, + "grad_norm": 0.8676589727401733, + "learning_rate": 2.09417290371247e-05, + "loss": 0.3697, + "step": 8113 + }, + { + "epoch": 2.6052335848450796, + "grad_norm": 0.792874813079834, + "learning_rate": 2.0920358622684788e-05, + "loss": 0.379, + "step": 8114 + }, + { + "epoch": 2.605554663669931, + "grad_norm": 0.6348553895950317, + "learning_rate": 2.0898997843859334e-05, + "loss": 0.2493, + "step": 8115 + }, + { + "epoch": 2.6058757424947823, + "grad_norm": 1.1245969533920288, + "learning_rate": 2.0877646703250996e-05, + "loss": 0.3336, + "step": 8116 + }, + { + "epoch": 2.606196821319634, + "grad_norm": 0.794979453086853, + "learning_rate": 2.0856305203461436e-05, + "loss": 0.3597, + "step": 8117 + }, + { + "epoch": 2.6065179001444854, + "grad_norm": 0.9265166521072388, + "learning_rate": 2.0834973347091014e-05, + "loss": 0.336, + "step": 8118 + }, + { + "epoch": 2.606838978969337, + "grad_norm": 0.7240932583808899, + "learning_rate": 2.0813651136738954e-05, + "loss": 0.284, + "step": 8119 + }, + { + "epoch": 2.6071600577941885, + "grad_norm": 0.5318633913993835, + "learning_rate": 2.07923385750033e-05, + "loss": 0.2458, + "step": 8120 + }, + { + "epoch": 2.60748113661904, + "grad_norm": 0.6563091278076172, + "learning_rate": 2.0771035664480942e-05, + "loss": 0.2628, + "step": 8121 + }, + { + "epoch": 2.6078022154438916, + "grad_norm": 0.9432152509689331, + "learning_rate": 2.0749742407767546e-05, + "loss": 0.3554, + "step": 8122 + }, + { + "epoch": 2.608123294268743, + "grad_norm": 1.121485948562622, + "learning_rate": 2.0728458807457662e-05, + "loss": 0.3306, + "step": 8123 + }, + { + "epoch": 2.6084443730935947, + "grad_norm": 0.8368777632713318, + "learning_rate": 2.0707184866144603e-05, + "loss": 0.2796, + "step": 8124 + }, + { + "epoch": 2.6087654519184458, + "grad_norm": 0.9011041522026062, + "learning_rate": 2.068592058642056e-05, + "loss": 0.3304, + "step": 8125 + }, + { + "epoch": 2.6090865307432973, + "grad_norm": 1.134626865386963, + "learning_rate": 2.0664665970876496e-05, + "loss": 0.2825, + "step": 8126 + }, + { + "epoch": 2.609407609568149, + "grad_norm": 0.7329688668251038, + "learning_rate": 2.0643421022102217e-05, + "loss": 0.3027, + "step": 8127 + }, + { + "epoch": 2.6097286883930004, + "grad_norm": 0.5429527759552002, + "learning_rate": 2.0622185742686416e-05, + "loss": 0.3091, + "step": 8128 + }, + { + "epoch": 2.610049767217852, + "grad_norm": 0.6699220538139343, + "learning_rate": 2.0600960135216462e-05, + "loss": 0.296, + "step": 8129 + }, + { + "epoch": 2.6103708460427035, + "grad_norm": 0.5311281681060791, + "learning_rate": 2.0579744202278718e-05, + "loss": 0.7533, + "step": 8130 + }, + { + "epoch": 2.610691924867555, + "grad_norm": 0.5676338076591492, + "learning_rate": 2.0558537946458177e-05, + "loss": 0.8681, + "step": 8131 + }, + { + "epoch": 2.6110130036924066, + "grad_norm": 0.38879385590553284, + "learning_rate": 2.0537341370338857e-05, + "loss": 0.1332, + "step": 8132 + }, + { + "epoch": 2.611334082517258, + "grad_norm": 0.28156155347824097, + "learning_rate": 2.051615447650347e-05, + "loss": 0.1392, + "step": 8133 + }, + { + "epoch": 2.6116551613421093, + "grad_norm": 0.38800543546676636, + "learning_rate": 2.049497726753351e-05, + "loss": 0.156, + "step": 8134 + }, + { + "epoch": 2.611976240166961, + "grad_norm": 0.2520560026168823, + "learning_rate": 2.0473809746009444e-05, + "loss": 0.0688, + "step": 8135 + }, + { + "epoch": 2.6122973189918124, + "grad_norm": 0.7137389183044434, + "learning_rate": 2.045265191451041e-05, + "loss": 0.3956, + "step": 8136 + }, + { + "epoch": 2.612618397816664, + "grad_norm": 0.8318041563034058, + "learning_rate": 2.0431503775614457e-05, + "loss": 0.2772, + "step": 8137 + }, + { + "epoch": 2.6129394766415155, + "grad_norm": 0.7499494552612305, + "learning_rate": 2.0410365331898416e-05, + "loss": 0.3192, + "step": 8138 + }, + { + "epoch": 2.613260555466367, + "grad_norm": 0.8148148059844971, + "learning_rate": 2.0389236585937945e-05, + "loss": 0.3618, + "step": 8139 + }, + { + "epoch": 2.6135816342912186, + "grad_norm": 1.1199220418930054, + "learning_rate": 2.0368117540307496e-05, + "loss": 0.3034, + "step": 8140 + }, + { + "epoch": 2.61390271311607, + "grad_norm": 0.5880412459373474, + "learning_rate": 2.0347008197580374e-05, + "loss": 0.2263, + "step": 8141 + }, + { + "epoch": 2.6142237919409217, + "grad_norm": 0.8377451300621033, + "learning_rate": 2.03259085603287e-05, + "loss": 0.4124, + "step": 8142 + }, + { + "epoch": 2.614544870765773, + "grad_norm": 0.7712952494621277, + "learning_rate": 2.030481863112339e-05, + "loss": 0.2779, + "step": 8143 + }, + { + "epoch": 2.6148659495906243, + "grad_norm": 0.8210815787315369, + "learning_rate": 2.028373841253419e-05, + "loss": 0.3206, + "step": 8144 + }, + { + "epoch": 2.615187028415476, + "grad_norm": 1.114625334739685, + "learning_rate": 2.026266790712965e-05, + "loss": 0.4125, + "step": 8145 + }, + { + "epoch": 2.6155081072403275, + "grad_norm": 1.0668013095855713, + "learning_rate": 2.024160711747717e-05, + "loss": 0.4166, + "step": 8146 + }, + { + "epoch": 2.615829186065179, + "grad_norm": 1.219161868095398, + "learning_rate": 2.0220556046142893e-05, + "loss": 0.4567, + "step": 8147 + }, + { + "epoch": 2.6161502648900306, + "grad_norm": 0.993944525718689, + "learning_rate": 2.019951469569191e-05, + "loss": 0.3649, + "step": 8148 + }, + { + "epoch": 2.616471343714882, + "grad_norm": 0.8392919898033142, + "learning_rate": 2.017848306868797e-05, + "loss": 0.2713, + "step": 8149 + }, + { + "epoch": 2.6167924225397337, + "grad_norm": 0.9957613945007324, + "learning_rate": 2.0157461167693758e-05, + "loss": 0.3948, + "step": 8150 + }, + { + "epoch": 2.617113501364585, + "grad_norm": 0.8635070323944092, + "learning_rate": 2.013644899527074e-05, + "loss": 0.3869, + "step": 8151 + }, + { + "epoch": 2.6174345801894363, + "grad_norm": 1.053145170211792, + "learning_rate": 2.01154465539791e-05, + "loss": 0.3391, + "step": 8152 + }, + { + "epoch": 2.617755659014288, + "grad_norm": 0.6855107545852661, + "learning_rate": 2.009445384637805e-05, + "loss": 0.2861, + "step": 8153 + }, + { + "epoch": 2.6180767378391394, + "grad_norm": 1.1673146486282349, + "learning_rate": 2.0073470875025358e-05, + "loss": 0.3544, + "step": 8154 + }, + { + "epoch": 2.618397816663991, + "grad_norm": 0.7385457158088684, + "learning_rate": 2.005249764247783e-05, + "loss": 0.2765, + "step": 8155 + }, + { + "epoch": 2.6187188954888425, + "grad_norm": 0.721240222454071, + "learning_rate": 2.0031534151290943e-05, + "loss": 0.2583, + "step": 8156 + }, + { + "epoch": 2.619039974313694, + "grad_norm": 0.8315588235855103, + "learning_rate": 2.0010580404019062e-05, + "loss": 0.3844, + "step": 8157 + }, + { + "epoch": 2.6193610531385456, + "grad_norm": 0.9637236595153809, + "learning_rate": 1.9989636403215328e-05, + "loss": 0.3219, + "step": 8158 + }, + { + "epoch": 2.619682131963397, + "grad_norm": 0.7627900242805481, + "learning_rate": 1.9968702151431696e-05, + "loss": 0.2788, + "step": 8159 + }, + { + "epoch": 2.6200032107882487, + "grad_norm": 0.6353088617324829, + "learning_rate": 1.9947777651218946e-05, + "loss": 0.256, + "step": 8160 + }, + { + "epoch": 2.6203242896131, + "grad_norm": 1.0363852977752686, + "learning_rate": 1.9926862905126665e-05, + "loss": 0.451, + "step": 8161 + }, + { + "epoch": 2.6206453684379514, + "grad_norm": 0.863524854183197, + "learning_rate": 1.9905957915703244e-05, + "loss": 0.3374, + "step": 8162 + }, + { + "epoch": 2.620966447262803, + "grad_norm": 0.8532664775848389, + "learning_rate": 1.9885062685495904e-05, + "loss": 0.3011, + "step": 8163 + }, + { + "epoch": 2.6212875260876545, + "grad_norm": 0.8648016452789307, + "learning_rate": 1.9864177217050674e-05, + "loss": 0.3211, + "step": 8164 + }, + { + "epoch": 2.621608604912506, + "grad_norm": 0.7424303889274597, + "learning_rate": 1.9843301512912327e-05, + "loss": 0.3291, + "step": 8165 + }, + { + "epoch": 2.6219296837373576, + "grad_norm": 0.7464106678962708, + "learning_rate": 1.9822435575624608e-05, + "loss": 0.3156, + "step": 8166 + }, + { + "epoch": 2.622250762562209, + "grad_norm": 1.0930415391921997, + "learning_rate": 1.9801579407729866e-05, + "loss": 0.5009, + "step": 8167 + }, + { + "epoch": 2.6225718413870607, + "grad_norm": 0.78467857837677, + "learning_rate": 1.9780733011769447e-05, + "loss": 0.2866, + "step": 8168 + }, + { + "epoch": 2.6228929202119122, + "grad_norm": 0.7698622345924377, + "learning_rate": 1.9759896390283362e-05, + "loss": 0.3075, + "step": 8169 + }, + { + "epoch": 2.6232139990367633, + "grad_norm": 0.5596573352813721, + "learning_rate": 1.9739069545810485e-05, + "loss": 0.2454, + "step": 8170 + }, + { + "epoch": 2.623535077861615, + "grad_norm": 0.7981199622154236, + "learning_rate": 1.9718252480888566e-05, + "loss": 0.2827, + "step": 8171 + }, + { + "epoch": 2.6238561566864664, + "grad_norm": 0.49611830711364746, + "learning_rate": 1.969744519805402e-05, + "loss": 0.2539, + "step": 8172 + }, + { + "epoch": 2.624177235511318, + "grad_norm": 0.9452134966850281, + "learning_rate": 1.9676647699842242e-05, + "loss": 0.3056, + "step": 8173 + }, + { + "epoch": 2.6244983143361695, + "grad_norm": 0.8190011382102966, + "learning_rate": 1.965585998878724e-05, + "loss": 0.304, + "step": 8174 + }, + { + "epoch": 2.624819393161021, + "grad_norm": 0.6110972762107849, + "learning_rate": 1.963508206742202e-05, + "loss": 0.2405, + "step": 8175 + }, + { + "epoch": 2.6251404719858726, + "grad_norm": 0.8863208293914795, + "learning_rate": 1.9614313938278272e-05, + "loss": 0.3248, + "step": 8176 + }, + { + "epoch": 2.625461550810724, + "grad_norm": 0.5325760841369629, + "learning_rate": 1.9593555603886538e-05, + "loss": 0.2599, + "step": 8177 + }, + { + "epoch": 2.6257826296355757, + "grad_norm": 0.9411792755126953, + "learning_rate": 1.9572807066776143e-05, + "loss": 0.3166, + "step": 8178 + }, + { + "epoch": 2.626103708460427, + "grad_norm": 0.5804592370986938, + "learning_rate": 1.955206832947526e-05, + "loss": 0.2814, + "step": 8179 + }, + { + "epoch": 2.6264247872852784, + "grad_norm": 0.5420529842376709, + "learning_rate": 1.9531339394510827e-05, + "loss": 0.7411, + "step": 8180 + }, + { + "epoch": 2.62674586611013, + "grad_norm": 0.46422919631004333, + "learning_rate": 1.9510620264408596e-05, + "loss": 0.6111, + "step": 8181 + }, + { + "epoch": 2.6270669449349815, + "grad_norm": 0.3627856969833374, + "learning_rate": 1.9489910941693133e-05, + "loss": 0.2278, + "step": 8182 + }, + { + "epoch": 2.627388023759833, + "grad_norm": 0.4854031503200531, + "learning_rate": 1.946921142888781e-05, + "loss": 0.1976, + "step": 8183 + }, + { + "epoch": 2.6277091025846846, + "grad_norm": 0.4990745186805725, + "learning_rate": 1.9448521728514802e-05, + "loss": 0.2803, + "step": 8184 + }, + { + "epoch": 2.628030181409536, + "grad_norm": 0.2989073395729065, + "learning_rate": 1.9427841843095063e-05, + "loss": 0.0711, + "step": 8185 + }, + { + "epoch": 2.6283512602343877, + "grad_norm": 0.5125943422317505, + "learning_rate": 1.9407171775148436e-05, + "loss": 0.2092, + "step": 8186 + }, + { + "epoch": 2.6286723390592392, + "grad_norm": 0.8719359040260315, + "learning_rate": 1.938651152719344e-05, + "loss": 0.4531, + "step": 8187 + }, + { + "epoch": 2.6289934178840904, + "grad_norm": 0.7034810781478882, + "learning_rate": 1.9365861101747485e-05, + "loss": 0.3558, + "step": 8188 + }, + { + "epoch": 2.629314496708942, + "grad_norm": 0.9010298848152161, + "learning_rate": 1.9345220501326777e-05, + "loss": 0.4044, + "step": 8189 + }, + { + "epoch": 2.6296355755337935, + "grad_norm": 0.8850868344306946, + "learning_rate": 1.9324589728446262e-05, + "loss": 0.3323, + "step": 8190 + }, + { + "epoch": 2.629956654358645, + "grad_norm": 0.8857529163360596, + "learning_rate": 1.930396878561983e-05, + "loss": 0.3986, + "step": 8191 + }, + { + "epoch": 2.6302777331834966, + "grad_norm": 0.7102099061012268, + "learning_rate": 1.928335767535997e-05, + "loss": 0.254, + "step": 8192 + }, + { + "epoch": 2.630598812008348, + "grad_norm": 0.7003388404846191, + "learning_rate": 1.9262756400178162e-05, + "loss": 0.314, + "step": 8193 + }, + { + "epoch": 2.6309198908331997, + "grad_norm": 0.9088608026504517, + "learning_rate": 1.9242164962584618e-05, + "loss": 0.3336, + "step": 8194 + }, + { + "epoch": 2.631240969658051, + "grad_norm": 0.8129791617393494, + "learning_rate": 1.922158336508825e-05, + "loss": 0.3411, + "step": 8195 + }, + { + "epoch": 2.6315620484829028, + "grad_norm": 0.9310645461082458, + "learning_rate": 1.9201011610196973e-05, + "loss": 0.4063, + "step": 8196 + }, + { + "epoch": 2.631883127307754, + "grad_norm": 1.2737817764282227, + "learning_rate": 1.918044970041729e-05, + "loss": 0.4643, + "step": 8197 + }, + { + "epoch": 2.6322042061326054, + "grad_norm": 0.9532968997955322, + "learning_rate": 1.91598976382547e-05, + "loss": 0.4322, + "step": 8198 + }, + { + "epoch": 2.632525284957457, + "grad_norm": 0.7169177532196045, + "learning_rate": 1.9139355426213347e-05, + "loss": 0.2881, + "step": 8199 + }, + { + "epoch": 2.6328463637823085, + "grad_norm": 1.0440733432769775, + "learning_rate": 1.9118823066796276e-05, + "loss": 0.4381, + "step": 8200 + }, + { + "epoch": 2.63316744260716, + "grad_norm": 0.9224132299423218, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.3855, + "step": 8201 + }, + { + "epoch": 2.6334885214320116, + "grad_norm": 0.7816334962844849, + "learning_rate": 1.9077787915840927e-05, + "loss": 0.3378, + "step": 8202 + }, + { + "epoch": 2.633809600256863, + "grad_norm": 1.13314688205719, + "learning_rate": 1.9057285129302683e-05, + "loss": 0.4, + "step": 8203 + }, + { + "epoch": 2.6341306790817147, + "grad_norm": 0.848976731300354, + "learning_rate": 1.903679220538871e-05, + "loss": 0.3194, + "step": 8204 + }, + { + "epoch": 2.6344517579065663, + "grad_norm": 1.028289556503296, + "learning_rate": 1.9016309146596023e-05, + "loss": 0.4086, + "step": 8205 + }, + { + "epoch": 2.6347728367314174, + "grad_norm": 1.0954947471618652, + "learning_rate": 1.8995835955420416e-05, + "loss": 0.4058, + "step": 8206 + }, + { + "epoch": 2.635093915556269, + "grad_norm": 1.0540190935134888, + "learning_rate": 1.897537263435648e-05, + "loss": 0.4228, + "step": 8207 + }, + { + "epoch": 2.6354149943811205, + "grad_norm": 1.1869220733642578, + "learning_rate": 1.895491918589759e-05, + "loss": 0.3598, + "step": 8208 + }, + { + "epoch": 2.635736073205972, + "grad_norm": 1.1555882692337036, + "learning_rate": 1.8934475612536017e-05, + "loss": 0.4349, + "step": 8209 + }, + { + "epoch": 2.6360571520308236, + "grad_norm": 0.7538331151008606, + "learning_rate": 1.891404191676265e-05, + "loss": 0.3082, + "step": 8210 + }, + { + "epoch": 2.636378230855675, + "grad_norm": 1.011115312576294, + "learning_rate": 1.8893618101067355e-05, + "loss": 0.2848, + "step": 8211 + }, + { + "epoch": 2.6366993096805267, + "grad_norm": 0.9371142983436584, + "learning_rate": 1.8873204167938653e-05, + "loss": 0.3994, + "step": 8212 + }, + { + "epoch": 2.6370203885053782, + "grad_norm": 0.7361756563186646, + "learning_rate": 1.885280011986391e-05, + "loss": 0.3152, + "step": 8213 + }, + { + "epoch": 2.63734146733023, + "grad_norm": 1.0889180898666382, + "learning_rate": 1.883240595932938e-05, + "loss": 0.3764, + "step": 8214 + }, + { + "epoch": 2.637662546155081, + "grad_norm": 0.6203745603561401, + "learning_rate": 1.8812021688819915e-05, + "loss": 0.2491, + "step": 8215 + }, + { + "epoch": 2.6379836249799324, + "grad_norm": 0.7448567748069763, + "learning_rate": 1.879164731081937e-05, + "loss": 0.282, + "step": 8216 + }, + { + "epoch": 2.638304703804784, + "grad_norm": 0.9264616966247559, + "learning_rate": 1.877128282781028e-05, + "loss": 0.2982, + "step": 8217 + }, + { + "epoch": 2.6386257826296355, + "grad_norm": 0.6099106073379517, + "learning_rate": 1.8750928242273968e-05, + "loss": 0.2363, + "step": 8218 + }, + { + "epoch": 2.638946861454487, + "grad_norm": 0.859263002872467, + "learning_rate": 1.8730583556690605e-05, + "loss": 0.3094, + "step": 8219 + }, + { + "epoch": 2.6392679402793386, + "grad_norm": 0.5748609304428101, + "learning_rate": 1.8710248773539118e-05, + "loss": 0.2404, + "step": 8220 + }, + { + "epoch": 2.63958901910419, + "grad_norm": 0.8924229741096497, + "learning_rate": 1.8689923895297245e-05, + "loss": 0.3191, + "step": 8221 + }, + { + "epoch": 2.6399100979290417, + "grad_norm": 0.8334121108055115, + "learning_rate": 1.8669608924441496e-05, + "loss": 0.3053, + "step": 8222 + }, + { + "epoch": 2.6402311767538933, + "grad_norm": 0.9076601266860962, + "learning_rate": 1.8649303863447198e-05, + "loss": 0.3409, + "step": 8223 + }, + { + "epoch": 2.6405522555787444, + "grad_norm": 0.6396355032920837, + "learning_rate": 1.8629008714788464e-05, + "loss": 0.3092, + "step": 8224 + }, + { + "epoch": 2.640873334403596, + "grad_norm": 0.7007104754447937, + "learning_rate": 1.8608723480938206e-05, + "loss": 0.3039, + "step": 8225 + }, + { + "epoch": 2.6411944132284475, + "grad_norm": 0.4519537389278412, + "learning_rate": 1.858844816436809e-05, + "loss": 0.2598, + "step": 8226 + }, + { + "epoch": 2.641515492053299, + "grad_norm": 0.609611988067627, + "learning_rate": 1.8568182767548626e-05, + "loss": 0.2982, + "step": 8227 + }, + { + "epoch": 2.6418365708781506, + "grad_norm": 0.35319435596466064, + "learning_rate": 1.854792729294905e-05, + "loss": 0.2584, + "step": 8228 + }, + { + "epoch": 2.642157649703002, + "grad_norm": 0.5530852675437927, + "learning_rate": 1.852768174303752e-05, + "loss": 0.3132, + "step": 8229 + }, + { + "epoch": 2.6424787285278537, + "grad_norm": 0.6832270622253418, + "learning_rate": 1.8507446120280814e-05, + "loss": 0.9194, + "step": 8230 + }, + { + "epoch": 2.6427998073527053, + "grad_norm": 0.36571961641311646, + "learning_rate": 1.848722042714457e-05, + "loss": 0.2325, + "step": 8231 + }, + { + "epoch": 2.643120886177557, + "grad_norm": 0.3756738305091858, + "learning_rate": 1.8467004666093325e-05, + "loss": 0.1744, + "step": 8232 + }, + { + "epoch": 2.643441965002408, + "grad_norm": 0.3277255594730377, + "learning_rate": 1.8446798839590186e-05, + "loss": 0.1241, + "step": 8233 + }, + { + "epoch": 2.6437630438272595, + "grad_norm": 0.264514297246933, + "learning_rate": 1.8426602950097283e-05, + "loss": 0.1122, + "step": 8234 + }, + { + "epoch": 2.644084122652111, + "grad_norm": 0.20338915288448334, + "learning_rate": 1.8406417000075325e-05, + "loss": 0.0738, + "step": 8235 + }, + { + "epoch": 2.6444052014769626, + "grad_norm": 0.8854188323020935, + "learning_rate": 1.838624099198397e-05, + "loss": 0.4599, + "step": 8236 + }, + { + "epoch": 2.644726280301814, + "grad_norm": 0.7100738883018494, + "learning_rate": 1.8366074928281607e-05, + "loss": 0.2707, + "step": 8237 + }, + { + "epoch": 2.6450473591266657, + "grad_norm": 0.9927451014518738, + "learning_rate": 1.834591881142538e-05, + "loss": 0.3854, + "step": 8238 + }, + { + "epoch": 2.645368437951517, + "grad_norm": 0.8780126571655273, + "learning_rate": 1.8325772643871265e-05, + "loss": 0.3619, + "step": 8239 + }, + { + "epoch": 2.6456895167763683, + "grad_norm": 0.7575050592422485, + "learning_rate": 1.8305636428074014e-05, + "loss": 0.3066, + "step": 8240 + }, + { + "epoch": 2.6460105956012203, + "grad_norm": 0.599746584892273, + "learning_rate": 1.8285510166487152e-05, + "loss": 0.2652, + "step": 8241 + }, + { + "epoch": 2.6463316744260714, + "grad_norm": 0.9272753596305847, + "learning_rate": 1.826539386156302e-05, + "loss": 0.4376, + "step": 8242 + }, + { + "epoch": 2.646652753250923, + "grad_norm": 0.7078785300254822, + "learning_rate": 1.8245287515752708e-05, + "loss": 0.2626, + "step": 8243 + }, + { + "epoch": 2.6469738320757745, + "grad_norm": 0.8110345602035522, + "learning_rate": 1.8225191131506126e-05, + "loss": 0.2777, + "step": 8244 + }, + { + "epoch": 2.647294910900626, + "grad_norm": 0.7809237241744995, + "learning_rate": 1.8205104711271957e-05, + "loss": 0.3272, + "step": 8245 + }, + { + "epoch": 2.6476159897254776, + "grad_norm": 0.7376668453216553, + "learning_rate": 1.818502825749764e-05, + "loss": 0.3092, + "step": 8246 + }, + { + "epoch": 2.647937068550329, + "grad_norm": 0.645916223526001, + "learning_rate": 1.816496177262952e-05, + "loss": 0.2277, + "step": 8247 + }, + { + "epoch": 2.6482581473751807, + "grad_norm": 1.4311972856521606, + "learning_rate": 1.8144905259112542e-05, + "loss": 0.4121, + "step": 8248 + }, + { + "epoch": 2.648579226200032, + "grad_norm": 0.8135798573493958, + "learning_rate": 1.812485871939056e-05, + "loss": 0.3105, + "step": 8249 + }, + { + "epoch": 2.648900305024884, + "grad_norm": 0.9382086992263794, + "learning_rate": 1.81048221559062e-05, + "loss": 0.3712, + "step": 8250 + }, + { + "epoch": 2.649221383849735, + "grad_norm": 0.9404576420783997, + "learning_rate": 1.808479557110081e-05, + "loss": 0.3391, + "step": 8251 + }, + { + "epoch": 2.6495424626745865, + "grad_norm": 1.0706158876419067, + "learning_rate": 1.8064778967414662e-05, + "loss": 0.433, + "step": 8252 + }, + { + "epoch": 2.649863541499438, + "grad_norm": 0.7485371828079224, + "learning_rate": 1.804477234728661e-05, + "loss": 0.2812, + "step": 8253 + }, + { + "epoch": 2.6501846203242896, + "grad_norm": 1.0219621658325195, + "learning_rate": 1.8024775713154473e-05, + "loss": 0.4084, + "step": 8254 + }, + { + "epoch": 2.650505699149141, + "grad_norm": 0.861611545085907, + "learning_rate": 1.8004789067454764e-05, + "loss": 0.3148, + "step": 8255 + }, + { + "epoch": 2.6508267779739927, + "grad_norm": 1.0790586471557617, + "learning_rate": 1.7984812412622787e-05, + "loss": 0.4146, + "step": 8256 + }, + { + "epoch": 2.6511478567988442, + "grad_norm": 0.6884385347366333, + "learning_rate": 1.7964845751092664e-05, + "loss": 0.2809, + "step": 8257 + }, + { + "epoch": 2.6514689356236953, + "grad_norm": 1.2098215818405151, + "learning_rate": 1.794488908529719e-05, + "loss": 0.4282, + "step": 8258 + }, + { + "epoch": 2.6517900144485473, + "grad_norm": 0.5315207242965698, + "learning_rate": 1.792494241766811e-05, + "loss": 0.2248, + "step": 8259 + }, + { + "epoch": 2.6521110932733984, + "grad_norm": 1.038537859916687, + "learning_rate": 1.790500575063584e-05, + "loss": 0.4178, + "step": 8260 + }, + { + "epoch": 2.65243217209825, + "grad_norm": 0.7643697261810303, + "learning_rate": 1.78850790866296e-05, + "loss": 0.2673, + "step": 8261 + }, + { + "epoch": 2.6527532509231015, + "grad_norm": 0.6102786064147949, + "learning_rate": 1.7865162428077386e-05, + "loss": 0.2725, + "step": 8262 + }, + { + "epoch": 2.653074329747953, + "grad_norm": 0.9418346285820007, + "learning_rate": 1.7845255777406e-05, + "loss": 0.3239, + "step": 8263 + }, + { + "epoch": 2.6533954085728046, + "grad_norm": 0.7261042594909668, + "learning_rate": 1.7825359137040988e-05, + "loss": 0.2828, + "step": 8264 + }, + { + "epoch": 2.653716487397656, + "grad_norm": 0.9733819961547852, + "learning_rate": 1.7805472509406696e-05, + "loss": 0.3408, + "step": 8265 + }, + { + "epoch": 2.6540375662225077, + "grad_norm": 0.5591874718666077, + "learning_rate": 1.7785595896926265e-05, + "loss": 0.2455, + "step": 8266 + }, + { + "epoch": 2.654358645047359, + "grad_norm": 0.7229105234146118, + "learning_rate": 1.7765729302021596e-05, + "loss": 0.2537, + "step": 8267 + }, + { + "epoch": 2.654679723872211, + "grad_norm": 0.6605117917060852, + "learning_rate": 1.7745872727113356e-05, + "loss": 0.2885, + "step": 8268 + }, + { + "epoch": 2.655000802697062, + "grad_norm": 0.5864028334617615, + "learning_rate": 1.7726026174621003e-05, + "loss": 0.2461, + "step": 8269 + }, + { + "epoch": 2.6553218815219135, + "grad_norm": 1.8025933504104614, + "learning_rate": 1.7706189646962847e-05, + "loss": 0.3684, + "step": 8270 + }, + { + "epoch": 2.655642960346765, + "grad_norm": 0.835909903049469, + "learning_rate": 1.7686363146555805e-05, + "loss": 0.3308, + "step": 8271 + }, + { + "epoch": 2.6559640391716166, + "grad_norm": 0.9109052419662476, + "learning_rate": 1.7666546675815778e-05, + "loss": 0.3649, + "step": 8272 + }, + { + "epoch": 2.656285117996468, + "grad_norm": 0.5964946150779724, + "learning_rate": 1.7646740237157256e-05, + "loss": 0.2541, + "step": 8273 + }, + { + "epoch": 2.6566061968213197, + "grad_norm": 0.6849761605262756, + "learning_rate": 1.7626943832993647e-05, + "loss": 0.29, + "step": 8274 + }, + { + "epoch": 2.6569272756461713, + "grad_norm": 0.5830851197242737, + "learning_rate": 1.760715746573709e-05, + "loss": 0.2506, + "step": 8275 + }, + { + "epoch": 2.6572483544710224, + "grad_norm": 0.5915994644165039, + "learning_rate": 1.7587381137798432e-05, + "loss": 0.299, + "step": 8276 + }, + { + "epoch": 2.6575694332958744, + "grad_norm": 0.5111377835273743, + "learning_rate": 1.7567614851587443e-05, + "loss": 0.2853, + "step": 8277 + }, + { + "epoch": 2.6578905121207255, + "grad_norm": 0.5485566854476929, + "learning_rate": 1.754785860951249e-05, + "loss": 0.2842, + "step": 8278 + }, + { + "epoch": 2.658211590945577, + "grad_norm": 0.38606464862823486, + "learning_rate": 1.752811241398089e-05, + "loss": 0.2609, + "step": 8279 + }, + { + "epoch": 2.6585326697704286, + "grad_norm": 0.36825671792030334, + "learning_rate": 1.750837626739863e-05, + "loss": 0.4696, + "step": 8280 + }, + { + "epoch": 2.65885374859528, + "grad_norm": 0.49861788749694824, + "learning_rate": 1.7488650172170496e-05, + "loss": 0.4966, + "step": 8281 + }, + { + "epoch": 2.6591748274201317, + "grad_norm": 0.3955051004886627, + "learning_rate": 1.7468934130700044e-05, + "loss": 0.1671, + "step": 8282 + }, + { + "epoch": 2.659495906244983, + "grad_norm": 0.30318066477775574, + "learning_rate": 1.744922814538964e-05, + "loss": 0.1232, + "step": 8283 + }, + { + "epoch": 2.6598169850698348, + "grad_norm": 0.5016526579856873, + "learning_rate": 1.7429532218640377e-05, + "loss": 0.1385, + "step": 8284 + }, + { + "epoch": 2.660138063894686, + "grad_norm": 0.33723151683807373, + "learning_rate": 1.7409846352852143e-05, + "loss": 0.122, + "step": 8285 + }, + { + "epoch": 2.660459142719538, + "grad_norm": 0.7565352916717529, + "learning_rate": 1.7390170550423625e-05, + "loss": 0.3913, + "step": 8286 + }, + { + "epoch": 2.660780221544389, + "grad_norm": 0.857373833656311, + "learning_rate": 1.737050481375223e-05, + "loss": 0.4855, + "step": 8287 + }, + { + "epoch": 2.6611013003692405, + "grad_norm": 0.7241206765174866, + "learning_rate": 1.7350849145234184e-05, + "loss": 0.281, + "step": 8288 + }, + { + "epoch": 2.661422379194092, + "grad_norm": 0.822955310344696, + "learning_rate": 1.7331203547264453e-05, + "loss": 0.3799, + "step": 8289 + }, + { + "epoch": 2.6617434580189436, + "grad_norm": 0.9299210906028748, + "learning_rate": 1.7311568022236845e-05, + "loss": 0.3482, + "step": 8290 + }, + { + "epoch": 2.662064536843795, + "grad_norm": 0.68746417760849, + "learning_rate": 1.7291942572543807e-05, + "loss": 0.293, + "step": 8291 + }, + { + "epoch": 2.6623856156686467, + "grad_norm": 0.8198730945587158, + "learning_rate": 1.7272327200576742e-05, + "loss": 0.3485, + "step": 8292 + }, + { + "epoch": 2.6627066944934983, + "grad_norm": 1.226529598236084, + "learning_rate": 1.7252721908725632e-05, + "loss": 0.3973, + "step": 8293 + }, + { + "epoch": 2.6630277733183494, + "grad_norm": 0.7077026963233948, + "learning_rate": 1.7233126699379343e-05, + "loss": 0.2794, + "step": 8294 + }, + { + "epoch": 2.6633488521432014, + "grad_norm": 0.8905551433563232, + "learning_rate": 1.721354157492555e-05, + "loss": 0.3946, + "step": 8295 + }, + { + "epoch": 2.6636699309680525, + "grad_norm": 0.9082470536231995, + "learning_rate": 1.719396653775056e-05, + "loss": 0.3648, + "step": 8296 + }, + { + "epoch": 2.663991009792904, + "grad_norm": 0.6751482486724854, + "learning_rate": 1.7174401590239587e-05, + "loss": 0.2582, + "step": 8297 + }, + { + "epoch": 2.6643120886177556, + "grad_norm": 0.916202187538147, + "learning_rate": 1.715484673477654e-05, + "loss": 0.3537, + "step": 8298 + }, + { + "epoch": 2.664633167442607, + "grad_norm": 1.0404547452926636, + "learning_rate": 1.7135301973744124e-05, + "loss": 0.3866, + "step": 8299 + }, + { + "epoch": 2.6649542462674587, + "grad_norm": 0.7834314107894897, + "learning_rate": 1.7115767309523812e-05, + "loss": 0.3115, + "step": 8300 + }, + { + "epoch": 2.6652753250923102, + "grad_norm": 0.8973174691200256, + "learning_rate": 1.7096242744495837e-05, + "loss": 0.2688, + "step": 8301 + }, + { + "epoch": 2.665596403917162, + "grad_norm": 1.4204421043395996, + "learning_rate": 1.70767282810392e-05, + "loss": 0.5071, + "step": 8302 + }, + { + "epoch": 2.665917482742013, + "grad_norm": 1.0806547403335571, + "learning_rate": 1.7057223921531707e-05, + "loss": 0.3664, + "step": 8303 + }, + { + "epoch": 2.666238561566865, + "grad_norm": 1.0438770055770874, + "learning_rate": 1.7037729668349877e-05, + "loss": 0.3832, + "step": 8304 + }, + { + "epoch": 2.666559640391716, + "grad_norm": 0.9235129356384277, + "learning_rate": 1.7018245523869036e-05, + "loss": 0.345, + "step": 8305 + }, + { + "epoch": 2.6668807192165676, + "grad_norm": 0.9867783784866333, + "learning_rate": 1.6998771490463262e-05, + "loss": 0.3577, + "step": 8306 + }, + { + "epoch": 2.667201798041419, + "grad_norm": 0.9603634476661682, + "learning_rate": 1.697930757050542e-05, + "loss": 0.3107, + "step": 8307 + }, + { + "epoch": 2.6675228768662707, + "grad_norm": 0.6079133749008179, + "learning_rate": 1.6959853766367118e-05, + "loss": 0.2685, + "step": 8308 + }, + { + "epoch": 2.667843955691122, + "grad_norm": 1.3194692134857178, + "learning_rate": 1.6940410080418723e-05, + "loss": 0.3876, + "step": 8309 + }, + { + "epoch": 2.6681650345159738, + "grad_norm": 0.7050005793571472, + "learning_rate": 1.6920976515029463e-05, + "loss": 0.301, + "step": 8310 + }, + { + "epoch": 2.6684861133408253, + "grad_norm": 1.0123231410980225, + "learning_rate": 1.690155307256719e-05, + "loss": 0.4113, + "step": 8311 + }, + { + "epoch": 2.6688071921656764, + "grad_norm": 0.8244590163230896, + "learning_rate": 1.6882139755398574e-05, + "loss": 0.3621, + "step": 8312 + }, + { + "epoch": 2.6691282709905284, + "grad_norm": 0.5136908292770386, + "learning_rate": 1.686273656588917e-05, + "loss": 0.2476, + "step": 8313 + }, + { + "epoch": 2.6694493498153795, + "grad_norm": 0.7136945128440857, + "learning_rate": 1.6843343506403075e-05, + "loss": 0.2703, + "step": 8314 + }, + { + "epoch": 2.669770428640231, + "grad_norm": 1.0239125490188599, + "learning_rate": 1.6823960579303377e-05, + "loss": 0.314, + "step": 8315 + }, + { + "epoch": 2.6700915074650826, + "grad_norm": 0.6786704063415527, + "learning_rate": 1.680458778695174e-05, + "loss": 0.2897, + "step": 8316 + }, + { + "epoch": 2.670412586289934, + "grad_norm": 0.6435678601264954, + "learning_rate": 1.678522513170875e-05, + "loss": 0.274, + "step": 8317 + }, + { + "epoch": 2.6707336651147857, + "grad_norm": 0.6759331822395325, + "learning_rate": 1.6765872615933677e-05, + "loss": 0.2739, + "step": 8318 + }, + { + "epoch": 2.6710547439396373, + "grad_norm": 0.7092223167419434, + "learning_rate": 1.6746530241984504e-05, + "loss": 0.2994, + "step": 8319 + }, + { + "epoch": 2.671375822764489, + "grad_norm": 0.9770942330360413, + "learning_rate": 1.6727198012218114e-05, + "loss": 0.3379, + "step": 8320 + }, + { + "epoch": 2.67169690158934, + "grad_norm": 1.3338515758514404, + "learning_rate": 1.6707875928990058e-05, + "loss": 0.3295, + "step": 8321 + }, + { + "epoch": 2.672017980414192, + "grad_norm": 0.6919156312942505, + "learning_rate": 1.668856399465466e-05, + "loss": 0.2663, + "step": 8322 + }, + { + "epoch": 2.672339059239043, + "grad_norm": 0.9007259011268616, + "learning_rate": 1.6669262211565028e-05, + "loss": 0.3146, + "step": 8323 + }, + { + "epoch": 2.6726601380638946, + "grad_norm": 0.6799442172050476, + "learning_rate": 1.6649970582073028e-05, + "loss": 0.2957, + "step": 8324 + }, + { + "epoch": 2.672981216888746, + "grad_norm": 0.8429846167564392, + "learning_rate": 1.6630689108529284e-05, + "loss": 0.3215, + "step": 8325 + }, + { + "epoch": 2.6733022957135977, + "grad_norm": 0.9628200531005859, + "learning_rate": 1.661141779328319e-05, + "loss": 0.3283, + "step": 8326 + }, + { + "epoch": 2.6736233745384492, + "grad_norm": 0.7817652225494385, + "learning_rate": 1.6592156638682886e-05, + "loss": 0.2815, + "step": 8327 + }, + { + "epoch": 2.6739444533633008, + "grad_norm": 0.5728936791419983, + "learning_rate": 1.6572905647075298e-05, + "loss": 0.2687, + "step": 8328 + }, + { + "epoch": 2.6742655321881523, + "grad_norm": 0.3916088342666626, + "learning_rate": 1.6553664820806102e-05, + "loss": 0.2574, + "step": 8329 + }, + { + "epoch": 2.6745866110130034, + "grad_norm": 0.5118910670280457, + "learning_rate": 1.6534434162219727e-05, + "loss": 0.6336, + "step": 8330 + }, + { + "epoch": 2.6749076898378554, + "grad_norm": 0.4536105990409851, + "learning_rate": 1.6515213673659357e-05, + "loss": 0.4168, + "step": 8331 + }, + { + "epoch": 2.6752287686627065, + "grad_norm": 0.442342072725296, + "learning_rate": 1.649600335746695e-05, + "loss": 0.2928, + "step": 8332 + }, + { + "epoch": 2.675549847487558, + "grad_norm": 0.4411477744579315, + "learning_rate": 1.6476803215983294e-05, + "loss": 0.1311, + "step": 8333 + }, + { + "epoch": 2.6758709263124096, + "grad_norm": 0.3371138572692871, + "learning_rate": 1.6457613251547754e-05, + "loss": 0.1534, + "step": 8334 + }, + { + "epoch": 2.676192005137261, + "grad_norm": 0.6045904755592346, + "learning_rate": 1.643843346649866e-05, + "loss": 0.3301, + "step": 8335 + }, + { + "epoch": 2.6765130839621127, + "grad_norm": 0.905532956123352, + "learning_rate": 1.6419263863172997e-05, + "loss": 0.3937, + "step": 8336 + }, + { + "epoch": 2.6768341627869643, + "grad_norm": 0.7415409088134766, + "learning_rate": 1.640010444390646e-05, + "loss": 0.3073, + "step": 8337 + }, + { + "epoch": 2.677155241611816, + "grad_norm": 0.8520146608352661, + "learning_rate": 1.6380955211033656e-05, + "loss": 0.4127, + "step": 8338 + }, + { + "epoch": 2.677476320436667, + "grad_norm": 0.9074903130531311, + "learning_rate": 1.6361816166887768e-05, + "loss": 0.3889, + "step": 8339 + }, + { + "epoch": 2.677797399261519, + "grad_norm": 0.7967313528060913, + "learning_rate": 1.634268731380091e-05, + "loss": 0.2836, + "step": 8340 + }, + { + "epoch": 2.67811847808637, + "grad_norm": 0.7491098046302795, + "learning_rate": 1.632356865410384e-05, + "loss": 0.3364, + "step": 8341 + }, + { + "epoch": 2.6784395569112216, + "grad_norm": 0.7044479250907898, + "learning_rate": 1.6304460190126103e-05, + "loss": 0.3405, + "step": 8342 + }, + { + "epoch": 2.678760635736073, + "grad_norm": 0.7631996273994446, + "learning_rate": 1.628536192419603e-05, + "loss": 0.3188, + "step": 8343 + }, + { + "epoch": 2.6790817145609247, + "grad_norm": 0.9044662714004517, + "learning_rate": 1.6266273858640656e-05, + "loss": 0.3248, + "step": 8344 + }, + { + "epoch": 2.6794027933857762, + "grad_norm": 0.9730323553085327, + "learning_rate": 1.6247195995785837e-05, + "loss": 0.3272, + "step": 8345 + }, + { + "epoch": 2.679723872210628, + "grad_norm": 0.9909996390342712, + "learning_rate": 1.622812833795613e-05, + "loss": 0.4031, + "step": 8346 + }, + { + "epoch": 2.6800449510354794, + "grad_norm": 1.0814063549041748, + "learning_rate": 1.6209070887474876e-05, + "loss": 0.3565, + "step": 8347 + }, + { + "epoch": 2.6803660298603305, + "grad_norm": 0.8834957480430603, + "learning_rate": 1.6190023646664175e-05, + "loss": 0.4426, + "step": 8348 + }, + { + "epoch": 2.6806871086851825, + "grad_norm": 1.0340230464935303, + "learning_rate": 1.6170986617844863e-05, + "loss": 0.3924, + "step": 8349 + }, + { + "epoch": 2.6810081875100336, + "grad_norm": 0.743299663066864, + "learning_rate": 1.6151959803336535e-05, + "loss": 0.3009, + "step": 8350 + }, + { + "epoch": 2.681329266334885, + "grad_norm": 0.802480936050415, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.3529, + "step": 8351 + }, + { + "epoch": 2.6816503451597367, + "grad_norm": 1.1612759828567505, + "learning_rate": 1.611393682652511e-05, + "loss": 0.377, + "step": 8352 + }, + { + "epoch": 2.681971423984588, + "grad_norm": 0.9446073174476624, + "learning_rate": 1.6094940668855006e-05, + "loss": 0.3516, + "step": 8353 + }, + { + "epoch": 2.6822925028094398, + "grad_norm": 0.7460670471191406, + "learning_rate": 1.6075954734761845e-05, + "loss": 0.3126, + "step": 8354 + }, + { + "epoch": 2.6826135816342913, + "grad_norm": 0.7956533432006836, + "learning_rate": 1.6056979026559004e-05, + "loss": 0.3152, + "step": 8355 + }, + { + "epoch": 2.682934660459143, + "grad_norm": 0.5445581674575806, + "learning_rate": 1.6038013546558696e-05, + "loss": 0.2414, + "step": 8356 + }, + { + "epoch": 2.683255739283994, + "grad_norm": 0.8565784692764282, + "learning_rate": 1.601905829707171e-05, + "loss": 0.2997, + "step": 8357 + }, + { + "epoch": 2.683576818108846, + "grad_norm": 0.512998104095459, + "learning_rate": 1.600011328040777e-05, + "loss": 0.2233, + "step": 8358 + }, + { + "epoch": 2.683897896933697, + "grad_norm": 0.8238282203674316, + "learning_rate": 1.598117849887518e-05, + "loss": 0.3354, + "step": 8359 + }, + { + "epoch": 2.6842189757585486, + "grad_norm": 0.8783250451087952, + "learning_rate": 1.596225395478116e-05, + "loss": 0.3224, + "step": 8360 + }, + { + "epoch": 2.6845400545834, + "grad_norm": 0.6157239675521851, + "learning_rate": 1.5943339650431576e-05, + "loss": 0.2397, + "step": 8361 + }, + { + "epoch": 2.6848611334082517, + "grad_norm": 0.7254630327224731, + "learning_rate": 1.592443558813109e-05, + "loss": 0.249, + "step": 8362 + }, + { + "epoch": 2.6851822122331033, + "grad_norm": 1.0518107414245605, + "learning_rate": 1.5905541770183096e-05, + "loss": 0.3714, + "step": 8363 + }, + { + "epoch": 2.685503291057955, + "grad_norm": 0.6833174824714661, + "learning_rate": 1.588665819888976e-05, + "loss": 0.277, + "step": 8364 + }, + { + "epoch": 2.6858243698828064, + "grad_norm": 0.9686453938484192, + "learning_rate": 1.586778487655197e-05, + "loss": 0.4063, + "step": 8365 + }, + { + "epoch": 2.6861454487076575, + "grad_norm": 0.634539008140564, + "learning_rate": 1.5848921805469397e-05, + "loss": 0.2757, + "step": 8366 + }, + { + "epoch": 2.6864665275325095, + "grad_norm": 0.6041567325592041, + "learning_rate": 1.5830068987940438e-05, + "loss": 0.2413, + "step": 8367 + }, + { + "epoch": 2.6867876063573606, + "grad_norm": 0.7220975160598755, + "learning_rate": 1.581122642626226e-05, + "loss": 0.2697, + "step": 8368 + }, + { + "epoch": 2.687108685182212, + "grad_norm": 1.3267475366592407, + "learning_rate": 1.579239412273078e-05, + "loss": 0.3793, + "step": 8369 + }, + { + "epoch": 2.6874297640070637, + "grad_norm": 1.0630850791931152, + "learning_rate": 1.577357207964062e-05, + "loss": 0.3434, + "step": 8370 + }, + { + "epoch": 2.6877508428319152, + "grad_norm": 0.6697249412536621, + "learning_rate": 1.5754760299285252e-05, + "loss": 0.2852, + "step": 8371 + }, + { + "epoch": 2.688071921656767, + "grad_norm": 0.8393073081970215, + "learning_rate": 1.5735958783956794e-05, + "loss": 0.3452, + "step": 8372 + }, + { + "epoch": 2.6883930004816183, + "grad_norm": 0.8739592432975769, + "learning_rate": 1.5717167535946144e-05, + "loss": 0.3311, + "step": 8373 + }, + { + "epoch": 2.68871407930647, + "grad_norm": 0.4763057231903076, + "learning_rate": 1.569838655754298e-05, + "loss": 0.2477, + "step": 8374 + }, + { + "epoch": 2.689035158131321, + "grad_norm": 0.5161868333816528, + "learning_rate": 1.567961585103567e-05, + "loss": 0.2638, + "step": 8375 + }, + { + "epoch": 2.689356236956173, + "grad_norm": 0.5207780003547668, + "learning_rate": 1.566085541871145e-05, + "loss": 0.2612, + "step": 8376 + }, + { + "epoch": 2.689677315781024, + "grad_norm": 0.388313353061676, + "learning_rate": 1.564210526285612e-05, + "loss": 0.25, + "step": 8377 + }, + { + "epoch": 2.6899983946058756, + "grad_norm": 0.6559248566627502, + "learning_rate": 1.5623365385754408e-05, + "loss": 0.2934, + "step": 8378 + }, + { + "epoch": 2.690319473430727, + "grad_norm": 0.6351720094680786, + "learning_rate": 1.560463578968967e-05, + "loss": 0.2985, + "step": 8379 + }, + { + "epoch": 2.6906405522555787, + "grad_norm": 0.6100612282752991, + "learning_rate": 1.5585916476944073e-05, + "loss": 0.7795, + "step": 8380 + }, + { + "epoch": 2.6909616310804303, + "grad_norm": 0.3323669135570526, + "learning_rate": 1.5567207449798515e-05, + "loss": 0.246, + "step": 8381 + }, + { + "epoch": 2.691282709905282, + "grad_norm": 0.4569035768508911, + "learning_rate": 1.5548508710532572e-05, + "loss": 0.1564, + "step": 8382 + }, + { + "epoch": 2.6916037887301334, + "grad_norm": 0.3317892551422119, + "learning_rate": 1.5529820261424698e-05, + "loss": 0.1498, + "step": 8383 + }, + { + "epoch": 2.6919248675549845, + "grad_norm": 0.3487982451915741, + "learning_rate": 1.551114210475201e-05, + "loss": 0.1257, + "step": 8384 + }, + { + "epoch": 2.6922459463798365, + "grad_norm": 0.42391496896743774, + "learning_rate": 1.5492474242790366e-05, + "loss": 0.1278, + "step": 8385 + }, + { + "epoch": 2.6925670252046876, + "grad_norm": 0.3310485780239105, + "learning_rate": 1.547381667781439e-05, + "loss": 0.1795, + "step": 8386 + }, + { + "epoch": 2.692888104029539, + "grad_norm": 0.8500526547431946, + "learning_rate": 1.545516941209747e-05, + "loss": 0.3601, + "step": 8387 + }, + { + "epoch": 2.6932091828543907, + "grad_norm": 0.8416752815246582, + "learning_rate": 1.54365324479117e-05, + "loss": 0.3523, + "step": 8388 + }, + { + "epoch": 2.6935302616792423, + "grad_norm": 0.7443310022354126, + "learning_rate": 1.541790578752794e-05, + "loss": 0.3137, + "step": 8389 + }, + { + "epoch": 2.693851340504094, + "grad_norm": 0.7136545777320862, + "learning_rate": 1.539928943321579e-05, + "loss": 0.2981, + "step": 8390 + }, + { + "epoch": 2.6941724193289454, + "grad_norm": 0.6598768830299377, + "learning_rate": 1.538068338724361e-05, + "loss": 0.2808, + "step": 8391 + }, + { + "epoch": 2.694493498153797, + "grad_norm": 0.8997542858123779, + "learning_rate": 1.5362087651878475e-05, + "loss": 0.3301, + "step": 8392 + }, + { + "epoch": 2.694814576978648, + "grad_norm": 0.9594383835792542, + "learning_rate": 1.5343502229386207e-05, + "loss": 0.382, + "step": 8393 + }, + { + "epoch": 2.6951356558035, + "grad_norm": 0.7930410504341125, + "learning_rate": 1.532492712203145e-05, + "loss": 0.3238, + "step": 8394 + }, + { + "epoch": 2.695456734628351, + "grad_norm": 0.918022096157074, + "learning_rate": 1.530636233207743e-05, + "loss": 0.3953, + "step": 8395 + }, + { + "epoch": 2.6957778134532027, + "grad_norm": 1.3082184791564941, + "learning_rate": 1.528780786178631e-05, + "loss": 0.4692, + "step": 8396 + }, + { + "epoch": 2.696098892278054, + "grad_norm": 0.8029715418815613, + "learning_rate": 1.526926371341878e-05, + "loss": 0.3129, + "step": 8397 + }, + { + "epoch": 2.6964199711029058, + "grad_norm": 0.8471099734306335, + "learning_rate": 1.5250729889234482e-05, + "loss": 0.3716, + "step": 8398 + }, + { + "epoch": 2.6967410499277573, + "grad_norm": 0.8596959710121155, + "learning_rate": 1.5232206391491699e-05, + "loss": 0.3605, + "step": 8399 + }, + { + "epoch": 2.697062128752609, + "grad_norm": 0.6960004568099976, + "learning_rate": 1.521369322244739e-05, + "loss": 0.288, + "step": 8400 + }, + { + "epoch": 2.6973832075774604, + "grad_norm": 0.6006725430488586, + "learning_rate": 1.5195190384357404e-05, + "loss": 0.2479, + "step": 8401 + }, + { + "epoch": 2.6977042864023115, + "grad_norm": 0.9754154086112976, + "learning_rate": 1.5176697879476232e-05, + "loss": 0.2391, + "step": 8402 + }, + { + "epoch": 2.6980253652271635, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.5158215710057123e-05, + "loss": 0.2767, + "step": 8403 + }, + { + "epoch": 2.6983464440520146, + "grad_norm": 0.9271560907363892, + "learning_rate": 1.5139743878352075e-05, + "loss": 0.3777, + "step": 8404 + }, + { + "epoch": 2.698667522876866, + "grad_norm": 0.9956105947494507, + "learning_rate": 1.5121282386611824e-05, + "loss": 0.4351, + "step": 8405 + }, + { + "epoch": 2.6989886017017177, + "grad_norm": 1.1439400911331177, + "learning_rate": 1.5102831237085857e-05, + "loss": 0.3981, + "step": 8406 + }, + { + "epoch": 2.6993096805265693, + "grad_norm": 0.8666257262229919, + "learning_rate": 1.5084390432022377e-05, + "loss": 0.3045, + "step": 8407 + }, + { + "epoch": 2.699630759351421, + "grad_norm": 0.7622800469398499, + "learning_rate": 1.5065959973668353e-05, + "loss": 0.2846, + "step": 8408 + }, + { + "epoch": 2.6999518381762724, + "grad_norm": 0.9066924452781677, + "learning_rate": 1.5047539864269478e-05, + "loss": 0.2969, + "step": 8409 + }, + { + "epoch": 2.700272917001124, + "grad_norm": 0.9989892244338989, + "learning_rate": 1.5029130106070167e-05, + "loss": 0.3537, + "step": 8410 + }, + { + "epoch": 2.700593995825975, + "grad_norm": 1.0656063556671143, + "learning_rate": 1.5010730701313625e-05, + "loss": 0.3365, + "step": 8411 + }, + { + "epoch": 2.700915074650827, + "grad_norm": 0.9919269680976868, + "learning_rate": 1.4992341652241737e-05, + "loss": 0.3995, + "step": 8412 + }, + { + "epoch": 2.701236153475678, + "grad_norm": 0.9192927479743958, + "learning_rate": 1.4973962961095134e-05, + "loss": 0.3682, + "step": 8413 + }, + { + "epoch": 2.7015572323005297, + "grad_norm": 0.9061486124992371, + "learning_rate": 1.495559463011329e-05, + "loss": 0.3291, + "step": 8414 + }, + { + "epoch": 2.7018783111253812, + "grad_norm": 1.0932555198669434, + "learning_rate": 1.4937236661534226e-05, + "loss": 0.274, + "step": 8415 + }, + { + "epoch": 2.702199389950233, + "grad_norm": 1.2957507371902466, + "learning_rate": 1.4918889057594876e-05, + "loss": 0.2687, + "step": 8416 + }, + { + "epoch": 2.7025204687750843, + "grad_norm": 0.8411356806755066, + "learning_rate": 1.4900551820530828e-05, + "loss": 0.2988, + "step": 8417 + }, + { + "epoch": 2.702841547599936, + "grad_norm": 1.028803825378418, + "learning_rate": 1.4882224952576373e-05, + "loss": 0.393, + "step": 8418 + }, + { + "epoch": 2.7031626264247874, + "grad_norm": 1.0516469478607178, + "learning_rate": 1.486390845596466e-05, + "loss": 0.3689, + "step": 8419 + }, + { + "epoch": 2.7034837052496385, + "grad_norm": 0.5976935029029846, + "learning_rate": 1.484560233292741e-05, + "loss": 0.2572, + "step": 8420 + }, + { + "epoch": 2.7038047840744905, + "grad_norm": 1.021976113319397, + "learning_rate": 1.4827306585695234e-05, + "loss": 0.4232, + "step": 8421 + }, + { + "epoch": 2.7041258628993416, + "grad_norm": 0.8594872951507568, + "learning_rate": 1.4809021216497399e-05, + "loss": 0.3019, + "step": 8422 + }, + { + "epoch": 2.704446941724193, + "grad_norm": 0.8935600519180298, + "learning_rate": 1.4790746227561925e-05, + "loss": 0.2665, + "step": 8423 + }, + { + "epoch": 2.7047680205490447, + "grad_norm": 0.5370054244995117, + "learning_rate": 1.4772481621115541e-05, + "loss": 0.2293, + "step": 8424 + }, + { + "epoch": 2.7050890993738963, + "grad_norm": 0.42363736033439636, + "learning_rate": 1.4754227399383757e-05, + "loss": 0.2423, + "step": 8425 + }, + { + "epoch": 2.705410178198748, + "grad_norm": 0.6202889680862427, + "learning_rate": 1.4735983564590783e-05, + "loss": 0.3039, + "step": 8426 + }, + { + "epoch": 2.7057312570235994, + "grad_norm": 0.5840016603469849, + "learning_rate": 1.4717750118959584e-05, + "loss": 0.3053, + "step": 8427 + }, + { + "epoch": 2.706052335848451, + "grad_norm": 0.5942694544792175, + "learning_rate": 1.4699527064711838e-05, + "loss": 0.2565, + "step": 8428 + }, + { + "epoch": 2.706373414673302, + "grad_norm": 0.7443995475769043, + "learning_rate": 1.468131440406798e-05, + "loss": 0.3169, + "step": 8429 + }, + { + "epoch": 2.706694493498154, + "grad_norm": 0.47813400626182556, + "learning_rate": 1.466311213924716e-05, + "loss": 0.6779, + "step": 8430 + }, + { + "epoch": 2.707015572323005, + "grad_norm": 0.406895250082016, + "learning_rate": 1.4644920272467244e-05, + "loss": 0.5476, + "step": 8431 + }, + { + "epoch": 2.7073366511478567, + "grad_norm": 0.3295402228832245, + "learning_rate": 1.462673880594494e-05, + "loss": 0.155, + "step": 8432 + }, + { + "epoch": 2.7076577299727083, + "grad_norm": 0.23888343572616577, + "learning_rate": 1.4608567741895495e-05, + "loss": 0.0741, + "step": 8433 + }, + { + "epoch": 2.70797880879756, + "grad_norm": 0.20620322227478027, + "learning_rate": 1.4590407082533097e-05, + "loss": 0.0649, + "step": 8434 + }, + { + "epoch": 2.7082998876224114, + "grad_norm": 0.3761898875236511, + "learning_rate": 1.4572256830070497e-05, + "loss": 0.2023, + "step": 8435 + }, + { + "epoch": 2.708620966447263, + "grad_norm": 0.45816802978515625, + "learning_rate": 1.4554116986719257e-05, + "loss": 0.2164, + "step": 8436 + }, + { + "epoch": 2.7089420452721145, + "grad_norm": 0.8823957443237305, + "learning_rate": 1.4535987554689712e-05, + "loss": 0.5551, + "step": 8437 + }, + { + "epoch": 2.7092631240969656, + "grad_norm": 0.8065453171730042, + "learning_rate": 1.4517868536190803e-05, + "loss": 0.396, + "step": 8438 + }, + { + "epoch": 2.7095842029218176, + "grad_norm": 0.9334639310836792, + "learning_rate": 1.4499759933430346e-05, + "loss": 0.4288, + "step": 8439 + }, + { + "epoch": 2.7099052817466687, + "grad_norm": 0.9277933239936829, + "learning_rate": 1.4481661748614784e-05, + "loss": 0.3647, + "step": 8440 + }, + { + "epoch": 2.71022636057152, + "grad_norm": 0.7920669317245483, + "learning_rate": 1.4463573983949341e-05, + "loss": 0.3438, + "step": 8441 + }, + { + "epoch": 2.7105474393963718, + "grad_norm": 1.0169240236282349, + "learning_rate": 1.4445496641637967e-05, + "loss": 0.4013, + "step": 8442 + }, + { + "epoch": 2.7108685182212233, + "grad_norm": 0.83201664686203, + "learning_rate": 1.4427429723883257e-05, + "loss": 0.3804, + "step": 8443 + }, + { + "epoch": 2.711189597046075, + "grad_norm": 0.9397286772727966, + "learning_rate": 1.4409373232886702e-05, + "loss": 0.3005, + "step": 8444 + }, + { + "epoch": 2.7115106758709264, + "grad_norm": 0.6609529852867126, + "learning_rate": 1.439132717084839e-05, + "loss": 0.314, + "step": 8445 + }, + { + "epoch": 2.711831754695778, + "grad_norm": 0.8575668334960938, + "learning_rate": 1.4373291539967182e-05, + "loss": 0.3991, + "step": 8446 + }, + { + "epoch": 2.712152833520629, + "grad_norm": 1.156197428703308, + "learning_rate": 1.4355266342440677e-05, + "loss": 0.3177, + "step": 8447 + }, + { + "epoch": 2.712473912345481, + "grad_norm": 0.7570515275001526, + "learning_rate": 1.4337251580465172e-05, + "loss": 0.3598, + "step": 8448 + }, + { + "epoch": 2.712794991170332, + "grad_norm": 0.7249849438667297, + "learning_rate": 1.4319247256235714e-05, + "loss": 0.2816, + "step": 8449 + }, + { + "epoch": 2.7131160699951837, + "grad_norm": 0.9076480865478516, + "learning_rate": 1.4301253371946089e-05, + "loss": 0.3801, + "step": 8450 + }, + { + "epoch": 2.7134371488200353, + "grad_norm": 1.0600674152374268, + "learning_rate": 1.4283269929788779e-05, + "loss": 0.4087, + "step": 8451 + }, + { + "epoch": 2.713758227644887, + "grad_norm": 1.2555755376815796, + "learning_rate": 1.426529693195503e-05, + "loss": 0.5346, + "step": 8452 + }, + { + "epoch": 2.7140793064697384, + "grad_norm": 0.9253405928611755, + "learning_rate": 1.4247334380634792e-05, + "loss": 0.3705, + "step": 8453 + }, + { + "epoch": 2.71440038529459, + "grad_norm": 0.9356733560562134, + "learning_rate": 1.4229382278016712e-05, + "loss": 0.3621, + "step": 8454 + }, + { + "epoch": 2.7147214641194415, + "grad_norm": 1.4253745079040527, + "learning_rate": 1.4211440626288286e-05, + "loss": 0.4511, + "step": 8455 + }, + { + "epoch": 2.7150425429442926, + "grad_norm": 1.0171775817871094, + "learning_rate": 1.4193509427635543e-05, + "loss": 0.3103, + "step": 8456 + }, + { + "epoch": 2.7153636217691446, + "grad_norm": 1.1757081747055054, + "learning_rate": 1.4175588684243446e-05, + "loss": 0.4486, + "step": 8457 + }, + { + "epoch": 2.7156847005939957, + "grad_norm": 0.8698387742042542, + "learning_rate": 1.4157678398295481e-05, + "loss": 0.3556, + "step": 8458 + }, + { + "epoch": 2.7160057794188472, + "grad_norm": 0.8687936067581177, + "learning_rate": 1.4139778571974049e-05, + "loss": 0.3552, + "step": 8459 + }, + { + "epoch": 2.716326858243699, + "grad_norm": 0.8789449334144592, + "learning_rate": 1.412188920746017e-05, + "loss": 0.3352, + "step": 8460 + }, + { + "epoch": 2.7166479370685503, + "grad_norm": 0.8837921023368835, + "learning_rate": 1.4104010306933557e-05, + "loss": 0.3605, + "step": 8461 + }, + { + "epoch": 2.716969015893402, + "grad_norm": 0.7887502312660217, + "learning_rate": 1.4086141872572789e-05, + "loss": 0.2859, + "step": 8462 + }, + { + "epoch": 2.7172900947182534, + "grad_norm": 0.8587145805358887, + "learning_rate": 1.406828390655497e-05, + "loss": 0.3172, + "step": 8463 + }, + { + "epoch": 2.717611173543105, + "grad_norm": 0.6038246750831604, + "learning_rate": 1.4050436411056123e-05, + "loss": 0.2771, + "step": 8464 + }, + { + "epoch": 2.717932252367956, + "grad_norm": 0.7046909332275391, + "learning_rate": 1.40325993882509e-05, + "loss": 0.2853, + "step": 8465 + }, + { + "epoch": 2.718253331192808, + "grad_norm": 0.6671448945999146, + "learning_rate": 1.4014772840312663e-05, + "loss": 0.2452, + "step": 8466 + }, + { + "epoch": 2.718574410017659, + "grad_norm": 0.46538805961608887, + "learning_rate": 1.3996956769413538e-05, + "loss": 0.2233, + "step": 8467 + }, + { + "epoch": 2.7188954888425108, + "grad_norm": 0.845503032207489, + "learning_rate": 1.3979151177724347e-05, + "loss": 0.329, + "step": 8468 + }, + { + "epoch": 2.7192165676673623, + "grad_norm": 0.6878446936607361, + "learning_rate": 1.3961356067414666e-05, + "loss": 0.255, + "step": 8469 + }, + { + "epoch": 2.719537646492214, + "grad_norm": 0.9203073978424072, + "learning_rate": 1.3943571440652769e-05, + "loss": 0.3479, + "step": 8470 + }, + { + "epoch": 2.7198587253170654, + "grad_norm": 0.7054129242897034, + "learning_rate": 1.3925797299605647e-05, + "loss": 0.3034, + "step": 8471 + }, + { + "epoch": 2.720179804141917, + "grad_norm": 1.0850917100906372, + "learning_rate": 1.3908033646439033e-05, + "loss": 0.3771, + "step": 8472 + }, + { + "epoch": 2.7205008829667685, + "grad_norm": 0.6727474927902222, + "learning_rate": 1.3890280483317374e-05, + "loss": 0.2476, + "step": 8473 + }, + { + "epoch": 2.7208219617916196, + "grad_norm": 0.7375319600105286, + "learning_rate": 1.3872537812403829e-05, + "loss": 0.2564, + "step": 8474 + }, + { + "epoch": 2.7211430406164716, + "grad_norm": 0.4288458526134491, + "learning_rate": 1.3854805635860336e-05, + "loss": 0.2455, + "step": 8475 + }, + { + "epoch": 2.7214641194413227, + "grad_norm": 0.5464689135551453, + "learning_rate": 1.3837083955847418e-05, + "loss": 0.2698, + "step": 8476 + }, + { + "epoch": 2.7217851982661743, + "grad_norm": 0.6613762378692627, + "learning_rate": 1.3819372774524508e-05, + "loss": 0.2394, + "step": 8477 + }, + { + "epoch": 2.722106277091026, + "grad_norm": 0.6144346594810486, + "learning_rate": 1.38016720940496e-05, + "loss": 0.2879, + "step": 8478 + }, + { + "epoch": 2.7224273559158774, + "grad_norm": 0.4245906174182892, + "learning_rate": 1.3783981916579446e-05, + "loss": 0.2576, + "step": 8479 + }, + { + "epoch": 2.722748434740729, + "grad_norm": 0.5054341554641724, + "learning_rate": 1.3766302244269624e-05, + "loss": 0.6253, + "step": 8480 + }, + { + "epoch": 2.7230695135655805, + "grad_norm": 0.44886812567710876, + "learning_rate": 1.3748633079274253e-05, + "loss": 0.3354, + "step": 8481 + }, + { + "epoch": 2.723390592390432, + "grad_norm": 0.39985498785972595, + "learning_rate": 1.3730974423746335e-05, + "loss": 0.2347, + "step": 8482 + }, + { + "epoch": 2.723711671215283, + "grad_norm": 0.5271901488304138, + "learning_rate": 1.3713326279837501e-05, + "loss": 0.2654, + "step": 8483 + }, + { + "epoch": 2.724032750040135, + "grad_norm": 0.4039822518825531, + "learning_rate": 1.3695688649698124e-05, + "loss": 0.1506, + "step": 8484 + }, + { + "epoch": 2.7243538288649862, + "grad_norm": 0.4156002104282379, + "learning_rate": 1.3678061535477304e-05, + "loss": 0.1746, + "step": 8485 + }, + { + "epoch": 2.7246749076898378, + "grad_norm": 0.43996649980545044, + "learning_rate": 1.3660444939322836e-05, + "loss": 0.1855, + "step": 8486 + }, + { + "epoch": 2.7249959865146893, + "grad_norm": 1.112030267715454, + "learning_rate": 1.3642838863381257e-05, + "loss": 0.4318, + "step": 8487 + }, + { + "epoch": 2.725317065339541, + "grad_norm": 0.7671384215354919, + "learning_rate": 1.362524330979782e-05, + "loss": 0.3795, + "step": 8488 + }, + { + "epoch": 2.7256381441643924, + "grad_norm": 0.8614218831062317, + "learning_rate": 1.3607658280716473e-05, + "loss": 0.3608, + "step": 8489 + }, + { + "epoch": 2.725959222989244, + "grad_norm": 0.7562094330787659, + "learning_rate": 1.3590083778279916e-05, + "loss": 0.4333, + "step": 8490 + }, + { + "epoch": 2.7262803018140955, + "grad_norm": 0.6444230079650879, + "learning_rate": 1.3572519804629536e-05, + "loss": 0.2859, + "step": 8491 + }, + { + "epoch": 2.7266013806389466, + "grad_norm": 0.707561194896698, + "learning_rate": 1.3554966361905464e-05, + "loss": 0.2902, + "step": 8492 + }, + { + "epoch": 2.7269224594637986, + "grad_norm": 0.7419070601463318, + "learning_rate": 1.3537423452246523e-05, + "loss": 0.3034, + "step": 8493 + }, + { + "epoch": 2.7272435382886497, + "grad_norm": 0.9765730500221252, + "learning_rate": 1.3519891077790236e-05, + "loss": 0.3747, + "step": 8494 + }, + { + "epoch": 2.7275646171135013, + "grad_norm": 0.6046754717826843, + "learning_rate": 1.3502369240672941e-05, + "loss": 0.2038, + "step": 8495 + }, + { + "epoch": 2.727885695938353, + "grad_norm": 0.7997030019760132, + "learning_rate": 1.3484857943029572e-05, + "loss": 0.3041, + "step": 8496 + }, + { + "epoch": 2.7282067747632044, + "grad_norm": 0.879155695438385, + "learning_rate": 1.34673571869938e-05, + "loss": 0.3071, + "step": 8497 + }, + { + "epoch": 2.728527853588056, + "grad_norm": 1.1423804759979248, + "learning_rate": 1.3449866974698122e-05, + "loss": 0.3656, + "step": 8498 + }, + { + "epoch": 2.7288489324129075, + "grad_norm": 0.8787369132041931, + "learning_rate": 1.3432387308273575e-05, + "loss": 0.4357, + "step": 8499 + }, + { + "epoch": 2.729170011237759, + "grad_norm": 1.0522805452346802, + "learning_rate": 1.3414918189850089e-05, + "loss": 0.4416, + "step": 8500 + }, + { + "epoch": 2.72949109006261, + "grad_norm": 0.7044956684112549, + "learning_rate": 1.339745962155613e-05, + "loss": 0.2899, + "step": 8501 + }, + { + "epoch": 2.729812168887462, + "grad_norm": 0.7710807919502258, + "learning_rate": 1.338001160551906e-05, + "loss": 0.2822, + "step": 8502 + }, + { + "epoch": 2.7301332477123132, + "grad_norm": 1.0037212371826172, + "learning_rate": 1.3362574143864814e-05, + "loss": 0.3884, + "step": 8503 + }, + { + "epoch": 2.730454326537165, + "grad_norm": 0.7747420072555542, + "learning_rate": 1.3345147238718126e-05, + "loss": 0.3585, + "step": 8504 + }, + { + "epoch": 2.7307754053620164, + "grad_norm": 0.5802621841430664, + "learning_rate": 1.3327730892202383e-05, + "loss": 0.2472, + "step": 8505 + }, + { + "epoch": 2.731096484186868, + "grad_norm": 0.7853018641471863, + "learning_rate": 1.3310325106439726e-05, + "loss": 0.2751, + "step": 8506 + }, + { + "epoch": 2.7314175630117195, + "grad_norm": 0.5948209166526794, + "learning_rate": 1.3292929883550998e-05, + "loss": 0.2399, + "step": 8507 + }, + { + "epoch": 2.731738641836571, + "grad_norm": 1.0438854694366455, + "learning_rate": 1.327554522565576e-05, + "loss": 0.3403, + "step": 8508 + }, + { + "epoch": 2.7320597206614226, + "grad_norm": 0.662353515625, + "learning_rate": 1.3258171134872265e-05, + "loss": 0.2787, + "step": 8509 + }, + { + "epoch": 2.7323807994862737, + "grad_norm": 0.8576146960258484, + "learning_rate": 1.3240807613317507e-05, + "loss": 0.3339, + "step": 8510 + }, + { + "epoch": 2.7327018783111257, + "grad_norm": 0.8656917214393616, + "learning_rate": 1.3223454663107172e-05, + "loss": 0.3105, + "step": 8511 + }, + { + "epoch": 2.7330229571359768, + "grad_norm": 0.6916074752807617, + "learning_rate": 1.3206112286355632e-05, + "loss": 0.2808, + "step": 8512 + }, + { + "epoch": 2.7333440359608283, + "grad_norm": 0.5651912689208984, + "learning_rate": 1.3188780485176088e-05, + "loss": 0.2671, + "step": 8513 + }, + { + "epoch": 2.73366511478568, + "grad_norm": 0.7664126753807068, + "learning_rate": 1.3171459261680297e-05, + "loss": 0.316, + "step": 8514 + }, + { + "epoch": 2.7339861936105314, + "grad_norm": 0.632695198059082, + "learning_rate": 1.3154148617978812e-05, + "loss": 0.2613, + "step": 8515 + }, + { + "epoch": 2.734307272435383, + "grad_norm": 0.7974990606307983, + "learning_rate": 1.3136848556180892e-05, + "loss": 0.2982, + "step": 8516 + }, + { + "epoch": 2.7346283512602345, + "grad_norm": 0.7887236475944519, + "learning_rate": 1.3119559078394461e-05, + "loss": 0.2945, + "step": 8517 + }, + { + "epoch": 2.734949430085086, + "grad_norm": 0.8384912014007568, + "learning_rate": 1.3102280186726269e-05, + "loss": 0.3098, + "step": 8518 + }, + { + "epoch": 2.735270508909937, + "grad_norm": 0.8313494920730591, + "learning_rate": 1.3085011883281606e-05, + "loss": 0.3429, + "step": 8519 + }, + { + "epoch": 2.735591587734789, + "grad_norm": 0.7698463797569275, + "learning_rate": 1.3067754170164614e-05, + "loss": 0.276, + "step": 8520 + }, + { + "epoch": 2.7359126665596403, + "grad_norm": 0.7834967374801636, + "learning_rate": 1.30505070494781e-05, + "loss": 0.2935, + "step": 8521 + }, + { + "epoch": 2.736233745384492, + "grad_norm": 1.1544512510299683, + "learning_rate": 1.3033270523323549e-05, + "loss": 0.3743, + "step": 8522 + }, + { + "epoch": 2.7365548242093434, + "grad_norm": 1.0003769397735596, + "learning_rate": 1.3016044593801202e-05, + "loss": 0.3563, + "step": 8523 + }, + { + "epoch": 2.736875903034195, + "grad_norm": 1.167098879814148, + "learning_rate": 1.2998829263009938e-05, + "loss": 0.4428, + "step": 8524 + }, + { + "epoch": 2.7371969818590465, + "grad_norm": 0.758287250995636, + "learning_rate": 1.2981624533047432e-05, + "loss": 0.3105, + "step": 8525 + }, + { + "epoch": 2.737518060683898, + "grad_norm": 0.7774193286895752, + "learning_rate": 1.296443040601003e-05, + "loss": 0.3055, + "step": 8526 + }, + { + "epoch": 2.7378391395087496, + "grad_norm": 0.4565434455871582, + "learning_rate": 1.294724688399278e-05, + "loss": 0.2601, + "step": 8527 + }, + { + "epoch": 2.7381602183336007, + "grad_norm": 0.8566820025444031, + "learning_rate": 1.293007396908944e-05, + "loss": 0.2995, + "step": 8528 + }, + { + "epoch": 2.7384812971584527, + "grad_norm": 0.6120618581771851, + "learning_rate": 1.2912911663392469e-05, + "loss": 0.3114, + "step": 8529 + }, + { + "epoch": 2.738802375983304, + "grad_norm": 0.46445560455322266, + "learning_rate": 1.2895759968993048e-05, + "loss": 0.5424, + "step": 8530 + }, + { + "epoch": 2.7391234548081553, + "grad_norm": 0.5213553309440613, + "learning_rate": 1.2878618887981064e-05, + "loss": 0.6759, + "step": 8531 + }, + { + "epoch": 2.739444533633007, + "grad_norm": 0.38170596957206726, + "learning_rate": 1.28614884224451e-05, + "loss": 0.2189, + "step": 8532 + }, + { + "epoch": 2.7397656124578584, + "grad_norm": 0.4311400353908539, + "learning_rate": 1.2844368574472454e-05, + "loss": 0.2501, + "step": 8533 + }, + { + "epoch": 2.74008669128271, + "grad_norm": 0.23123976588249207, + "learning_rate": 1.2827259346149122e-05, + "loss": 0.0694, + "step": 8534 + }, + { + "epoch": 2.7404077701075615, + "grad_norm": 0.32238295674324036, + "learning_rate": 1.2810160739559796e-05, + "loss": 0.1146, + "step": 8535 + }, + { + "epoch": 2.740728848932413, + "grad_norm": 0.1789664924144745, + "learning_rate": 1.279307275678795e-05, + "loss": 0.063, + "step": 8536 + }, + { + "epoch": 2.741049927757264, + "grad_norm": 0.7732048630714417, + "learning_rate": 1.2775995399915631e-05, + "loss": 0.4496, + "step": 8537 + }, + { + "epoch": 2.7413710065821157, + "grad_norm": 1.2547789812088013, + "learning_rate": 1.2758928671023718e-05, + "loss": 0.4711, + "step": 8538 + }, + { + "epoch": 2.7416920854069673, + "grad_norm": 0.7271549105644226, + "learning_rate": 1.2741872572191682e-05, + "loss": 0.3086, + "step": 8539 + }, + { + "epoch": 2.742013164231819, + "grad_norm": 0.7055025696754456, + "learning_rate": 1.2724827105497816e-05, + "loss": 0.3016, + "step": 8540 + }, + { + "epoch": 2.7423342430566704, + "grad_norm": 0.7456016540527344, + "learning_rate": 1.2707792273019048e-05, + "loss": 0.3477, + "step": 8541 + }, + { + "epoch": 2.742655321881522, + "grad_norm": 0.9478852152824402, + "learning_rate": 1.2690768076830972e-05, + "loss": 0.302, + "step": 8542 + }, + { + "epoch": 2.7429764007063735, + "grad_norm": 0.7163116335868835, + "learning_rate": 1.2673754519008008e-05, + "loss": 0.3128, + "step": 8543 + }, + { + "epoch": 2.743297479531225, + "grad_norm": 0.6127589344978333, + "learning_rate": 1.2656751601623118e-05, + "loss": 0.2145, + "step": 8544 + }, + { + "epoch": 2.7436185583560766, + "grad_norm": 0.8434281349182129, + "learning_rate": 1.2639759326748135e-05, + "loss": 0.2788, + "step": 8545 + }, + { + "epoch": 2.7439396371809277, + "grad_norm": 0.6575958728790283, + "learning_rate": 1.262277769645348e-05, + "loss": 0.2678, + "step": 8546 + }, + { + "epoch": 2.7442607160057793, + "grad_norm": 0.9952372312545776, + "learning_rate": 1.260580671280832e-05, + "loss": 0.3722, + "step": 8547 + }, + { + "epoch": 2.744581794830631, + "grad_norm": 1.0888564586639404, + "learning_rate": 1.2588846377880525e-05, + "loss": 0.4193, + "step": 8548 + }, + { + "epoch": 2.7449028736554824, + "grad_norm": 0.8872905373573303, + "learning_rate": 1.257189669373664e-05, + "loss": 0.3686, + "step": 8549 + }, + { + "epoch": 2.745223952480334, + "grad_norm": 0.8427690267562866, + "learning_rate": 1.2554957662441957e-05, + "loss": 0.3544, + "step": 8550 + }, + { + "epoch": 2.7455450313051855, + "grad_norm": 0.8962048888206482, + "learning_rate": 1.2538029286060426e-05, + "loss": 0.3592, + "step": 8551 + }, + { + "epoch": 2.745866110130037, + "grad_norm": 0.9221064448356628, + "learning_rate": 1.2521111566654731e-05, + "loss": 0.3621, + "step": 8552 + }, + { + "epoch": 2.7461871889548886, + "grad_norm": 1.0115526914596558, + "learning_rate": 1.2504204506286243e-05, + "loss": 0.4147, + "step": 8553 + }, + { + "epoch": 2.74650826777974, + "grad_norm": 0.8102017045021057, + "learning_rate": 1.2487308107015027e-05, + "loss": 0.3009, + "step": 8554 + }, + { + "epoch": 2.746829346604591, + "grad_norm": 1.0300239324569702, + "learning_rate": 1.2470422370899838e-05, + "loss": 0.3574, + "step": 8555 + }, + { + "epoch": 2.7471504254294428, + "grad_norm": 1.0161161422729492, + "learning_rate": 1.2453547299998225e-05, + "loss": 0.3415, + "step": 8556 + }, + { + "epoch": 2.7474715042542943, + "grad_norm": 1.0009255409240723, + "learning_rate": 1.243668289636628e-05, + "loss": 0.3357, + "step": 8557 + }, + { + "epoch": 2.747792583079146, + "grad_norm": 0.9013484120368958, + "learning_rate": 1.241982916205895e-05, + "loss": 0.354, + "step": 8558 + }, + { + "epoch": 2.7481136619039974, + "grad_norm": 0.9857357740402222, + "learning_rate": 1.2402986099129765e-05, + "loss": 0.3988, + "step": 8559 + }, + { + "epoch": 2.748434740728849, + "grad_norm": 0.8265845775604248, + "learning_rate": 1.2386153709630988e-05, + "loss": 0.3291, + "step": 8560 + }, + { + "epoch": 2.7487558195537005, + "grad_norm": 1.0555880069732666, + "learning_rate": 1.2369331995613665e-05, + "loss": 0.3589, + "step": 8561 + }, + { + "epoch": 2.749076898378552, + "grad_norm": 0.9744288921356201, + "learning_rate": 1.2352520959127379e-05, + "loss": 0.3236, + "step": 8562 + }, + { + "epoch": 2.7493979772034036, + "grad_norm": 0.7760259509086609, + "learning_rate": 1.2335720602220569e-05, + "loss": 0.2423, + "step": 8563 + }, + { + "epoch": 2.7497190560282547, + "grad_norm": 1.1198222637176514, + "learning_rate": 1.2318930926940298e-05, + "loss": 0.3958, + "step": 8564 + }, + { + "epoch": 2.7500401348531063, + "grad_norm": 1.1928644180297852, + "learning_rate": 1.2302151935332329e-05, + "loss": 0.4002, + "step": 8565 + }, + { + "epoch": 2.750361213677958, + "grad_norm": 0.7599969506263733, + "learning_rate": 1.228538362944115e-05, + "loss": 0.2612, + "step": 8566 + }, + { + "epoch": 2.7506822925028094, + "grad_norm": 1.1186960935592651, + "learning_rate": 1.2268626011309858e-05, + "loss": 0.3827, + "step": 8567 + }, + { + "epoch": 2.751003371327661, + "grad_norm": 0.6915915608406067, + "learning_rate": 1.22518790829804e-05, + "loss": 0.2565, + "step": 8568 + }, + { + "epoch": 2.7513244501525125, + "grad_norm": 0.7428755164146423, + "learning_rate": 1.2235142846493308e-05, + "loss": 0.2987, + "step": 8569 + }, + { + "epoch": 2.751645528977364, + "grad_norm": 0.7877930998802185, + "learning_rate": 1.2218417303887842e-05, + "loss": 0.3379, + "step": 8570 + }, + { + "epoch": 2.7519666078022156, + "grad_norm": 0.9466036558151245, + "learning_rate": 1.2201702457201947e-05, + "loss": 0.3894, + "step": 8571 + }, + { + "epoch": 2.752287686627067, + "grad_norm": 0.8674002289772034, + "learning_rate": 1.2184998308472295e-05, + "loss": 0.3414, + "step": 8572 + }, + { + "epoch": 2.7526087654519182, + "grad_norm": 0.8166511654853821, + "learning_rate": 1.2168304859734226e-05, + "loss": 0.3156, + "step": 8573 + }, + { + "epoch": 2.75292984427677, + "grad_norm": 0.7151662707328796, + "learning_rate": 1.2151622113021787e-05, + "loss": 0.3031, + "step": 8574 + }, + { + "epoch": 2.7532509231016213, + "grad_norm": 0.7391020655632019, + "learning_rate": 1.2134950070367723e-05, + "loss": 0.2881, + "step": 8575 + }, + { + "epoch": 2.753572001926473, + "grad_norm": 0.8440263867378235, + "learning_rate": 1.2118288733803473e-05, + "loss": 0.3104, + "step": 8576 + }, + { + "epoch": 2.7538930807513244, + "grad_norm": 0.8688570261001587, + "learning_rate": 1.2101638105359169e-05, + "loss": 0.3327, + "step": 8577 + }, + { + "epoch": 2.754214159576176, + "grad_norm": 0.35422104597091675, + "learning_rate": 1.2084998187063613e-05, + "loss": 0.2317, + "step": 8578 + }, + { + "epoch": 2.7545352384010275, + "grad_norm": 0.4497572183609009, + "learning_rate": 1.206836898094439e-05, + "loss": 0.2845, + "step": 8579 + }, + { + "epoch": 2.754856317225879, + "grad_norm": 0.48799851536750793, + "learning_rate": 1.2051750489027647e-05, + "loss": 0.6289, + "step": 8580 + }, + { + "epoch": 2.7551773960507306, + "grad_norm": 0.438763827085495, + "learning_rate": 1.2035142713338366e-05, + "loss": 0.5615, + "step": 8581 + }, + { + "epoch": 2.7554984748755817, + "grad_norm": 0.4711540639400482, + "learning_rate": 1.2018545655900081e-05, + "loss": 0.3515, + "step": 8582 + }, + { + "epoch": 2.7558195537004333, + "grad_norm": 0.43280723690986633, + "learning_rate": 1.2001959318735156e-05, + "loss": 0.16, + "step": 8583 + }, + { + "epoch": 2.756140632525285, + "grad_norm": 0.3058044910430908, + "learning_rate": 1.1985383703864583e-05, + "loss": 0.1416, + "step": 8584 + }, + { + "epoch": 2.7564617113501364, + "grad_norm": 0.15330089628696442, + "learning_rate": 1.196881881330798e-05, + "loss": 0.06, + "step": 8585 + }, + { + "epoch": 2.756782790174988, + "grad_norm": 0.45798158645629883, + "learning_rate": 1.19522646490838e-05, + "loss": 0.1952, + "step": 8586 + }, + { + "epoch": 2.7571038689998395, + "grad_norm": 0.8995585441589355, + "learning_rate": 1.1935721213209105e-05, + "loss": 0.6017, + "step": 8587 + }, + { + "epoch": 2.757424947824691, + "grad_norm": 0.8658804297447205, + "learning_rate": 1.191918850769964e-05, + "loss": 0.4147, + "step": 8588 + }, + { + "epoch": 2.7577460266495426, + "grad_norm": 0.7916271090507507, + "learning_rate": 1.1902666534569883e-05, + "loss": 0.3732, + "step": 8589 + }, + { + "epoch": 2.758067105474394, + "grad_norm": 1.541236162185669, + "learning_rate": 1.188615529583299e-05, + "loss": 0.4061, + "step": 8590 + }, + { + "epoch": 2.7583881842992453, + "grad_norm": 0.7521135210990906, + "learning_rate": 1.1869654793500784e-05, + "loss": 0.3043, + "step": 8591 + }, + { + "epoch": 2.758709263124097, + "grad_norm": 0.7983594536781311, + "learning_rate": 1.1853165029583823e-05, + "loss": 0.3089, + "step": 8592 + }, + { + "epoch": 2.7590303419489484, + "grad_norm": 0.6637342572212219, + "learning_rate": 1.1836686006091313e-05, + "loss": 0.2742, + "step": 8593 + }, + { + "epoch": 2.7593514207738, + "grad_norm": 0.8131726384162903, + "learning_rate": 1.1820217725031191e-05, + "loss": 0.3027, + "step": 8594 + }, + { + "epoch": 2.7596724995986515, + "grad_norm": 0.6360102891921997, + "learning_rate": 1.1803760188410073e-05, + "loss": 0.2646, + "step": 8595 + }, + { + "epoch": 2.759993578423503, + "grad_norm": 0.7749152183532715, + "learning_rate": 1.1787313398233235e-05, + "loss": 0.3449, + "step": 8596 + }, + { + "epoch": 2.7603146572483546, + "grad_norm": 0.9877920746803284, + "learning_rate": 1.1770877356504683e-05, + "loss": 0.3666, + "step": 8597 + }, + { + "epoch": 2.760635736073206, + "grad_norm": 0.9534462094306946, + "learning_rate": 1.1754452065227084e-05, + "loss": 0.3528, + "step": 8598 + }, + { + "epoch": 2.7609568148980577, + "grad_norm": 1.1205729246139526, + "learning_rate": 1.1738037526401857e-05, + "loss": 0.4691, + "step": 8599 + }, + { + "epoch": 2.7612778937229088, + "grad_norm": 0.7297707200050354, + "learning_rate": 1.1721633742028992e-05, + "loss": 0.2858, + "step": 8600 + }, + { + "epoch": 2.7615989725477603, + "grad_norm": 0.8408207297325134, + "learning_rate": 1.1705240714107302e-05, + "loss": 0.3679, + "step": 8601 + }, + { + "epoch": 2.761920051372612, + "grad_norm": 1.0112425088882446, + "learning_rate": 1.168885844463422e-05, + "loss": 0.3732, + "step": 8602 + }, + { + "epoch": 2.7622411301974634, + "grad_norm": 1.013698697090149, + "learning_rate": 1.1672486935605831e-05, + "loss": 0.3725, + "step": 8603 + }, + { + "epoch": 2.762562209022315, + "grad_norm": 1.6599117517471313, + "learning_rate": 1.1656126189017014e-05, + "loss": 0.3795, + "step": 8604 + }, + { + "epoch": 2.7628832878471665, + "grad_norm": 0.8416186571121216, + "learning_rate": 1.1639776206861196e-05, + "loss": 0.2958, + "step": 8605 + }, + { + "epoch": 2.763204366672018, + "grad_norm": 0.892895519733429, + "learning_rate": 1.1623436991130654e-05, + "loss": 0.3505, + "step": 8606 + }, + { + "epoch": 2.7635254454968696, + "grad_norm": 0.7338150143623352, + "learning_rate": 1.1607108543816248e-05, + "loss": 0.3204, + "step": 8607 + }, + { + "epoch": 2.763846524321721, + "grad_norm": 0.9701830744743347, + "learning_rate": 1.159079086690753e-05, + "loss": 0.3527, + "step": 8608 + }, + { + "epoch": 2.7641676031465723, + "grad_norm": 1.0651260614395142, + "learning_rate": 1.1574483962392767e-05, + "loss": 0.3822, + "step": 8609 + }, + { + "epoch": 2.764488681971424, + "grad_norm": 0.7772603034973145, + "learning_rate": 1.1558187832258926e-05, + "loss": 0.3063, + "step": 8610 + }, + { + "epoch": 2.7648097607962754, + "grad_norm": 0.5828967690467834, + "learning_rate": 1.1541902478491606e-05, + "loss": 0.2822, + "step": 8611 + }, + { + "epoch": 2.765130839621127, + "grad_norm": 0.8578951954841614, + "learning_rate": 1.1525627903075165e-05, + "loss": 0.2906, + "step": 8612 + }, + { + "epoch": 2.7654519184459785, + "grad_norm": 0.7797196507453918, + "learning_rate": 1.1509364107992583e-05, + "loss": 0.2938, + "step": 8613 + }, + { + "epoch": 2.76577299727083, + "grad_norm": 0.892012894153595, + "learning_rate": 1.1493111095225562e-05, + "loss": 0.3103, + "step": 8614 + }, + { + "epoch": 2.7660940760956816, + "grad_norm": 0.747068464756012, + "learning_rate": 1.1476868866754486e-05, + "loss": 0.2878, + "step": 8615 + }, + { + "epoch": 2.766415154920533, + "grad_norm": 0.9010385274887085, + "learning_rate": 1.1460637424558407e-05, + "loss": 0.3162, + "step": 8616 + }, + { + "epoch": 2.7667362337453847, + "grad_norm": 0.9162553548812866, + "learning_rate": 1.1444416770615118e-05, + "loss": 0.3087, + "step": 8617 + }, + { + "epoch": 2.767057312570236, + "grad_norm": 0.7222566604614258, + "learning_rate": 1.1428206906900995e-05, + "loss": 0.2889, + "step": 8618 + }, + { + "epoch": 2.7673783913950873, + "grad_norm": 0.5064111948013306, + "learning_rate": 1.1412007835391236e-05, + "loss": 0.2399, + "step": 8619 + }, + { + "epoch": 2.767699470219939, + "grad_norm": 0.6242873072624207, + "learning_rate": 1.1395819558059572e-05, + "loss": 0.2545, + "step": 8620 + }, + { + "epoch": 2.7680205490447904, + "grad_norm": 1.012289047241211, + "learning_rate": 1.1379642076878527e-05, + "loss": 0.3428, + "step": 8621 + }, + { + "epoch": 2.768341627869642, + "grad_norm": 0.7770035862922668, + "learning_rate": 1.1363475393819311e-05, + "loss": 0.3126, + "step": 8622 + }, + { + "epoch": 2.7686627066944935, + "grad_norm": 0.8441600799560547, + "learning_rate": 1.1347319510851717e-05, + "loss": 0.2974, + "step": 8623 + }, + { + "epoch": 2.768983785519345, + "grad_norm": 1.1390506029129028, + "learning_rate": 1.1331174429944347e-05, + "loss": 0.3347, + "step": 8624 + }, + { + "epoch": 2.7693048643441966, + "grad_norm": 0.9870839715003967, + "learning_rate": 1.1315040153064416e-05, + "loss": 0.4109, + "step": 8625 + }, + { + "epoch": 2.769625943169048, + "grad_norm": 0.9025225043296814, + "learning_rate": 1.129891668217783e-05, + "loss": 0.3347, + "step": 8626 + }, + { + "epoch": 2.7699470219938993, + "grad_norm": 0.6269119381904602, + "learning_rate": 1.1282804019249182e-05, + "loss": 0.2896, + "step": 8627 + }, + { + "epoch": 2.770268100818751, + "grad_norm": 0.30881941318511963, + "learning_rate": 1.1266702166241772e-05, + "loss": 0.2314, + "step": 8628 + }, + { + "epoch": 2.7705891796436024, + "grad_norm": 0.4127409756183624, + "learning_rate": 1.1250611125117527e-05, + "loss": 0.2654, + "step": 8629 + }, + { + "epoch": 2.770910258468454, + "grad_norm": 0.49982887506484985, + "learning_rate": 1.1234530897837126e-05, + "loss": 0.7763, + "step": 8630 + }, + { + "epoch": 2.7712313372933055, + "grad_norm": 0.3858592212200165, + "learning_rate": 1.1218461486359877e-05, + "loss": 0.3977, + "step": 8631 + }, + { + "epoch": 2.771552416118157, + "grad_norm": 0.5631637573242188, + "learning_rate": 1.1202402892643781e-05, + "loss": 0.3626, + "step": 8632 + }, + { + "epoch": 2.7718734949430086, + "grad_norm": 0.4585270881652832, + "learning_rate": 1.1186355118645554e-05, + "loss": 0.2111, + "step": 8633 + }, + { + "epoch": 2.77219457376786, + "grad_norm": 0.4266338646411896, + "learning_rate": 1.1170318166320548e-05, + "loss": 0.1454, + "step": 8634 + }, + { + "epoch": 2.7725156525927117, + "grad_norm": 0.3712063729763031, + "learning_rate": 1.1154292037622838e-05, + "loss": 0.1819, + "step": 8635 + }, + { + "epoch": 2.772836731417563, + "grad_norm": 0.7830849289894104, + "learning_rate": 1.1138276734505104e-05, + "loss": 0.4185, + "step": 8636 + }, + { + "epoch": 2.7731578102424144, + "grad_norm": 0.7869873046875, + "learning_rate": 1.1122272258918865e-05, + "loss": 0.3898, + "step": 8637 + }, + { + "epoch": 2.773478889067266, + "grad_norm": 0.9816368222236633, + "learning_rate": 1.1106278612814125e-05, + "loss": 0.3876, + "step": 8638 + }, + { + "epoch": 2.7737999678921175, + "grad_norm": 0.8416398167610168, + "learning_rate": 1.1090295798139672e-05, + "loss": 0.3797, + "step": 8639 + }, + { + "epoch": 2.774121046716969, + "grad_norm": 0.7030868530273438, + "learning_rate": 1.1074323816843024e-05, + "loss": 0.3055, + "step": 8640 + }, + { + "epoch": 2.7744421255418206, + "grad_norm": 0.6922499537467957, + "learning_rate": 1.1058362670870249e-05, + "loss": 0.2845, + "step": 8641 + }, + { + "epoch": 2.774763204366672, + "grad_norm": 0.6932337284088135, + "learning_rate": 1.1042412362166222e-05, + "loss": 0.3024, + "step": 8642 + }, + { + "epoch": 2.7750842831915237, + "grad_norm": 0.6935373544692993, + "learning_rate": 1.1026472892674378e-05, + "loss": 0.2872, + "step": 8643 + }, + { + "epoch": 2.775405362016375, + "grad_norm": 0.7587444186210632, + "learning_rate": 1.1010544264336942e-05, + "loss": 0.3455, + "step": 8644 + }, + { + "epoch": 2.7757264408412263, + "grad_norm": 0.6532779335975647, + "learning_rate": 1.099462647909475e-05, + "loss": 0.2364, + "step": 8645 + }, + { + "epoch": 2.776047519666078, + "grad_norm": 0.923891007900238, + "learning_rate": 1.0978719538887349e-05, + "loss": 0.3352, + "step": 8646 + }, + { + "epoch": 2.7763685984909294, + "grad_norm": 0.6212542057037354, + "learning_rate": 1.0962823445652959e-05, + "loss": 0.2283, + "step": 8647 + }, + { + "epoch": 2.776689677315781, + "grad_norm": 1.0696156024932861, + "learning_rate": 1.0946938201328416e-05, + "loss": 0.4078, + "step": 8648 + }, + { + "epoch": 2.7770107561406325, + "grad_norm": 0.7603042125701904, + "learning_rate": 1.093106380784934e-05, + "loss": 0.324, + "step": 8649 + }, + { + "epoch": 2.777331834965484, + "grad_norm": 0.9371538162231445, + "learning_rate": 1.0915200267149972e-05, + "loss": 0.392, + "step": 8650 + }, + { + "epoch": 2.7776529137903356, + "grad_norm": 0.7637982368469238, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.3077, + "step": 8651 + }, + { + "epoch": 2.777973992615187, + "grad_norm": 1.0060869455337524, + "learning_rate": 1.08835057518207e-05, + "loss": 0.3866, + "step": 8652 + }, + { + "epoch": 2.7782950714400387, + "grad_norm": 0.8908117413520813, + "learning_rate": 1.0867674781052684e-05, + "loss": 0.3209, + "step": 8653 + }, + { + "epoch": 2.77861615026489, + "grad_norm": 1.2514095306396484, + "learning_rate": 1.0851854670788108e-05, + "loss": 0.422, + "step": 8654 + }, + { + "epoch": 2.7789372290897414, + "grad_norm": 0.9697837233543396, + "learning_rate": 1.0836045422954666e-05, + "loss": 0.3297, + "step": 8655 + }, + { + "epoch": 2.779258307914593, + "grad_norm": 0.8684007525444031, + "learning_rate": 1.0820247039478604e-05, + "loss": 0.3689, + "step": 8656 + }, + { + "epoch": 2.7795793867394445, + "grad_norm": 1.1245943307876587, + "learning_rate": 1.0804459522284926e-05, + "loss": 0.3227, + "step": 8657 + }, + { + "epoch": 2.779900465564296, + "grad_norm": 0.9986475706100464, + "learning_rate": 1.0788682873297307e-05, + "loss": 0.3325, + "step": 8658 + }, + { + "epoch": 2.7802215443891476, + "grad_norm": 0.8092026114463806, + "learning_rate": 1.0772917094438051e-05, + "loss": 0.3339, + "step": 8659 + }, + { + "epoch": 2.780542623213999, + "grad_norm": 0.6285796165466309, + "learning_rate": 1.0757162187628222e-05, + "loss": 0.2289, + "step": 8660 + }, + { + "epoch": 2.7808637020388507, + "grad_norm": 0.81215900182724, + "learning_rate": 1.0741418154787442e-05, + "loss": 0.3342, + "step": 8661 + }, + { + "epoch": 2.7811847808637022, + "grad_norm": 0.7589025497436523, + "learning_rate": 1.0725684997834162e-05, + "loss": 0.3424, + "step": 8662 + }, + { + "epoch": 2.7815058596885534, + "grad_norm": 0.7941707968711853, + "learning_rate": 1.0709962718685318e-05, + "loss": 0.342, + "step": 8663 + }, + { + "epoch": 2.781826938513405, + "grad_norm": 1.3057998418807983, + "learning_rate": 1.0694251319256687e-05, + "loss": 0.3437, + "step": 8664 + }, + { + "epoch": 2.7821480173382565, + "grad_norm": 1.0559587478637695, + "learning_rate": 1.067855080146266e-05, + "loss": 0.3157, + "step": 8665 + }, + { + "epoch": 2.782469096163108, + "grad_norm": 1.1683728694915771, + "learning_rate": 1.0662861167216243e-05, + "loss": 0.3433, + "step": 8666 + }, + { + "epoch": 2.7827901749879596, + "grad_norm": 0.8493248820304871, + "learning_rate": 1.0647182418429225e-05, + "loss": 0.3193, + "step": 8667 + }, + { + "epoch": 2.783111253812811, + "grad_norm": 1.0568920373916626, + "learning_rate": 1.063151455701199e-05, + "loss": 0.3286, + "step": 8668 + }, + { + "epoch": 2.7834323326376627, + "grad_norm": NaN, + "learning_rate": 1.063151455701199e-05, + "loss": 0.3763, + "step": 8669 + }, + { + "epoch": 2.783753411462514, + "grad_norm": 0.562329888343811, + "learning_rate": 1.0615857584873623e-05, + "loss": 0.2469, + "step": 8670 + }, + { + "epoch": 2.7840744902873658, + "grad_norm": 1.2342668771743774, + "learning_rate": 1.0600211503921887e-05, + "loss": 0.4084, + "step": 8671 + }, + { + "epoch": 2.784395569112217, + "grad_norm": 0.6708762645721436, + "learning_rate": 1.0584576316063188e-05, + "loss": 0.2886, + "step": 8672 + }, + { + "epoch": 2.7847166479370684, + "grad_norm": 0.7978615164756775, + "learning_rate": 1.0568952023202638e-05, + "loss": 0.3324, + "step": 8673 + }, + { + "epoch": 2.78503772676192, + "grad_norm": 0.4819890260696411, + "learning_rate": 1.0553338627244025e-05, + "loss": 0.2395, + "step": 8674 + }, + { + "epoch": 2.7853588055867715, + "grad_norm": 0.7550230026245117, + "learning_rate": 1.053773613008977e-05, + "loss": 0.2846, + "step": 8675 + }, + { + "epoch": 2.785679884411623, + "grad_norm": 0.5349231958389282, + "learning_rate": 1.0522144533640998e-05, + "loss": 0.2853, + "step": 8676 + }, + { + "epoch": 2.7860009632364746, + "grad_norm": 1.028280258178711, + "learning_rate": 1.0506563839797501e-05, + "loss": 0.3143, + "step": 8677 + }, + { + "epoch": 2.786322042061326, + "grad_norm": 0.6815648674964905, + "learning_rate": 1.0490994050457748e-05, + "loss": 0.2822, + "step": 8678 + }, + { + "epoch": 2.7866431208861777, + "grad_norm": 0.6263010501861572, + "learning_rate": 1.0475435167518843e-05, + "loss": 0.3049, + "step": 8679 + }, + { + "epoch": 2.7869641997110293, + "grad_norm": 0.5349334478378296, + "learning_rate": 1.0459887192876594e-05, + "loss": 0.6824, + "step": 8680 + }, + { + "epoch": 2.7872852785358804, + "grad_norm": 0.4282764494419098, + "learning_rate": 1.0444350128425529e-05, + "loss": 0.4408, + "step": 8681 + }, + { + "epoch": 2.787606357360732, + "grad_norm": 0.4315713346004486, + "learning_rate": 1.042882397605871e-05, + "loss": 0.4115, + "step": 8682 + }, + { + "epoch": 2.7879274361855835, + "grad_norm": 0.4827350974082947, + "learning_rate": 1.0413308737668005e-05, + "loss": 0.1461, + "step": 8683 + }, + { + "epoch": 2.788248515010435, + "grad_norm": 0.28072378039360046, + "learning_rate": 1.0397804415143909e-05, + "loss": 0.1271, + "step": 8684 + }, + { + "epoch": 2.7885695938352866, + "grad_norm": 0.3307070732116699, + "learning_rate": 1.0382311010375512e-05, + "loss": 0.1209, + "step": 8685 + }, + { + "epoch": 2.788890672660138, + "grad_norm": 0.18453341722488403, + "learning_rate": 1.0366828525250726e-05, + "loss": 0.0659, + "step": 8686 + }, + { + "epoch": 2.7892117514849897, + "grad_norm": 0.5584306716918945, + "learning_rate": 1.0351356961655945e-05, + "loss": 0.339, + "step": 8687 + }, + { + "epoch": 2.7895328303098412, + "grad_norm": 0.9177886247634888, + "learning_rate": 1.0335896321476413e-05, + "loss": 0.4866, + "step": 8688 + }, + { + "epoch": 2.7898539091346928, + "grad_norm": 1.1755162477493286, + "learning_rate": 1.0320446606595934e-05, + "loss": 0.5634, + "step": 8689 + }, + { + "epoch": 2.790174987959544, + "grad_norm": 0.9318304657936096, + "learning_rate": 1.0305007818897006e-05, + "loss": 0.3891, + "step": 8690 + }, + { + "epoch": 2.7904960667843954, + "grad_norm": 0.7948139309883118, + "learning_rate": 1.028957996026081e-05, + "loss": 0.3518, + "step": 8691 + }, + { + "epoch": 2.790817145609247, + "grad_norm": 0.9322606921195984, + "learning_rate": 1.0274163032567163e-05, + "loss": 0.3725, + "step": 8692 + }, + { + "epoch": 2.7911382244340985, + "grad_norm": 0.8909361958503723, + "learning_rate": 1.0258757037694589e-05, + "loss": 0.2977, + "step": 8693 + }, + { + "epoch": 2.79145930325895, + "grad_norm": 0.964710533618927, + "learning_rate": 1.0243361977520249e-05, + "loss": 0.4172, + "step": 8694 + }, + { + "epoch": 2.7917803820838016, + "grad_norm": 0.9714322686195374, + "learning_rate": 1.022797785392e-05, + "loss": 0.3906, + "step": 8695 + }, + { + "epoch": 2.792101460908653, + "grad_norm": 0.7900899648666382, + "learning_rate": 1.0212604668768343e-05, + "loss": 0.2676, + "step": 8696 + }, + { + "epoch": 2.7924225397335047, + "grad_norm": 0.7134426236152649, + "learning_rate": 1.0197242423938446e-05, + "loss": 0.2615, + "step": 8697 + }, + { + "epoch": 2.7927436185583563, + "grad_norm": 0.9049974679946899, + "learning_rate": 1.0181891121302145e-05, + "loss": 0.2837, + "step": 8698 + }, + { + "epoch": 2.7930646973832074, + "grad_norm": 0.9333779215812683, + "learning_rate": 1.0166550762729998e-05, + "loss": 0.2789, + "step": 8699 + }, + { + "epoch": 2.793385776208059, + "grad_norm": 0.8934162855148315, + "learning_rate": 1.0151221350091134e-05, + "loss": 0.3497, + "step": 8700 + }, + { + "epoch": 2.7937068550329105, + "grad_norm": 0.6106538772583008, + "learning_rate": 1.0135902885253401e-05, + "loss": 0.2565, + "step": 8701 + }, + { + "epoch": 2.794027933857762, + "grad_norm": 0.990350067615509, + "learning_rate": 1.0120595370083318e-05, + "loss": 0.4381, + "step": 8702 + }, + { + "epoch": 2.7943490126826136, + "grad_norm": 0.8355160355567932, + "learning_rate": 1.0105298806446028e-05, + "loss": 0.3453, + "step": 8703 + }, + { + "epoch": 2.794670091507465, + "grad_norm": 0.881846010684967, + "learning_rate": 1.009001319620545e-05, + "loss": 0.2857, + "step": 8704 + }, + { + "epoch": 2.7949911703323167, + "grad_norm": 0.8830981254577637, + "learning_rate": 1.0074738541223993e-05, + "loss": 0.327, + "step": 8705 + }, + { + "epoch": 2.7953122491571682, + "grad_norm": 0.9528623819351196, + "learning_rate": 1.0059474843362892e-05, + "loss": 0.3359, + "step": 8706 + }, + { + "epoch": 2.79563332798202, + "grad_norm": 0.9913058280944824, + "learning_rate": 1.0044222104481971e-05, + "loss": 0.3238, + "step": 8707 + }, + { + "epoch": 2.795954406806871, + "grad_norm": 1.4466516971588135, + "learning_rate": 1.0028980326439707e-05, + "loss": 0.5732, + "step": 8708 + }, + { + "epoch": 2.7962754856317225, + "grad_norm": 0.9511708617210388, + "learning_rate": 1.0013749511093307e-05, + "loss": 0.3061, + "step": 8709 + }, + { + "epoch": 2.796596564456574, + "grad_norm": 0.5760300755500793, + "learning_rate": 9.998529660298539e-06, + "loss": 0.2568, + "step": 8710 + }, + { + "epoch": 2.7969176432814256, + "grad_norm": 0.8406627774238586, + "learning_rate": 9.983320775909933e-06, + "loss": 0.3046, + "step": 8711 + }, + { + "epoch": 2.797238722106277, + "grad_norm": 0.9190598726272583, + "learning_rate": 9.968122859780648e-06, + "loss": 0.3231, + "step": 8712 + }, + { + "epoch": 2.7975598009311287, + "grad_norm": 0.6795236468315125, + "learning_rate": 9.952935913762506e-06, + "loss": 0.247, + "step": 8713 + }, + { + "epoch": 2.79788087975598, + "grad_norm": 0.6721789240837097, + "learning_rate": 9.937759939705971e-06, + "loss": 0.2513, + "step": 8714 + }, + { + "epoch": 2.7982019585808313, + "grad_norm": 1.041229009628296, + "learning_rate": 9.922594939460194e-06, + "loss": 0.3641, + "step": 8715 + }, + { + "epoch": 2.7985230374056833, + "grad_norm": 1.105242371559143, + "learning_rate": 9.907440914873e-06, + "loss": 0.3677, + "step": 8716 + }, + { + "epoch": 2.7988441162305344, + "grad_norm": 0.7640735507011414, + "learning_rate": 9.892297867790845e-06, + "loss": 0.2991, + "step": 8717 + }, + { + "epoch": 2.799165195055386, + "grad_norm": 0.6073140501976013, + "learning_rate": 9.877165800058874e-06, + "loss": 0.2511, + "step": 8718 + }, + { + "epoch": 2.7994862738802375, + "grad_norm": 0.9953632950782776, + "learning_rate": 9.86204471352088e-06, + "loss": 0.365, + "step": 8719 + }, + { + "epoch": 2.799807352705089, + "grad_norm": 0.6479620337486267, + "learning_rate": 9.84693461001932e-06, + "loss": 0.2704, + "step": 8720 + }, + { + "epoch": 2.8001284315299406, + "grad_norm": 0.9347096681594849, + "learning_rate": 9.831835491395292e-06, + "loss": 0.3145, + "step": 8721 + }, + { + "epoch": 2.800449510354792, + "grad_norm": 0.5598117113113403, + "learning_rate": 9.816747359488632e-06, + "loss": 0.2696, + "step": 8722 + }, + { + "epoch": 2.8007705891796437, + "grad_norm": 0.6550912857055664, + "learning_rate": 9.801670216137727e-06, + "loss": 0.2139, + "step": 8723 + }, + { + "epoch": 2.801091668004495, + "grad_norm": 0.6343926787376404, + "learning_rate": 9.786604063179728e-06, + "loss": 0.2569, + "step": 8724 + }, + { + "epoch": 2.801412746829347, + "grad_norm": 1.2895246744155884, + "learning_rate": 9.771548902450357e-06, + "loss": 0.3091, + "step": 8725 + }, + { + "epoch": 2.801733825654198, + "grad_norm": 0.6956881880760193, + "learning_rate": 9.756504735784067e-06, + "loss": 0.3224, + "step": 8726 + }, + { + "epoch": 2.8020549044790495, + "grad_norm": 0.8281194567680359, + "learning_rate": 9.74147156501396e-06, + "loss": 0.3129, + "step": 8727 + }, + { + "epoch": 2.802375983303901, + "grad_norm": 0.6391897201538086, + "learning_rate": 9.726449391971714e-06, + "loss": 0.3007, + "step": 8728 + }, + { + "epoch": 2.8026970621287526, + "grad_norm": 0.5030215978622437, + "learning_rate": 9.711438218487834e-06, + "loss": 0.2709, + "step": 8729 + }, + { + "epoch": 2.803018140953604, + "grad_norm": 0.5472492575645447, + "learning_rate": 9.696438046391288e-06, + "loss": 0.7123, + "step": 8730 + }, + { + "epoch": 2.8033392197784557, + "grad_norm": 0.44840529561042786, + "learning_rate": 9.681448877509858e-06, + "loss": 0.4614, + "step": 8731 + }, + { + "epoch": 2.8036602986033072, + "grad_norm": 0.39549553394317627, + "learning_rate": 9.666470713669918e-06, + "loss": 0.3304, + "step": 8732 + }, + { + "epoch": 2.8039813774281583, + "grad_norm": 0.3779802918434143, + "learning_rate": 9.651503556696516e-06, + "loss": 0.1788, + "step": 8733 + }, + { + "epoch": 2.8043024562530103, + "grad_norm": 0.1729205846786499, + "learning_rate": 9.636547408413355e-06, + "loss": 0.0618, + "step": 8734 + }, + { + "epoch": 2.8046235350778614, + "grad_norm": 0.44540268182754517, + "learning_rate": 9.621602270642781e-06, + "loss": 0.1766, + "step": 8735 + }, + { + "epoch": 2.804944613902713, + "grad_norm": 0.7657910585403442, + "learning_rate": 9.606668145205833e-06, + "loss": 0.4684, + "step": 8736 + }, + { + "epoch": 2.8052656927275645, + "grad_norm": 0.8212659358978271, + "learning_rate": 9.591745033922173e-06, + "loss": 0.384, + "step": 8737 + }, + { + "epoch": 2.805586771552416, + "grad_norm": 0.8744103908538818, + "learning_rate": 9.576832938610137e-06, + "loss": 0.338, + "step": 8738 + }, + { + "epoch": 2.8059078503772676, + "grad_norm": 0.7278825640678406, + "learning_rate": 9.561931861086737e-06, + "loss": 0.2896, + "step": 8739 + }, + { + "epoch": 2.806228929202119, + "grad_norm": 0.5598768591880798, + "learning_rate": 9.5470418031676e-06, + "loss": 0.2411, + "step": 8740 + }, + { + "epoch": 2.8065500080269707, + "grad_norm": 0.7483347058296204, + "learning_rate": 9.532162766667042e-06, + "loss": 0.2827, + "step": 8741 + }, + { + "epoch": 2.806871086851822, + "grad_norm": 0.7367462515830994, + "learning_rate": 9.517294753398064e-06, + "loss": 0.3007, + "step": 8742 + }, + { + "epoch": 2.807192165676674, + "grad_norm": 0.6921778917312622, + "learning_rate": 9.502437765172212e-06, + "loss": 0.2562, + "step": 8743 + }, + { + "epoch": 2.807513244501525, + "grad_norm": 0.8599756956100464, + "learning_rate": 9.487591803799856e-06, + "loss": 0.3628, + "step": 8744 + }, + { + "epoch": 2.8078343233263765, + "grad_norm": 0.8034688234329224, + "learning_rate": 9.47275687108986e-06, + "loss": 0.3219, + "step": 8745 + }, + { + "epoch": 2.808155402151228, + "grad_norm": 1.0342375040054321, + "learning_rate": 9.457932968849825e-06, + "loss": 0.3947, + "step": 8746 + }, + { + "epoch": 2.8084764809760796, + "grad_norm": 1.0900487899780273, + "learning_rate": 9.443120098886061e-06, + "loss": 0.357, + "step": 8747 + }, + { + "epoch": 2.808797559800931, + "grad_norm": 0.7890565991401672, + "learning_rate": 9.428318263003378e-06, + "loss": 0.2979, + "step": 8748 + }, + { + "epoch": 2.8091186386257827, + "grad_norm": 1.0363215208053589, + "learning_rate": 9.4135274630054e-06, + "loss": 0.4129, + "step": 8749 + }, + { + "epoch": 2.8094397174506343, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.398747700694322e-06, + "loss": 0.2943, + "step": 8750 + }, + { + "epoch": 2.8097607962754854, + "grad_norm": 1.0878396034240723, + "learning_rate": 9.383978977871021e-06, + "loss": 0.3559, + "step": 8751 + }, + { + "epoch": 2.8100818751003374, + "grad_norm": 0.8391216397285461, + "learning_rate": 9.369221296335006e-06, + "loss": 0.3069, + "step": 8752 + }, + { + "epoch": 2.8104029539251885, + "grad_norm": 0.6143542528152466, + "learning_rate": 9.354474657884472e-06, + "loss": 0.225, + "step": 8753 + }, + { + "epoch": 2.81072403275004, + "grad_norm": 0.9501350522041321, + "learning_rate": 9.339739064316233e-06, + "loss": 0.3666, + "step": 8754 + }, + { + "epoch": 2.8110451115748916, + "grad_norm": 1.024901032447815, + "learning_rate": 9.32501451742579e-06, + "loss": 0.3373, + "step": 8755 + }, + { + "epoch": 2.811366190399743, + "grad_norm": 1.0849446058273315, + "learning_rate": 9.310301019007285e-06, + "loss": 0.3409, + "step": 8756 + }, + { + "epoch": 2.8116872692245947, + "grad_norm": 0.8314627408981323, + "learning_rate": 9.295598570853514e-06, + "loss": 0.3095, + "step": 8757 + }, + { + "epoch": 2.812008348049446, + "grad_norm": 0.5649183392524719, + "learning_rate": 9.280907174755915e-06, + "loss": 0.2587, + "step": 8758 + }, + { + "epoch": 2.8123294268742978, + "grad_norm": 1.2007514238357544, + "learning_rate": 9.266226832504598e-06, + "loss": 0.4184, + "step": 8759 + }, + { + "epoch": 2.812650505699149, + "grad_norm": 0.562286376953125, + "learning_rate": 9.251557545888312e-06, + "loss": 0.2122, + "step": 8760 + }, + { + "epoch": 2.812971584524001, + "grad_norm": 1.045385479927063, + "learning_rate": 9.236899316694459e-06, + "loss": 0.3434, + "step": 8761 + }, + { + "epoch": 2.813292663348852, + "grad_norm": 1.0073976516723633, + "learning_rate": 9.222252146709142e-06, + "loss": 0.334, + "step": 8762 + }, + { + "epoch": 2.8136137421737035, + "grad_norm": 0.5607120990753174, + "learning_rate": 9.207616037717025e-06, + "loss": 0.2412, + "step": 8763 + }, + { + "epoch": 2.813934820998555, + "grad_norm": 0.7444592714309692, + "learning_rate": 9.192990991501482e-06, + "loss": 0.2799, + "step": 8764 + }, + { + "epoch": 2.8142558998234066, + "grad_norm": 0.9598581194877625, + "learning_rate": 9.178377009844563e-06, + "loss": 0.3674, + "step": 8765 + }, + { + "epoch": 2.814576978648258, + "grad_norm": 0.42026737332344055, + "learning_rate": 9.163774094526889e-06, + "loss": 0.2069, + "step": 8766 + }, + { + "epoch": 2.8148980574731097, + "grad_norm": 0.8369678258895874, + "learning_rate": 9.149182247327837e-06, + "loss": 0.3303, + "step": 8767 + }, + { + "epoch": 2.8152191362979613, + "grad_norm": 0.8033504486083984, + "learning_rate": 9.134601470025306e-06, + "loss": 0.2976, + "step": 8768 + }, + { + "epoch": 2.8155402151228124, + "grad_norm": 0.7782095670700073, + "learning_rate": 9.120031764395987e-06, + "loss": 0.2766, + "step": 8769 + }, + { + "epoch": 2.8158612939476644, + "grad_norm": 0.6862573623657227, + "learning_rate": 9.105473132215125e-06, + "loss": 0.2771, + "step": 8770 + }, + { + "epoch": 2.8161823727725155, + "grad_norm": 1.1740379333496094, + "learning_rate": 9.09092557525666e-06, + "loss": 0.3947, + "step": 8771 + }, + { + "epoch": 2.816503451597367, + "grad_norm": 1.0018607378005981, + "learning_rate": 9.076389095293148e-06, + "loss": 0.4111, + "step": 8772 + }, + { + "epoch": 2.8168245304222186, + "grad_norm": 0.7517522573471069, + "learning_rate": 9.061863694095828e-06, + "loss": 0.2955, + "step": 8773 + }, + { + "epoch": 2.81714560924707, + "grad_norm": 1.8710321187973022, + "learning_rate": 9.047349373434566e-06, + "loss": 0.2693, + "step": 8774 + }, + { + "epoch": 2.8174666880719217, + "grad_norm": 0.8470088839530945, + "learning_rate": 9.0328461350779e-06, + "loss": 0.3039, + "step": 8775 + }, + { + "epoch": 2.8177877668967732, + "grad_norm": 0.5964589715003967, + "learning_rate": 9.018353980792993e-06, + "loss": 0.2547, + "step": 8776 + }, + { + "epoch": 2.818108845721625, + "grad_norm": 0.8444490432739258, + "learning_rate": 9.00387291234569e-06, + "loss": 0.3071, + "step": 8777 + }, + { + "epoch": 2.818429924546476, + "grad_norm": 0.41703271865844727, + "learning_rate": 8.989402931500434e-06, + "loss": 0.2554, + "step": 8778 + }, + { + "epoch": 2.818751003371328, + "grad_norm": 0.3994010090827942, + "learning_rate": 8.974944040020362e-06, + "loss": 0.2379, + "step": 8779 + }, + { + "epoch": 2.819072082196179, + "grad_norm": 0.443518728017807, + "learning_rate": 8.960496239667282e-06, + "loss": 0.5404, + "step": 8780 + }, + { + "epoch": 2.8193931610210305, + "grad_norm": 0.3758715093135834, + "learning_rate": 8.946059532201567e-06, + "loss": 0.3376, + "step": 8781 + }, + { + "epoch": 2.819714239845882, + "grad_norm": 0.3072349727153778, + "learning_rate": 8.931633919382298e-06, + "loss": 0.1223, + "step": 8782 + }, + { + "epoch": 2.8200353186707336, + "grad_norm": 0.3410075306892395, + "learning_rate": 8.917219402967202e-06, + "loss": 0.11, + "step": 8783 + }, + { + "epoch": 2.820356397495585, + "grad_norm": 0.1781407743692398, + "learning_rate": 8.90281598471262e-06, + "loss": 0.0609, + "step": 8784 + }, + { + "epoch": 2.8206774763204367, + "grad_norm": 0.3771248161792755, + "learning_rate": 8.888423666373614e-06, + "loss": 0.1744, + "step": 8785 + }, + { + "epoch": 2.8209985551452883, + "grad_norm": 0.8106180429458618, + "learning_rate": 8.87404244970378e-06, + "loss": 0.4395, + "step": 8786 + }, + { + "epoch": 2.8213196339701394, + "grad_norm": 0.7213160991668701, + "learning_rate": 8.85967233645547e-06, + "loss": 0.3335, + "step": 8787 + }, + { + "epoch": 2.8216407127949914, + "grad_norm": 0.8492469787597656, + "learning_rate": 8.845313328379634e-06, + "loss": 0.401, + "step": 8788 + }, + { + "epoch": 2.8219617916198425, + "grad_norm": 1.2384346723556519, + "learning_rate": 8.830965427225868e-06, + "loss": 0.3545, + "step": 8789 + }, + { + "epoch": 2.822282870444694, + "grad_norm": 0.8457659482955933, + "learning_rate": 8.816628634742441e-06, + "loss": 0.3346, + "step": 8790 + }, + { + "epoch": 2.8226039492695456, + "grad_norm": 0.9293232560157776, + "learning_rate": 8.80230295267619e-06, + "loss": 0.3992, + "step": 8791 + }, + { + "epoch": 2.822925028094397, + "grad_norm": 0.7842317819595337, + "learning_rate": 8.787988382772705e-06, + "loss": 0.355, + "step": 8792 + }, + { + "epoch": 2.8232461069192487, + "grad_norm": 0.9196096062660217, + "learning_rate": 8.77368492677616e-06, + "loss": 0.3592, + "step": 8793 + }, + { + "epoch": 2.8235671857441003, + "grad_norm": 0.7270289659500122, + "learning_rate": 8.759392586429393e-06, + "loss": 0.3004, + "step": 8794 + }, + { + "epoch": 2.823888264568952, + "grad_norm": 0.7461056709289551, + "learning_rate": 8.745111363473868e-06, + "loss": 0.2673, + "step": 8795 + }, + { + "epoch": 2.824209343393803, + "grad_norm": 0.9872536659240723, + "learning_rate": 8.730841259649725e-06, + "loss": 0.3072, + "step": 8796 + }, + { + "epoch": 2.824530422218655, + "grad_norm": 0.9411637783050537, + "learning_rate": 8.716582276695728e-06, + "loss": 0.3584, + "step": 8797 + }, + { + "epoch": 2.824851501043506, + "grad_norm": 1.1243722438812256, + "learning_rate": 8.702334416349278e-06, + "loss": 0.4163, + "step": 8798 + }, + { + "epoch": 2.8251725798683576, + "grad_norm": 0.9346780776977539, + "learning_rate": 8.688097680346453e-06, + "loss": 0.3681, + "step": 8799 + }, + { + "epoch": 2.825493658693209, + "grad_norm": 1.0615766048431396, + "learning_rate": 8.67387207042194e-06, + "loss": 0.4532, + "step": 8800 + }, + { + "epoch": 2.8258147375180607, + "grad_norm": 0.979707658290863, + "learning_rate": 8.6596575883091e-06, + "loss": 0.3883, + "step": 8801 + }, + { + "epoch": 2.826135816342912, + "grad_norm": 0.7535959482192993, + "learning_rate": 8.645454235739903e-06, + "loss": 0.2761, + "step": 8802 + }, + { + "epoch": 2.8264568951677638, + "grad_norm": 0.7734266519546509, + "learning_rate": 8.63126201444503e-06, + "loss": 0.2963, + "step": 8803 + }, + { + "epoch": 2.8267779739926153, + "grad_norm": 0.7798780202865601, + "learning_rate": 8.617080926153698e-06, + "loss": 0.3009, + "step": 8804 + }, + { + "epoch": 2.8270990528174664, + "grad_norm": 0.9741690158843994, + "learning_rate": 8.602910972593892e-06, + "loss": 0.3527, + "step": 8805 + }, + { + "epoch": 2.8274201316423184, + "grad_norm": 0.9816387891769409, + "learning_rate": 8.588752155492119e-06, + "loss": 0.3465, + "step": 8806 + }, + { + "epoch": 2.8277412104671695, + "grad_norm": 0.7801307439804077, + "learning_rate": 8.574604476573621e-06, + "loss": 0.292, + "step": 8807 + }, + { + "epoch": 2.828062289292021, + "grad_norm": 0.5235636234283447, + "learning_rate": 8.560467937562277e-06, + "loss": 0.2318, + "step": 8808 + }, + { + "epoch": 2.8283833681168726, + "grad_norm": 0.6542462110519409, + "learning_rate": 8.546342540180508e-06, + "loss": 0.2817, + "step": 8809 + }, + { + "epoch": 2.828704446941724, + "grad_norm": 0.6339287161827087, + "learning_rate": 8.532228286149501e-06, + "loss": 0.2691, + "step": 8810 + }, + { + "epoch": 2.8290255257665757, + "grad_norm": 0.9842689633369446, + "learning_rate": 8.51812517718904e-06, + "loss": 0.3822, + "step": 8811 + }, + { + "epoch": 2.8293466045914273, + "grad_norm": 0.6883869767189026, + "learning_rate": 8.504033215017527e-06, + "loss": 0.3071, + "step": 8812 + }, + { + "epoch": 2.829667683416279, + "grad_norm": 1.0689724683761597, + "learning_rate": 8.489952401352019e-06, + "loss": 0.4108, + "step": 8813 + }, + { + "epoch": 2.82998876224113, + "grad_norm": 0.9899790287017822, + "learning_rate": 8.475882737908248e-06, + "loss": 0.3652, + "step": 8814 + }, + { + "epoch": 2.830309841065982, + "grad_norm": 0.8097178936004639, + "learning_rate": 8.46182422640054e-06, + "loss": 0.3144, + "step": 8815 + }, + { + "epoch": 2.830630919890833, + "grad_norm": 0.8424767255783081, + "learning_rate": 8.447776868541879e-06, + "loss": 0.2967, + "step": 8816 + }, + { + "epoch": 2.8309519987156846, + "grad_norm": 0.7293074727058411, + "learning_rate": 8.433740666043898e-06, + "loss": 0.2959, + "step": 8817 + }, + { + "epoch": 2.831273077540536, + "grad_norm": 0.6922279000282288, + "learning_rate": 8.419715620616874e-06, + "loss": 0.2773, + "step": 8818 + }, + { + "epoch": 2.8315941563653877, + "grad_norm": 0.5553762316703796, + "learning_rate": 8.405701733969706e-06, + "loss": 0.2382, + "step": 8819 + }, + { + "epoch": 2.8319152351902392, + "grad_norm": 0.7001710534095764, + "learning_rate": 8.39169900780995e-06, + "loss": 0.2947, + "step": 8820 + }, + { + "epoch": 2.832236314015091, + "grad_norm": 0.524242639541626, + "learning_rate": 8.377707443843786e-06, + "loss": 0.2546, + "step": 8821 + }, + { + "epoch": 2.8325573928399423, + "grad_norm": 0.7304323315620422, + "learning_rate": 8.363727043776038e-06, + "loss": 0.311, + "step": 8822 + }, + { + "epoch": 2.8328784716647935, + "grad_norm": 0.5915831923484802, + "learning_rate": 8.34975780931021e-06, + "loss": 0.2488, + "step": 8823 + }, + { + "epoch": 2.8331995504896454, + "grad_norm": 0.8694709539413452, + "learning_rate": 8.335799742148387e-06, + "loss": 0.368, + "step": 8824 + }, + { + "epoch": 2.8335206293144966, + "grad_norm": 0.6131924986839294, + "learning_rate": 8.321852843991295e-06, + "loss": 0.2581, + "step": 8825 + }, + { + "epoch": 2.833841708139348, + "grad_norm": 0.42285412549972534, + "learning_rate": 8.307917116538378e-06, + "loss": 0.2609, + "step": 8826 + }, + { + "epoch": 2.8341627869641997, + "grad_norm": 0.6987262964248657, + "learning_rate": 8.293992561487596e-06, + "loss": 0.309, + "step": 8827 + }, + { + "epoch": 2.834483865789051, + "grad_norm": 0.8132457137107849, + "learning_rate": 8.280079180535672e-06, + "loss": 0.2866, + "step": 8828 + }, + { + "epoch": 2.8348049446139028, + "grad_norm": 0.3915092647075653, + "learning_rate": 8.26617697537786e-06, + "loss": 0.2629, + "step": 8829 + }, + { + "epoch": 2.8351260234387543, + "grad_norm": 0.4772132337093353, + "learning_rate": 8.252285947708139e-06, + "loss": 0.5484, + "step": 8830 + }, + { + "epoch": 2.835447102263606, + "grad_norm": 0.39662283658981323, + "learning_rate": 8.238406099219077e-06, + "loss": 0.3962, + "step": 8831 + }, + { + "epoch": 2.835768181088457, + "grad_norm": 0.4367140233516693, + "learning_rate": 8.224537431601886e-06, + "loss": 0.319, + "step": 8832 + }, + { + "epoch": 2.836089259913309, + "grad_norm": 0.6027937531471252, + "learning_rate": 8.21067994654644e-06, + "loss": 0.2027, + "step": 8833 + }, + { + "epoch": 2.83641033873816, + "grad_norm": 0.3798494040966034, + "learning_rate": 8.196833645741186e-06, + "loss": 0.155, + "step": 8834 + }, + { + "epoch": 2.8367314175630116, + "grad_norm": 0.2580123841762543, + "learning_rate": 8.182998530873298e-06, + "loss": 0.1179, + "step": 8835 + }, + { + "epoch": 2.837052496387863, + "grad_norm": 0.6310969591140747, + "learning_rate": 8.169174603628538e-06, + "loss": 0.3001, + "step": 8836 + }, + { + "epoch": 2.8373735752127147, + "grad_norm": 0.7293077707290649, + "learning_rate": 8.15536186569129e-06, + "loss": 0.3546, + "step": 8837 + }, + { + "epoch": 2.8376946540375663, + "grad_norm": 0.7130358219146729, + "learning_rate": 8.1415603187446e-06, + "loss": 0.2964, + "step": 8838 + }, + { + "epoch": 2.838015732862418, + "grad_norm": 0.7875835299491882, + "learning_rate": 8.127769964470156e-06, + "loss": 0.339, + "step": 8839 + }, + { + "epoch": 2.8383368116872694, + "grad_norm": 0.784997820854187, + "learning_rate": 8.113990804548244e-06, + "loss": 0.3349, + "step": 8840 + }, + { + "epoch": 2.8386578905121205, + "grad_norm": 0.6344702839851379, + "learning_rate": 8.100222840657878e-06, + "loss": 0.2666, + "step": 8841 + }, + { + "epoch": 2.8389789693369725, + "grad_norm": 0.984622061252594, + "learning_rate": 8.086466074476563e-06, + "loss": 0.432, + "step": 8842 + }, + { + "epoch": 2.8393000481618236, + "grad_norm": 0.8080422282218933, + "learning_rate": 8.072720507680565e-06, + "loss": 0.3034, + "step": 8843 + }, + { + "epoch": 2.839621126986675, + "grad_norm": 0.9881112575531006, + "learning_rate": 8.058986141944724e-06, + "loss": 0.3383, + "step": 8844 + }, + { + "epoch": 2.8399422058115267, + "grad_norm": 0.5864750742912292, + "learning_rate": 8.045262978942513e-06, + "loss": 0.2296, + "step": 8845 + }, + { + "epoch": 2.8402632846363782, + "grad_norm": 0.9349332451820374, + "learning_rate": 8.031551020346128e-06, + "loss": 0.3404, + "step": 8846 + }, + { + "epoch": 2.8405843634612298, + "grad_norm": 1.3247158527374268, + "learning_rate": 8.017850267826232e-06, + "loss": 0.4181, + "step": 8847 + }, + { + "epoch": 2.8409054422860813, + "grad_norm": 0.8365796804428101, + "learning_rate": 8.004160723052312e-06, + "loss": 0.2536, + "step": 8848 + }, + { + "epoch": 2.841226521110933, + "grad_norm": 0.7529785633087158, + "learning_rate": 7.990482387692311e-06, + "loss": 0.3008, + "step": 8849 + }, + { + "epoch": 2.841547599935784, + "grad_norm": 1.1152650117874146, + "learning_rate": 7.976815263412963e-06, + "loss": 0.4836, + "step": 8850 + }, + { + "epoch": 2.841868678760636, + "grad_norm": 1.6716127395629883, + "learning_rate": 7.963159351879556e-06, + "loss": 0.4522, + "step": 8851 + }, + { + "epoch": 2.842189757585487, + "grad_norm": 0.8794559836387634, + "learning_rate": 7.949514654755962e-06, + "loss": 0.3452, + "step": 8852 + }, + { + "epoch": 2.8425108364103386, + "grad_norm": 1.2077842950820923, + "learning_rate": 7.935881173704819e-06, + "loss": 0.3147, + "step": 8853 + }, + { + "epoch": 2.84283191523519, + "grad_norm": 0.605907678604126, + "learning_rate": 7.922258910387282e-06, + "loss": 0.2739, + "step": 8854 + }, + { + "epoch": 2.8431529940600417, + "grad_norm": 0.7989472150802612, + "learning_rate": 7.908647866463203e-06, + "loss": 0.2794, + "step": 8855 + }, + { + "epoch": 2.8434740728848933, + "grad_norm": 1.0483945608139038, + "learning_rate": 7.895048043591036e-06, + "loss": 0.3598, + "step": 8856 + }, + { + "epoch": 2.843795151709745, + "grad_norm": 0.4776444137096405, + "learning_rate": 7.881459443427886e-06, + "loss": 0.2168, + "step": 8857 + }, + { + "epoch": 2.8441162305345964, + "grad_norm": 0.8608068227767944, + "learning_rate": 7.867882067629472e-06, + "loss": 0.3373, + "step": 8858 + }, + { + "epoch": 2.8444373093594475, + "grad_norm": 0.8778037428855896, + "learning_rate": 7.854315917850163e-06, + "loss": 0.2954, + "step": 8859 + }, + { + "epoch": 2.8447583881842995, + "grad_norm": 1.0180518627166748, + "learning_rate": 7.840760995742946e-06, + "loss": 0.3334, + "step": 8860 + }, + { + "epoch": 2.8450794670091506, + "grad_norm": 0.9998841285705566, + "learning_rate": 7.827217302959467e-06, + "loss": 0.3415, + "step": 8861 + }, + { + "epoch": 2.845400545834002, + "grad_norm": 0.7613440155982971, + "learning_rate": 7.81368484114996e-06, + "loss": 0.2876, + "step": 8862 + }, + { + "epoch": 2.8457216246588537, + "grad_norm": 0.8815329670906067, + "learning_rate": 7.800163611963318e-06, + "loss": 0.3486, + "step": 8863 + }, + { + "epoch": 2.8460427034837052, + "grad_norm": 0.6765179634094238, + "learning_rate": 7.786653617047079e-06, + "loss": 0.2835, + "step": 8864 + }, + { + "epoch": 2.846363782308557, + "grad_norm": 0.7167448997497559, + "learning_rate": 7.77315485804736e-06, + "loss": 0.2964, + "step": 8865 + }, + { + "epoch": 2.8466848611334084, + "grad_norm": 0.879822850227356, + "learning_rate": 7.75966733660901e-06, + "loss": 0.3248, + "step": 8866 + }, + { + "epoch": 2.84700593995826, + "grad_norm": 0.6780831813812256, + "learning_rate": 7.746191054375362e-06, + "loss": 0.2852, + "step": 8867 + }, + { + "epoch": 2.847327018783111, + "grad_norm": 0.6202150583267212, + "learning_rate": 7.73272601298851e-06, + "loss": 0.273, + "step": 8868 + }, + { + "epoch": 2.847648097607963, + "grad_norm": 0.5386216640472412, + "learning_rate": 7.719272214089145e-06, + "loss": 0.2531, + "step": 8869 + }, + { + "epoch": 2.847969176432814, + "grad_norm": 0.7454226016998291, + "learning_rate": 7.7058296593165e-06, + "loss": 0.2987, + "step": 8870 + }, + { + "epoch": 2.8482902552576657, + "grad_norm": 0.7010564208030701, + "learning_rate": 7.692398350308594e-06, + "loss": 0.2399, + "step": 8871 + }, + { + "epoch": 2.848611334082517, + "grad_norm": 1.0159744024276733, + "learning_rate": 7.67897828870191e-06, + "loss": 0.3846, + "step": 8872 + }, + { + "epoch": 2.8489324129073688, + "grad_norm": 0.6551013588905334, + "learning_rate": 7.665569476131706e-06, + "loss": 0.2866, + "step": 8873 + }, + { + "epoch": 2.8492534917322203, + "grad_norm": 0.5894836783409119, + "learning_rate": 7.652171914231776e-06, + "loss": 0.2376, + "step": 8874 + }, + { + "epoch": 2.849574570557072, + "grad_norm": 1.2114777565002441, + "learning_rate": 7.638785604634579e-06, + "loss": 0.3111, + "step": 8875 + }, + { + "epoch": 2.8498956493819234, + "grad_norm": 0.9398203492164612, + "learning_rate": 7.625410548971191e-06, + "loss": 0.2726, + "step": 8876 + }, + { + "epoch": 2.8502167282067745, + "grad_norm": 0.8695142865180969, + "learning_rate": 7.612046748871327e-06, + "loss": 0.3361, + "step": 8877 + }, + { + "epoch": 2.8505378070316265, + "grad_norm": 0.6532575488090515, + "learning_rate": 7.59869420596333e-06, + "loss": 0.3271, + "step": 8878 + }, + { + "epoch": 2.8508588858564776, + "grad_norm": 0.48674994707107544, + "learning_rate": 7.585352921874156e-06, + "loss": 0.2893, + "step": 8879 + }, + { + "epoch": 2.851179964681329, + "grad_norm": 0.5763227343559265, + "learning_rate": 7.572022898229403e-06, + "loss": 0.8793, + "step": 8880 + }, + { + "epoch": 2.8515010435061807, + "grad_norm": 0.5322999954223633, + "learning_rate": 7.558704136653305e-06, + "loss": 0.7263, + "step": 8881 + }, + { + "epoch": 2.8518221223310323, + "grad_norm": 0.44331344962120056, + "learning_rate": 7.545396638768698e-06, + "loss": 0.279, + "step": 8882 + }, + { + "epoch": 2.852143201155884, + "grad_norm": 0.41767483949661255, + "learning_rate": 7.5321004061970405e-06, + "loss": 0.2259, + "step": 8883 + }, + { + "epoch": 2.8524642799807354, + "grad_norm": 0.3086419403553009, + "learning_rate": 7.518815440558513e-06, + "loss": 0.1316, + "step": 8884 + }, + { + "epoch": 2.852785358805587, + "grad_norm": 0.29793986678123474, + "learning_rate": 7.505541743471756e-06, + "loss": 0.1329, + "step": 8885 + }, + { + "epoch": 2.853106437630438, + "grad_norm": 0.4482042193412781, + "learning_rate": 7.492279316554207e-06, + "loss": 0.2962, + "step": 8886 + }, + { + "epoch": 2.85342751645529, + "grad_norm": 0.780487060546875, + "learning_rate": 7.479028161421797e-06, + "loss": 0.4438, + "step": 8887 + }, + { + "epoch": 2.853748595280141, + "grad_norm": 1.0584594011306763, + "learning_rate": 7.465788279689156e-06, + "loss": 0.3568, + "step": 8888 + }, + { + "epoch": 2.8540696741049927, + "grad_norm": 0.6687178611755371, + "learning_rate": 7.45255967296955e-06, + "loss": 0.2615, + "step": 8889 + }, + { + "epoch": 2.8543907529298442, + "grad_norm": 0.9293543696403503, + "learning_rate": 7.439342342874789e-06, + "loss": 0.3512, + "step": 8890 + }, + { + "epoch": 2.854711831754696, + "grad_norm": 0.8716065287590027, + "learning_rate": 7.426136291015417e-06, + "loss": 0.3854, + "step": 8891 + }, + { + "epoch": 2.8550329105795473, + "grad_norm": 0.711845338344574, + "learning_rate": 7.412941519000527e-06, + "loss": 0.3156, + "step": 8892 + }, + { + "epoch": 2.855353989404399, + "grad_norm": 0.7607097625732422, + "learning_rate": 7.399758028437864e-06, + "loss": 0.3388, + "step": 8893 + }, + { + "epoch": 2.8556750682292504, + "grad_norm": 0.6983038783073425, + "learning_rate": 7.386585820933811e-06, + "loss": 0.2692, + "step": 8894 + }, + { + "epoch": 2.8559961470541015, + "grad_norm": 0.703209638595581, + "learning_rate": 7.3734248980933395e-06, + "loss": 0.2435, + "step": 8895 + }, + { + "epoch": 2.8563172258789535, + "grad_norm": 1.0315760374069214, + "learning_rate": 7.360275261520078e-06, + "loss": 0.3833, + "step": 8896 + }, + { + "epoch": 2.8566383047038046, + "grad_norm": 0.6447276473045349, + "learning_rate": 7.347136912816277e-06, + "loss": 0.2404, + "step": 8897 + }, + { + "epoch": 2.856959383528656, + "grad_norm": 1.043185830116272, + "learning_rate": 7.3340098535827905e-06, + "loss": 0.4912, + "step": 8898 + }, + { + "epoch": 2.8572804623535077, + "grad_norm": 1.1728825569152832, + "learning_rate": 7.320894085419116e-06, + "loss": 0.4808, + "step": 8899 + }, + { + "epoch": 2.8576015411783593, + "grad_norm": 0.8470208644866943, + "learning_rate": 7.3077896099233765e-06, + "loss": 0.38, + "step": 8900 + }, + { + "epoch": 2.857922620003211, + "grad_norm": 0.6861622333526611, + "learning_rate": 7.2946964286923046e-06, + "loss": 0.2538, + "step": 8901 + }, + { + "epoch": 2.8582436988280624, + "grad_norm": 0.7869503498077393, + "learning_rate": 7.281614543321269e-06, + "loss": 0.2132, + "step": 8902 + }, + { + "epoch": 2.858564777652914, + "grad_norm": 0.7390521764755249, + "learning_rate": 7.268543955404239e-06, + "loss": 0.2631, + "step": 8903 + }, + { + "epoch": 2.858885856477765, + "grad_norm": 0.6110241413116455, + "learning_rate": 7.255484666533874e-06, + "loss": 0.2363, + "step": 8904 + }, + { + "epoch": 2.859206935302617, + "grad_norm": 0.7435978651046753, + "learning_rate": 7.242436678301367e-06, + "loss": 0.3225, + "step": 8905 + }, + { + "epoch": 2.859528014127468, + "grad_norm": 0.5957700610160828, + "learning_rate": 7.2293999922965705e-06, + "loss": 0.2636, + "step": 8906 + }, + { + "epoch": 2.8598490929523197, + "grad_norm": 0.6576164364814758, + "learning_rate": 7.216374610108012e-06, + "loss": 0.3076, + "step": 8907 + }, + { + "epoch": 2.8601701717771713, + "grad_norm": 0.7818155288696289, + "learning_rate": 7.203360533322734e-06, + "loss": 0.3078, + "step": 8908 + }, + { + "epoch": 2.860491250602023, + "grad_norm": 0.974690854549408, + "learning_rate": 7.190357763526523e-06, + "loss": 0.4259, + "step": 8909 + }, + { + "epoch": 2.8608123294268744, + "grad_norm": 0.795142412185669, + "learning_rate": 7.177366302303667e-06, + "loss": 0.3382, + "step": 8910 + }, + { + "epoch": 2.861133408251726, + "grad_norm": 1.03029465675354, + "learning_rate": 7.164386151237179e-06, + "loss": 0.3553, + "step": 8911 + }, + { + "epoch": 2.8614544870765775, + "grad_norm": 0.9264429211616516, + "learning_rate": 7.151417311908648e-06, + "loss": 0.3729, + "step": 8912 + }, + { + "epoch": 2.8617755659014286, + "grad_norm": 0.9430894255638123, + "learning_rate": 7.138459785898266e-06, + "loss": 0.4435, + "step": 8913 + }, + { + "epoch": 2.8620966447262806, + "grad_norm": 0.7884198427200317, + "learning_rate": 7.125513574784903e-06, + "loss": 0.3564, + "step": 8914 + }, + { + "epoch": 2.8624177235511317, + "grad_norm": 1.3142826557159424, + "learning_rate": 7.112578680145954e-06, + "loss": 0.3726, + "step": 8915 + }, + { + "epoch": 2.862738802375983, + "grad_norm": 0.9246410727500916, + "learning_rate": 7.099655103557556e-06, + "loss": 0.3109, + "step": 8916 + }, + { + "epoch": 2.8630598812008348, + "grad_norm": 0.8271921277046204, + "learning_rate": 7.086742846594385e-06, + "loss": 0.2753, + "step": 8917 + }, + { + "epoch": 2.8633809600256863, + "grad_norm": 1.1295146942138672, + "learning_rate": 7.07384191082977e-06, + "loss": 0.3933, + "step": 8918 + }, + { + "epoch": 2.863702038850538, + "grad_norm": 2.6274220943450928, + "learning_rate": 7.060952297835633e-06, + "loss": 0.4857, + "step": 8919 + }, + { + "epoch": 2.8640231176753894, + "grad_norm": 0.9029250741004944, + "learning_rate": 7.048074009182548e-06, + "loss": 0.3297, + "step": 8920 + }, + { + "epoch": 2.864344196500241, + "grad_norm": 0.5170316696166992, + "learning_rate": 7.035207046439673e-06, + "loss": 0.2399, + "step": 8921 + }, + { + "epoch": 2.864665275325092, + "grad_norm": 0.6248924732208252, + "learning_rate": 7.022351411174866e-06, + "loss": 0.2731, + "step": 8922 + }, + { + "epoch": 2.864986354149944, + "grad_norm": 0.6657963991165161, + "learning_rate": 7.009507104954493e-06, + "loss": 0.2576, + "step": 8923 + }, + { + "epoch": 2.865307432974795, + "grad_norm": 0.5382866263389587, + "learning_rate": 6.996674129343606e-06, + "loss": 0.2667, + "step": 8924 + }, + { + "epoch": 2.8656285117996467, + "grad_norm": 0.9319119453430176, + "learning_rate": 6.9838524859058616e-06, + "loss": 0.3681, + "step": 8925 + }, + { + "epoch": 2.8659495906244983, + "grad_norm": 0.8236702084541321, + "learning_rate": 6.971042176203535e-06, + "loss": 0.3227, + "step": 8926 + }, + { + "epoch": 2.86627066944935, + "grad_norm": 0.42309141159057617, + "learning_rate": 6.958243201797554e-06, + "loss": 0.2548, + "step": 8927 + }, + { + "epoch": 2.8665917482742014, + "grad_norm": 0.6998513340950012, + "learning_rate": 6.945455564247394e-06, + "loss": 0.3271, + "step": 8928 + }, + { + "epoch": 2.866912827099053, + "grad_norm": 0.48560941219329834, + "learning_rate": 6.932679265111231e-06, + "loss": 0.2742, + "step": 8929 + }, + { + "epoch": 2.8672339059239045, + "grad_norm": 0.45436060428619385, + "learning_rate": 6.919914305945774e-06, + "loss": 0.6581, + "step": 8930 + }, + { + "epoch": 2.8675549847487556, + "grad_norm": 0.4203808903694153, + "learning_rate": 6.907160688306425e-06, + "loss": 0.3959, + "step": 8931 + }, + { + "epoch": 2.8678760635736076, + "grad_norm": 0.45685985684394836, + "learning_rate": 6.894418413747183e-06, + "loss": 0.3185, + "step": 8932 + }, + { + "epoch": 2.8681971423984587, + "grad_norm": 0.4417474865913391, + "learning_rate": 6.881687483820609e-06, + "loss": 0.2633, + "step": 8933 + }, + { + "epoch": 2.8685182212233102, + "grad_norm": 0.34829646348953247, + "learning_rate": 6.868967900077972e-06, + "loss": 0.1154, + "step": 8934 + }, + { + "epoch": 2.868839300048162, + "grad_norm": 0.4470880925655365, + "learning_rate": 6.856259664069098e-06, + "loss": 0.1363, + "step": 8935 + }, + { + "epoch": 2.8691603788730133, + "grad_norm": 0.42514151334762573, + "learning_rate": 6.8435627773424495e-06, + "loss": 0.1393, + "step": 8936 + }, + { + "epoch": 2.869481457697865, + "grad_norm": 0.8983972668647766, + "learning_rate": 6.830877241445111e-06, + "loss": 0.5135, + "step": 8937 + }, + { + "epoch": 2.8698025365227164, + "grad_norm": 1.0805124044418335, + "learning_rate": 6.818203057922757e-06, + "loss": 0.4579, + "step": 8938 + }, + { + "epoch": 2.870123615347568, + "grad_norm": 0.8403233289718628, + "learning_rate": 6.805540228319718e-06, + "loss": 0.3314, + "step": 8939 + }, + { + "epoch": 2.870444694172419, + "grad_norm": 0.9584758281707764, + "learning_rate": 6.7928887541789055e-06, + "loss": 0.3441, + "step": 8940 + }, + { + "epoch": 2.870765772997271, + "grad_norm": 0.9408062100410461, + "learning_rate": 6.780248637041875e-06, + "loss": 0.3953, + "step": 8941 + }, + { + "epoch": 2.871086851822122, + "grad_norm": 0.7710734605789185, + "learning_rate": 6.767619878448783e-06, + "loss": 0.3212, + "step": 8942 + }, + { + "epoch": 2.8714079306469737, + "grad_norm": 0.6842841506004333, + "learning_rate": 6.755002479938411e-06, + "loss": 0.2995, + "step": 8943 + }, + { + "epoch": 2.8717290094718253, + "grad_norm": 0.8255465030670166, + "learning_rate": 6.742396443048138e-06, + "loss": 0.3496, + "step": 8944 + }, + { + "epoch": 2.872050088296677, + "grad_norm": 0.838579535484314, + "learning_rate": 6.729801769313981e-06, + "loss": 0.3619, + "step": 8945 + }, + { + "epoch": 2.8723711671215284, + "grad_norm": 0.7680864930152893, + "learning_rate": 6.717218460270536e-06, + "loss": 0.3327, + "step": 8946 + }, + { + "epoch": 2.87269224594638, + "grad_norm": 0.6783306002616882, + "learning_rate": 6.704646517451107e-06, + "loss": 0.3324, + "step": 8947 + }, + { + "epoch": 2.8730133247712315, + "grad_norm": 0.8792276978492737, + "learning_rate": 6.692085942387483e-06, + "loss": 0.3289, + "step": 8948 + }, + { + "epoch": 2.8733344035960826, + "grad_norm": 0.9213592410087585, + "learning_rate": 6.679536736610137e-06, + "loss": 0.3607, + "step": 8949 + }, + { + "epoch": 2.8736554824209346, + "grad_norm": 0.8916637897491455, + "learning_rate": 6.666998901648203e-06, + "loss": 0.3322, + "step": 8950 + }, + { + "epoch": 2.8739765612457857, + "grad_norm": 0.9735648036003113, + "learning_rate": 6.654472439029313e-06, + "loss": 0.3434, + "step": 8951 + }, + { + "epoch": 2.8742976400706373, + "grad_norm": 0.7319313287734985, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.3157, + "step": 8952 + }, + { + "epoch": 2.874618718895489, + "grad_norm": 0.9477907419204712, + "learning_rate": 6.629453636924643e-06, + "loss": 0.3392, + "step": 8953 + }, + { + "epoch": 2.8749397977203404, + "grad_norm": 0.9957652688026428, + "learning_rate": 6.616961300487324e-06, + "loss": 0.448, + "step": 8954 + }, + { + "epoch": 2.875260876545192, + "grad_norm": 1.1732362508773804, + "learning_rate": 6.604480342490004e-06, + "loss": 0.438, + "step": 8955 + }, + { + "epoch": 2.8755819553700435, + "grad_norm": 0.710220992565155, + "learning_rate": 6.592010764453449e-06, + "loss": 0.2393, + "step": 8956 + }, + { + "epoch": 2.875903034194895, + "grad_norm": 0.8079162836074829, + "learning_rate": 6.579552567897051e-06, + "loss": 0.3013, + "step": 8957 + }, + { + "epoch": 2.876224113019746, + "grad_norm": 0.8686780333518982, + "learning_rate": 6.5671057543387985e-06, + "loss": 0.3435, + "step": 8958 + }, + { + "epoch": 2.876545191844598, + "grad_norm": 1.116487979888916, + "learning_rate": 6.554670325295298e-06, + "loss": 0.4351, + "step": 8959 + }, + { + "epoch": 2.876866270669449, + "grad_norm": 0.6508135199546814, + "learning_rate": 6.542246282281772e-06, + "loss": 0.311, + "step": 8960 + }, + { + "epoch": 2.8771873494943008, + "grad_norm": 1.2992252111434937, + "learning_rate": 6.529833626812043e-06, + "loss": 0.3634, + "step": 8961 + }, + { + "epoch": 2.8775084283191523, + "grad_norm": 0.7349622845649719, + "learning_rate": 6.517432360398556e-06, + "loss": 0.3065, + "step": 8962 + }, + { + "epoch": 2.877829507144004, + "grad_norm": 0.7028359770774841, + "learning_rate": 6.5050424845523815e-06, + "loss": 0.2529, + "step": 8963 + }, + { + "epoch": 2.8781505859688554, + "grad_norm": 0.6974001526832581, + "learning_rate": 6.492664000783166e-06, + "loss": 0.2822, + "step": 8964 + }, + { + "epoch": 2.878471664793707, + "grad_norm": 0.9233234524726868, + "learning_rate": 6.480296910599237e-06, + "loss": 0.3762, + "step": 8965 + }, + { + "epoch": 2.8787927436185585, + "grad_norm": 1.0546296834945679, + "learning_rate": 6.467941215507433e-06, + "loss": 0.4223, + "step": 8966 + }, + { + "epoch": 2.8791138224434096, + "grad_norm": 0.8296759724617004, + "learning_rate": 6.455596917013273e-06, + "loss": 0.3025, + "step": 8967 + }, + { + "epoch": 2.8794349012682616, + "grad_norm": 0.6233950257301331, + "learning_rate": 6.443264016620887e-06, + "loss": 0.2302, + "step": 8968 + }, + { + "epoch": 2.8797559800931127, + "grad_norm": 0.8595796227455139, + "learning_rate": 6.430942515832983e-06, + "loss": 0.267, + "step": 8969 + }, + { + "epoch": 2.8800770589179643, + "grad_norm": 0.9391990900039673, + "learning_rate": 6.418632416150927e-06, + "loss": 0.3733, + "step": 8970 + }, + { + "epoch": 2.880398137742816, + "grad_norm": 0.7740428447723389, + "learning_rate": 6.406333719074619e-06, + "loss": 0.2623, + "step": 8971 + }, + { + "epoch": 2.8807192165676674, + "grad_norm": 0.760464608669281, + "learning_rate": 6.394046426102674e-06, + "loss": 0.3015, + "step": 8972 + }, + { + "epoch": 2.881040295392519, + "grad_norm": 0.8069641590118408, + "learning_rate": 6.381770538732224e-06, + "loss": 0.324, + "step": 8973 + }, + { + "epoch": 2.8813613742173705, + "grad_norm": 0.6506406664848328, + "learning_rate": 6.3695060584590625e-06, + "loss": 0.2543, + "step": 8974 + }, + { + "epoch": 2.881682453042222, + "grad_norm": 0.4887990355491638, + "learning_rate": 6.357252986777595e-06, + "loss": 0.2442, + "step": 8975 + }, + { + "epoch": 2.882003531867073, + "grad_norm": 1.2128475904464722, + "learning_rate": 6.345011325180772e-06, + "loss": 0.3487, + "step": 8976 + }, + { + "epoch": 2.882324610691925, + "grad_norm": 0.8624823689460754, + "learning_rate": 6.332781075160243e-06, + "loss": 0.301, + "step": 8977 + }, + { + "epoch": 2.8826456895167762, + "grad_norm": 0.4960239827632904, + "learning_rate": 6.320562238206218e-06, + "loss": 0.2894, + "step": 8978 + }, + { + "epoch": 2.882966768341628, + "grad_norm": 0.48501670360565186, + "learning_rate": 6.308354815807527e-06, + "loss": 0.2678, + "step": 8979 + }, + { + "epoch": 2.8832878471664793, + "grad_norm": 0.42197084426879883, + "learning_rate": 6.296158809451602e-06, + "loss": 0.4037, + "step": 8980 + }, + { + "epoch": 2.883608925991331, + "grad_norm": 0.36780065298080444, + "learning_rate": 6.283974220624489e-06, + "loss": 0.3432, + "step": 8981 + }, + { + "epoch": 2.8839300048161824, + "grad_norm": 0.3911552131175995, + "learning_rate": 6.2718010508108545e-06, + "loss": 0.217, + "step": 8982 + }, + { + "epoch": 2.884251083641034, + "grad_norm": 0.5778048634529114, + "learning_rate": 6.259639301493947e-06, + "loss": 0.409, + "step": 8983 + }, + { + "epoch": 2.8845721624658855, + "grad_norm": 0.13645879924297333, + "learning_rate": 6.2474889741556575e-06, + "loss": 0.0592, + "step": 8984 + }, + { + "epoch": 2.8848932412907367, + "grad_norm": 0.3099921941757202, + "learning_rate": 6.235350070276447e-06, + "loss": 0.1358, + "step": 8985 + }, + { + "epoch": 2.8852143201155886, + "grad_norm": 0.9487568736076355, + "learning_rate": 6.223222591335409e-06, + "loss": 0.3908, + "step": 8986 + }, + { + "epoch": 2.8855353989404398, + "grad_norm": 0.6402967572212219, + "learning_rate": 6.21110653881023e-06, + "loss": 0.2223, + "step": 8987 + }, + { + "epoch": 2.8858564777652913, + "grad_norm": 1.0103484392166138, + "learning_rate": 6.1990019141772605e-06, + "loss": 0.4384, + "step": 8988 + }, + { + "epoch": 2.886177556590143, + "grad_norm": 0.8888756632804871, + "learning_rate": 6.186908718911344e-06, + "loss": 0.3182, + "step": 8989 + }, + { + "epoch": 2.8864986354149944, + "grad_norm": 0.6990677118301392, + "learning_rate": 6.174826954486068e-06, + "loss": 0.2554, + "step": 8990 + }, + { + "epoch": 2.886819714239846, + "grad_norm": 0.9189555048942566, + "learning_rate": 6.1627566223735e-06, + "loss": 0.3981, + "step": 8991 + }, + { + "epoch": 2.8871407930646975, + "grad_norm": 0.9261887073516846, + "learning_rate": 6.1506977240444074e-06, + "loss": 0.3965, + "step": 8992 + }, + { + "epoch": 2.887461871889549, + "grad_norm": 0.7619607448577881, + "learning_rate": 6.138650260968137e-06, + "loss": 0.3196, + "step": 8993 + }, + { + "epoch": 2.8877829507144, + "grad_norm": 0.7999249696731567, + "learning_rate": 6.126614234612593e-06, + "loss": 0.2928, + "step": 8994 + }, + { + "epoch": 2.888104029539252, + "grad_norm": 0.8483562469482422, + "learning_rate": 6.1145896464443685e-06, + "loss": 0.3259, + "step": 8995 + }, + { + "epoch": 2.8884251083641033, + "grad_norm": 0.9350925087928772, + "learning_rate": 6.102576497928614e-06, + "loss": 0.3718, + "step": 8996 + }, + { + "epoch": 2.888746187188955, + "grad_norm": 1.1752997636795044, + "learning_rate": 6.090574790529091e-06, + "loss": 0.4583, + "step": 8997 + }, + { + "epoch": 2.8890672660138064, + "grad_norm": 1.18094801902771, + "learning_rate": 6.078584525708176e-06, + "loss": 0.4047, + "step": 8998 + }, + { + "epoch": 2.889388344838658, + "grad_norm": 1.1742857694625854, + "learning_rate": 6.066605704926831e-06, + "loss": 0.4776, + "step": 8999 + }, + { + "epoch": 2.8897094236635095, + "grad_norm": 0.905214786529541, + "learning_rate": 6.054638329644657e-06, + "loss": 0.3768, + "step": 9000 + }, + { + "epoch": 2.890030502488361, + "grad_norm": 1.0812019109725952, + "learning_rate": 6.042682401319844e-06, + "loss": 0.4689, + "step": 9001 + }, + { + "epoch": 2.8903515813132126, + "grad_norm": 1.0372034311294556, + "learning_rate": 6.030737921409169e-06, + "loss": 0.2963, + "step": 9002 + }, + { + "epoch": 2.8906726601380637, + "grad_norm": 1.742632508277893, + "learning_rate": 6.018804891368035e-06, + "loss": 0.3989, + "step": 9003 + }, + { + "epoch": 2.8909937389629157, + "grad_norm": 0.8828332424163818, + "learning_rate": 6.006883312650457e-06, + "loss": 0.3274, + "step": 9004 + }, + { + "epoch": 2.8913148177877668, + "grad_norm": 0.6829835772514343, + "learning_rate": 5.994973186709041e-06, + "loss": 0.2829, + "step": 9005 + }, + { + "epoch": 2.8916358966126183, + "grad_norm": 0.8279037475585938, + "learning_rate": 5.98307451499498e-06, + "loss": 0.3327, + "step": 9006 + }, + { + "epoch": 2.89195697543747, + "grad_norm": 0.7003656625747681, + "learning_rate": 5.971187298958103e-06, + "loss": 0.2629, + "step": 9007 + }, + { + "epoch": 2.8922780542623214, + "grad_norm": 0.7397180795669556, + "learning_rate": 5.9593115400468636e-06, + "loss": 0.2892, + "step": 9008 + }, + { + "epoch": 2.892599133087173, + "grad_norm": 0.88568514585495, + "learning_rate": 5.947447239708215e-06, + "loss": 0.3233, + "step": 9009 + }, + { + "epoch": 2.8929202119120245, + "grad_norm": 0.6601244807243347, + "learning_rate": 5.935594399387856e-06, + "loss": 0.27, + "step": 9010 + }, + { + "epoch": 2.893241290736876, + "grad_norm": 1.0693775415420532, + "learning_rate": 5.923753020529999e-06, + "loss": 0.4009, + "step": 9011 + }, + { + "epoch": 2.893562369561727, + "grad_norm": 0.7800514698028564, + "learning_rate": 5.911923104577455e-06, + "loss": 0.3128, + "step": 9012 + }, + { + "epoch": 2.893883448386579, + "grad_norm": 0.9529879689216614, + "learning_rate": 5.900104652971694e-06, + "loss": 0.3787, + "step": 9013 + }, + { + "epoch": 2.8942045272114303, + "grad_norm": 1.1537600755691528, + "learning_rate": 5.888297667152731e-06, + "loss": 0.531, + "step": 9014 + }, + { + "epoch": 2.894525606036282, + "grad_norm": 0.9319456219673157, + "learning_rate": 5.8765021485592376e-06, + "loss": 0.2995, + "step": 9015 + }, + { + "epoch": 2.8948466848611334, + "grad_norm": 0.5598946809768677, + "learning_rate": 5.864718098628441e-06, + "loss": 0.2649, + "step": 9016 + }, + { + "epoch": 2.895167763685985, + "grad_norm": 0.70030277967453, + "learning_rate": 5.852945518796205e-06, + "loss": 0.2977, + "step": 9017 + }, + { + "epoch": 2.8954888425108365, + "grad_norm": 0.9624463319778442, + "learning_rate": 5.8411844104969916e-06, + "loss": 0.3796, + "step": 9018 + }, + { + "epoch": 2.895809921335688, + "grad_norm": 0.5246722102165222, + "learning_rate": 5.829434775163833e-06, + "loss": 0.2232, + "step": 9019 + }, + { + "epoch": 2.8961310001605396, + "grad_norm": 0.5366243720054626, + "learning_rate": 5.8176966142283965e-06, + "loss": 0.2416, + "step": 9020 + }, + { + "epoch": 2.8964520789853907, + "grad_norm": 0.7249617576599121, + "learning_rate": 5.805969929120947e-06, + "loss": 0.2787, + "step": 9021 + }, + { + "epoch": 2.8967731578102422, + "grad_norm": 0.7474877834320068, + "learning_rate": 5.7942547212703315e-06, + "loss": 0.2923, + "step": 9022 + }, + { + "epoch": 2.897094236635094, + "grad_norm": 0.5960254669189453, + "learning_rate": 5.78255099210403e-06, + "loss": 0.2618, + "step": 9023 + }, + { + "epoch": 2.8974153154599454, + "grad_norm": 0.7626693844795227, + "learning_rate": 5.770858743048091e-06, + "loss": 0.2727, + "step": 9024 + }, + { + "epoch": 2.897736394284797, + "grad_norm": 0.5556107759475708, + "learning_rate": 5.759177975527186e-06, + "loss": 0.258, + "step": 9025 + }, + { + "epoch": 2.8980574731096485, + "grad_norm": 0.471110463142395, + "learning_rate": 5.747508690964598e-06, + "loss": 0.2506, + "step": 9026 + }, + { + "epoch": 2.8983785519345, + "grad_norm": 0.47644883394241333, + "learning_rate": 5.735850890782157e-06, + "loss": 0.2521, + "step": 9027 + }, + { + "epoch": 2.8986996307593516, + "grad_norm": 0.49500352144241333, + "learning_rate": 5.724204576400371e-06, + "loss": 0.275, + "step": 9028 + }, + { + "epoch": 2.899020709584203, + "grad_norm": 0.31801772117614746, + "learning_rate": 5.7125697492382835e-06, + "loss": 0.2572, + "step": 9029 + }, + { + "epoch": 2.899341788409054, + "grad_norm": 0.7811435461044312, + "learning_rate": 5.700946410713548e-06, + "loss": 0.8103, + "step": 9030 + }, + { + "epoch": 2.8996628672339058, + "grad_norm": 0.3786062002182007, + "learning_rate": 5.6893345622424874e-06, + "loss": 0.3159, + "step": 9031 + }, + { + "epoch": 2.8999839460587573, + "grad_norm": 0.2506957948207855, + "learning_rate": 5.6777342052399045e-06, + "loss": 0.0899, + "step": 9032 + }, + { + "epoch": 2.900305024883609, + "grad_norm": 0.23247039318084717, + "learning_rate": 5.666145341119322e-06, + "loss": 0.0615, + "step": 9033 + }, + { + "epoch": 2.9006261037084604, + "grad_norm": 0.1694241166114807, + "learning_rate": 5.654567971292757e-06, + "loss": 0.0589, + "step": 9034 + }, + { + "epoch": 2.900947182533312, + "grad_norm": 0.25635042786598206, + "learning_rate": 5.643002097170924e-06, + "loss": 0.1044, + "step": 9035 + }, + { + "epoch": 2.9012682613581635, + "grad_norm": 0.9146557450294495, + "learning_rate": 5.6314477201630745e-06, + "loss": 0.5639, + "step": 9036 + }, + { + "epoch": 2.901589340183015, + "grad_norm": 0.8883252739906311, + "learning_rate": 5.619904841677059e-06, + "loss": 0.4244, + "step": 9037 + }, + { + "epoch": 2.9019104190078666, + "grad_norm": 0.751973032951355, + "learning_rate": 5.608373463119354e-06, + "loss": 0.3138, + "step": 9038 + }, + { + "epoch": 2.9022314978327177, + "grad_norm": 0.7506957054138184, + "learning_rate": 5.5968535858950345e-06, + "loss": 0.2615, + "step": 9039 + }, + { + "epoch": 2.9025525766575693, + "grad_norm": 0.7786146402359009, + "learning_rate": 5.585345211407733e-06, + "loss": 0.3453, + "step": 9040 + }, + { + "epoch": 2.902873655482421, + "grad_norm": 0.9360180497169495, + "learning_rate": 5.573848341059739e-06, + "loss": 0.4307, + "step": 9041 + }, + { + "epoch": 2.9031947343072724, + "grad_norm": 0.9217041730880737, + "learning_rate": 5.562362976251901e-06, + "loss": 0.3228, + "step": 9042 + }, + { + "epoch": 2.903515813132124, + "grad_norm": 0.904606819152832, + "learning_rate": 5.550889118383673e-06, + "loss": 0.3926, + "step": 9043 + }, + { + "epoch": 2.9038368919569755, + "grad_norm": 0.7405612468719482, + "learning_rate": 5.5394267688531066e-06, + "loss": 0.2862, + "step": 9044 + }, + { + "epoch": 2.904157970781827, + "grad_norm": 0.7969362735748291, + "learning_rate": 5.52797592905685e-06, + "loss": 0.3321, + "step": 9045 + }, + { + "epoch": 2.9044790496066786, + "grad_norm": 0.8210546970367432, + "learning_rate": 5.516536600390188e-06, + "loss": 0.3051, + "step": 9046 + }, + { + "epoch": 2.90480012843153, + "grad_norm": 0.796033501625061, + "learning_rate": 5.505108784246926e-06, + "loss": 0.3624, + "step": 9047 + }, + { + "epoch": 2.9051212072563812, + "grad_norm": 0.7903563380241394, + "learning_rate": 5.49369248201953e-06, + "loss": 0.318, + "step": 9048 + }, + { + "epoch": 2.905442286081233, + "grad_norm": 1.0777714252471924, + "learning_rate": 5.482287695099031e-06, + "loss": 0.4097, + "step": 9049 + }, + { + "epoch": 2.9057633649060843, + "grad_norm": 1.2699135541915894, + "learning_rate": 5.470894424875062e-06, + "loss": 0.3572, + "step": 9050 + }, + { + "epoch": 2.906084443730936, + "grad_norm": 0.935076892375946, + "learning_rate": 5.4595126727359e-06, + "loss": 0.3222, + "step": 9051 + }, + { + "epoch": 2.9064055225557874, + "grad_norm": 0.7485237121582031, + "learning_rate": 5.448142440068316e-06, + "loss": 0.3005, + "step": 9052 + }, + { + "epoch": 2.906726601380639, + "grad_norm": 0.9007347822189331, + "learning_rate": 5.436783728257788e-06, + "loss": 0.3643, + "step": 9053 + }, + { + "epoch": 2.9070476802054905, + "grad_norm": 0.8481326699256897, + "learning_rate": 5.425436538688322e-06, + "loss": 0.3329, + "step": 9054 + }, + { + "epoch": 2.907368759030342, + "grad_norm": 0.8909382820129395, + "learning_rate": 5.414100872742534e-06, + "loss": 0.2989, + "step": 9055 + }, + { + "epoch": 2.9076898378551936, + "grad_norm": 0.9575411677360535, + "learning_rate": 5.402776731801662e-06, + "loss": 0.269, + "step": 9056 + }, + { + "epoch": 2.9080109166800447, + "grad_norm": 0.7255709767341614, + "learning_rate": 5.39146411724547e-06, + "loss": 0.294, + "step": 9057 + }, + { + "epoch": 2.9083319955048963, + "grad_norm": 1.5980173349380493, + "learning_rate": 5.380163030452412e-06, + "loss": 0.2823, + "step": 9058 + }, + { + "epoch": 2.908653074329748, + "grad_norm": 0.8485715389251709, + "learning_rate": 5.368873472799474e-06, + "loss": 0.2583, + "step": 9059 + }, + { + "epoch": 2.9089741531545994, + "grad_norm": 0.9129398465156555, + "learning_rate": 5.357595445662267e-06, + "loss": 0.3554, + "step": 9060 + }, + { + "epoch": 2.909295231979451, + "grad_norm": 0.7431888580322266, + "learning_rate": 5.346328950414969e-06, + "loss": 0.2997, + "step": 9061 + }, + { + "epoch": 2.9096163108043025, + "grad_norm": 1.5763301849365234, + "learning_rate": 5.335073988430372e-06, + "loss": 0.3451, + "step": 9062 + }, + { + "epoch": 2.909937389629154, + "grad_norm": 0.8064843416213989, + "learning_rate": 5.3238305610798565e-06, + "loss": 0.3082, + "step": 9063 + }, + { + "epoch": 2.9102584684540056, + "grad_norm": 1.073921799659729, + "learning_rate": 5.312598669733404e-06, + "loss": 0.3749, + "step": 9064 + }, + { + "epoch": 2.910579547278857, + "grad_norm": 0.5479134917259216, + "learning_rate": 5.301378315759598e-06, + "loss": 0.2452, + "step": 9065 + }, + { + "epoch": 2.9109006261037083, + "grad_norm": 0.9387060403823853, + "learning_rate": 5.290169500525577e-06, + "loss": 0.3453, + "step": 9066 + }, + { + "epoch": 2.91122170492856, + "grad_norm": 0.6247801184654236, + "learning_rate": 5.278972225397127e-06, + "loss": 0.2472, + "step": 9067 + }, + { + "epoch": 2.9115427837534114, + "grad_norm": 0.6648100018501282, + "learning_rate": 5.267786491738569e-06, + "loss": 0.2595, + "step": 9068 + }, + { + "epoch": 2.911863862578263, + "grad_norm": 0.5038353204727173, + "learning_rate": 5.256612300912911e-06, + "loss": 0.2275, + "step": 9069 + }, + { + "epoch": 2.9121849414031145, + "grad_norm": 0.5695350170135498, + "learning_rate": 5.245449654281631e-06, + "loss": 0.2695, + "step": 9070 + }, + { + "epoch": 2.912506020227966, + "grad_norm": NaN, + "learning_rate": 5.245449654281631e-06, + "loss": 0.4011, + "step": 9071 + }, + { + "epoch": 2.9128270990528176, + "grad_norm": 1.1442893743515015, + "learning_rate": 5.2342985532049084e-06, + "loss": 0.3682, + "step": 9072 + }, + { + "epoch": 2.913148177877669, + "grad_norm": 0.6914578676223755, + "learning_rate": 5.223158999041444e-06, + "loss": 0.2844, + "step": 9073 + }, + { + "epoch": 2.9134692567025207, + "grad_norm": 0.8676597476005554, + "learning_rate": 5.212030993148553e-06, + "loss": 0.3133, + "step": 9074 + }, + { + "epoch": 2.9137903355273718, + "grad_norm": 0.6819028854370117, + "learning_rate": 5.200914536882185e-06, + "loss": 0.2754, + "step": 9075 + }, + { + "epoch": 2.9141114143522233, + "grad_norm": 0.5778715014457703, + "learning_rate": 5.189809631596798e-06, + "loss": 0.2804, + "step": 9076 + }, + { + "epoch": 2.914432493177075, + "grad_norm": 0.4760773479938507, + "learning_rate": 5.178716278645535e-06, + "loss": 0.245, + "step": 9077 + }, + { + "epoch": 2.9147535720019264, + "grad_norm": 0.41519516706466675, + "learning_rate": 5.167634479380068e-06, + "loss": 0.2327, + "step": 9078 + }, + { + "epoch": 2.915074650826778, + "grad_norm": 0.5341471433639526, + "learning_rate": 5.1565642351506845e-06, + "loss": 0.2778, + "step": 9079 + }, + { + "epoch": 2.9153957296516295, + "grad_norm": 0.6417379379272461, + "learning_rate": 5.145505547306251e-06, + "loss": 0.9807, + "step": 9080 + }, + { + "epoch": 2.915716808476481, + "grad_norm": 0.5737770199775696, + "learning_rate": 5.134458417194254e-06, + "loss": 0.5596, + "step": 9081 + }, + { + "epoch": 2.9160378873013326, + "grad_norm": 0.29216137528419495, + "learning_rate": 5.1234228461607304e-06, + "loss": 0.1449, + "step": 9082 + }, + { + "epoch": 2.916358966126184, + "grad_norm": 0.20528793334960938, + "learning_rate": 5.1123988355503475e-06, + "loss": 0.06, + "step": 9083 + }, + { + "epoch": 2.9166800449510353, + "grad_norm": 0.1566951721906662, + "learning_rate": 5.101386386706342e-06, + "loss": 0.0578, + "step": 9084 + }, + { + "epoch": 2.917001123775887, + "grad_norm": 0.1581335961818695, + "learning_rate": 5.0903855009705514e-06, + "loss": 0.0592, + "step": 9085 + }, + { + "epoch": 2.9173222026007384, + "grad_norm": 0.24517378211021423, + "learning_rate": 5.079396179683383e-06, + "loss": 0.1089, + "step": 9086 + }, + { + "epoch": 2.91764328142559, + "grad_norm": 0.6500630974769592, + "learning_rate": 5.068418424183874e-06, + "loss": 0.3078, + "step": 9087 + }, + { + "epoch": 2.9179643602504415, + "grad_norm": 0.8994437456130981, + "learning_rate": 5.057452235809624e-06, + "loss": 0.509, + "step": 9088 + }, + { + "epoch": 2.918285439075293, + "grad_norm": 0.7024485468864441, + "learning_rate": 5.046497615896806e-06, + "loss": 0.2671, + "step": 9089 + }, + { + "epoch": 2.9186065179001446, + "grad_norm": 0.7663185000419617, + "learning_rate": 5.035554565780265e-06, + "loss": 0.3624, + "step": 9090 + }, + { + "epoch": 2.918927596724996, + "grad_norm": 0.6804545521736145, + "learning_rate": 5.024623086793323e-06, + "loss": 0.2751, + "step": 9091 + }, + { + "epoch": 2.9192486755498477, + "grad_norm": 0.78728187084198, + "learning_rate": 5.013703180267959e-06, + "loss": 0.3272, + "step": 9092 + }, + { + "epoch": 2.919569754374699, + "grad_norm": 0.8548991680145264, + "learning_rate": 5.002794847534764e-06, + "loss": 0.3579, + "step": 9093 + }, + { + "epoch": 2.9198908331995503, + "grad_norm": 1.0403963327407837, + "learning_rate": 4.991898089922819e-06, + "loss": 0.4054, + "step": 9094 + }, + { + "epoch": 2.920211912024402, + "grad_norm": 1.1474127769470215, + "learning_rate": 4.981012908759941e-06, + "loss": 0.4484, + "step": 9095 + }, + { + "epoch": 2.9205329908492534, + "grad_norm": 0.6695740818977356, + "learning_rate": 4.97013930537239e-06, + "loss": 0.2586, + "step": 9096 + }, + { + "epoch": 2.920854069674105, + "grad_norm": 0.9828323125839233, + "learning_rate": 4.959277281085129e-06, + "loss": 0.3797, + "step": 9097 + }, + { + "epoch": 2.9211751484989565, + "grad_norm": 0.9727553725242615, + "learning_rate": 4.948426837221631e-06, + "loss": 0.3328, + "step": 9098 + }, + { + "epoch": 2.921496227323808, + "grad_norm": 0.7968175411224365, + "learning_rate": 4.937587975103996e-06, + "loss": 0.2932, + "step": 9099 + }, + { + "epoch": 2.9218173061486596, + "grad_norm": 0.786050021648407, + "learning_rate": 4.926760696052934e-06, + "loss": 0.3092, + "step": 9100 + }, + { + "epoch": 2.922138384973511, + "grad_norm": 0.9386889338493347, + "learning_rate": 4.915945001387667e-06, + "loss": 0.3844, + "step": 9101 + }, + { + "epoch": 2.9224594637983623, + "grad_norm": 0.8530262112617493, + "learning_rate": 4.905140892426097e-06, + "loss": 0.3504, + "step": 9102 + }, + { + "epoch": 2.922780542623214, + "grad_norm": 0.7439628839492798, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.3001, + "step": 9103 + }, + { + "epoch": 2.9231016214480654, + "grad_norm": 0.8572718501091003, + "learning_rate": 4.8835674368783666e-06, + "loss": 0.3333, + "step": 9104 + }, + { + "epoch": 2.923422700272917, + "grad_norm": 0.9241260886192322, + "learning_rate": 4.872798092920872e-06, + "loss": 0.3437, + "step": 9105 + }, + { + "epoch": 2.9237437790977685, + "grad_norm": 0.8410215973854065, + "learning_rate": 4.862040339924378e-06, + "loss": 0.3783, + "step": 9106 + }, + { + "epoch": 2.92406485792262, + "grad_norm": 1.0775259733200073, + "learning_rate": 4.8512941791996726e-06, + "loss": 0.3497, + "step": 9107 + }, + { + "epoch": 2.9243859367474716, + "grad_norm": 0.4644877016544342, + "learning_rate": 4.840559612056183e-06, + "loss": 0.2042, + "step": 9108 + }, + { + "epoch": 2.924707015572323, + "grad_norm": 0.7072948217391968, + "learning_rate": 4.829836639801843e-06, + "loss": 0.2708, + "step": 9109 + }, + { + "epoch": 2.9250280943971747, + "grad_norm": 0.8432921767234802, + "learning_rate": 4.819125263743229e-06, + "loss": 0.313, + "step": 9110 + }, + { + "epoch": 2.925349173222026, + "grad_norm": 0.8821758031845093, + "learning_rate": 4.808425485185486e-06, + "loss": 0.4026, + "step": 9111 + }, + { + "epoch": 2.9256702520468774, + "grad_norm": 1.0225335359573364, + "learning_rate": 4.797737305432337e-06, + "loss": 0.463, + "step": 9112 + }, + { + "epoch": 2.925991330871729, + "grad_norm": 0.8588323593139648, + "learning_rate": 4.7870607257861415e-06, + "loss": 0.3394, + "step": 9113 + }, + { + "epoch": 2.9263124096965805, + "grad_norm": 0.6533500552177429, + "learning_rate": 4.776395747547757e-06, + "loss": 0.2785, + "step": 9114 + }, + { + "epoch": 2.926633488521432, + "grad_norm": 1.2023701667785645, + "learning_rate": 4.765742372016735e-06, + "loss": 0.2919, + "step": 9115 + }, + { + "epoch": 2.9269545673462836, + "grad_norm": 1.0757431983947754, + "learning_rate": 4.755100600491102e-06, + "loss": 0.4093, + "step": 9116 + }, + { + "epoch": 2.927275646171135, + "grad_norm": 1.0217454433441162, + "learning_rate": 4.744470434267567e-06, + "loss": 0.3653, + "step": 9117 + }, + { + "epoch": 2.9275967249959867, + "grad_norm": 0.9355355501174927, + "learning_rate": 4.733851874641382e-06, + "loss": 0.3976, + "step": 9118 + }, + { + "epoch": 2.927917803820838, + "grad_norm": 1.0048413276672363, + "learning_rate": 4.723244922906356e-06, + "loss": 0.3092, + "step": 9119 + }, + { + "epoch": 2.9282388826456893, + "grad_norm": 0.6854848861694336, + "learning_rate": 4.712649580354933e-06, + "loss": 0.3157, + "step": 9120 + }, + { + "epoch": 2.928559961470541, + "grad_norm": 1.0658174753189087, + "learning_rate": 4.702065848278126e-06, + "loss": 0.3169, + "step": 9121 + }, + { + "epoch": 2.9288810402953924, + "grad_norm": 1.368660807609558, + "learning_rate": 4.691493727965512e-06, + "loss": 0.4592, + "step": 9122 + }, + { + "epoch": 2.929202119120244, + "grad_norm": 0.9535953402519226, + "learning_rate": 4.680933220705308e-06, + "loss": 0.3418, + "step": 9123 + }, + { + "epoch": 2.9295231979450955, + "grad_norm": 0.46225616335868835, + "learning_rate": 4.670384327784239e-06, + "loss": 0.2457, + "step": 9124 + }, + { + "epoch": 2.929844276769947, + "grad_norm": 0.45245954394340515, + "learning_rate": 4.659847050487687e-06, + "loss": 0.248, + "step": 9125 + }, + { + "epoch": 2.9301653555947986, + "grad_norm": 0.48897916078567505, + "learning_rate": 4.64932139009957e-06, + "loss": 0.261, + "step": 9126 + }, + { + "epoch": 2.93048643441965, + "grad_norm": 0.7464796900749207, + "learning_rate": 4.638807347902408e-06, + "loss": 0.2816, + "step": 9127 + }, + { + "epoch": 2.9308075132445017, + "grad_norm": 0.8038036823272705, + "learning_rate": 4.628304925177318e-06, + "loss": 0.3102, + "step": 9128 + }, + { + "epoch": 2.931128592069353, + "grad_norm": 0.49647238850593567, + "learning_rate": 4.617814123203967e-06, + "loss": 0.2667, + "step": 9129 + }, + { + "epoch": 2.9314496708942044, + "grad_norm": 0.3765721917152405, + "learning_rate": 4.607334943260655e-06, + "loss": 0.3785, + "step": 9130 + }, + { + "epoch": 2.931770749719056, + "grad_norm": 0.40889376401901245, + "learning_rate": 4.596867386624215e-06, + "loss": 0.3646, + "step": 9131 + }, + { + "epoch": 2.9320918285439075, + "grad_norm": 0.600988507270813, + "learning_rate": 4.586411454570083e-06, + "loss": 0.3995, + "step": 9132 + }, + { + "epoch": 2.932412907368759, + "grad_norm": 0.4408535659313202, + "learning_rate": 4.575967148372317e-06, + "loss": 0.1431, + "step": 9133 + }, + { + "epoch": 2.9327339861936106, + "grad_norm": 0.45499712228775024, + "learning_rate": 4.5655344693034895e-06, + "loss": 0.2273, + "step": 9134 + }, + { + "epoch": 2.933055065018462, + "grad_norm": 0.5091567039489746, + "learning_rate": 4.555113418634805e-06, + "loss": 0.123, + "step": 9135 + }, + { + "epoch": 2.9333761438433137, + "grad_norm": 0.4760441780090332, + "learning_rate": 4.544703997636046e-06, + "loss": 0.2153, + "step": 9136 + }, + { + "epoch": 2.9336972226681652, + "grad_norm": 0.5263906717300415, + "learning_rate": 4.534306207575545e-06, + "loss": 0.2508, + "step": 9137 + }, + { + "epoch": 2.9340183014930163, + "grad_norm": 0.9112762212753296, + "learning_rate": 4.523920049720265e-06, + "loss": 0.5172, + "step": 9138 + }, + { + "epoch": 2.934339380317868, + "grad_norm": 0.9089072346687317, + "learning_rate": 4.513545525335705e-06, + "loss": 0.4721, + "step": 9139 + }, + { + "epoch": 2.9346604591427194, + "grad_norm": 0.6721816062927246, + "learning_rate": 4.5031826356859876e-06, + "loss": 0.2626, + "step": 9140 + }, + { + "epoch": 2.934981537967571, + "grad_norm": 0.9276089072227478, + "learning_rate": 4.492831382033791e-06, + "loss": 0.3248, + "step": 9141 + }, + { + "epoch": 2.9353026167924225, + "grad_norm": 0.9039839506149292, + "learning_rate": 4.482491765640395e-06, + "loss": 0.3331, + "step": 9142 + }, + { + "epoch": 2.935623695617274, + "grad_norm": 0.983473539352417, + "learning_rate": 4.4721637877656375e-06, + "loss": 0.3217, + "step": 9143 + }, + { + "epoch": 2.9359447744421256, + "grad_norm": 0.7742102146148682, + "learning_rate": 4.461847449667955e-06, + "loss": 0.3411, + "step": 9144 + }, + { + "epoch": 2.936265853266977, + "grad_norm": 0.8041900396347046, + "learning_rate": 4.451542752604365e-06, + "loss": 0.3223, + "step": 9145 + }, + { + "epoch": 2.9365869320918287, + "grad_norm": 0.7607382535934448, + "learning_rate": 4.4412496978304515e-06, + "loss": 0.2994, + "step": 9146 + }, + { + "epoch": 2.93690801091668, + "grad_norm": 0.6458768844604492, + "learning_rate": 4.4309682866004125e-06, + "loss": 0.2517, + "step": 9147 + }, + { + "epoch": 2.9372290897415314, + "grad_norm": 0.8893207311630249, + "learning_rate": 4.420698520166988e-06, + "loss": 0.3428, + "step": 9148 + }, + { + "epoch": 2.937550168566383, + "grad_norm": 1.1111611127853394, + "learning_rate": 4.410440399781534e-06, + "loss": 0.3199, + "step": 9149 + }, + { + "epoch": 2.9378712473912345, + "grad_norm": 0.8783401846885681, + "learning_rate": 4.400193926693952e-06, + "loss": 0.3904, + "step": 9150 + }, + { + "epoch": 2.938192326216086, + "grad_norm": 0.6565034985542297, + "learning_rate": 4.389959102152774e-06, + "loss": 0.2862, + "step": 9151 + }, + { + "epoch": 2.9385134050409376, + "grad_norm": 0.8540425300598145, + "learning_rate": 4.379735927405038e-06, + "loss": 0.2698, + "step": 9152 + }, + { + "epoch": 2.938834483865789, + "grad_norm": 1.0489143133163452, + "learning_rate": 4.369524403696457e-06, + "loss": 0.3294, + "step": 9153 + }, + { + "epoch": 2.9391555626906407, + "grad_norm": 1.0321301221847534, + "learning_rate": 4.3593245322712474e-06, + "loss": 0.351, + "step": 9154 + }, + { + "epoch": 2.9394766415154923, + "grad_norm": 0.7734938263893127, + "learning_rate": 4.349136314372204e-06, + "loss": 0.3194, + "step": 9155 + }, + { + "epoch": 2.9397977203403434, + "grad_norm": 0.8938072919845581, + "learning_rate": 4.338959751240801e-06, + "loss": 0.3961, + "step": 9156 + }, + { + "epoch": 2.940118799165195, + "grad_norm": 0.6044966578483582, + "learning_rate": 4.328794844116946e-06, + "loss": 0.2416, + "step": 9157 + }, + { + "epoch": 2.9404398779900465, + "grad_norm": 0.69122314453125, + "learning_rate": 4.318641594239259e-06, + "loss": 0.2702, + "step": 9158 + }, + { + "epoch": 2.940760956814898, + "grad_norm": 1.0131958723068237, + "learning_rate": 4.308500002844862e-06, + "loss": 0.362, + "step": 9159 + }, + { + "epoch": 2.9410820356397496, + "grad_norm": 0.8207650184631348, + "learning_rate": 4.2983700711694665e-06, + "loss": 0.3119, + "step": 9160 + }, + { + "epoch": 2.941403114464601, + "grad_norm": 0.6501420736312866, + "learning_rate": 4.288251800447385e-06, + "loss": 0.2507, + "step": 9161 + }, + { + "epoch": 2.9417241932894527, + "grad_norm": 0.638103187084198, + "learning_rate": 4.278145191911509e-06, + "loss": 0.279, + "step": 9162 + }, + { + "epoch": 2.942045272114304, + "grad_norm": 0.9142710566520691, + "learning_rate": 4.268050246793276e-06, + "loss": 0.416, + "step": 9163 + }, + { + "epoch": 2.9423663509391558, + "grad_norm": 0.9541748762130737, + "learning_rate": 4.257966966322735e-06, + "loss": 0.314, + "step": 9164 + }, + { + "epoch": 2.942687429764007, + "grad_norm": 0.9409205317497253, + "learning_rate": 4.247895351728504e-06, + "loss": 0.3532, + "step": 9165 + }, + { + "epoch": 2.9430085085888584, + "grad_norm": 0.777114748954773, + "learning_rate": 4.237835404237778e-06, + "loss": 0.3019, + "step": 9166 + }, + { + "epoch": 2.94332958741371, + "grad_norm": 0.796989917755127, + "learning_rate": 4.227787125076332e-06, + "loss": 0.251, + "step": 9167 + }, + { + "epoch": 2.9436506662385615, + "grad_norm": 0.7624577879905701, + "learning_rate": 4.217750515468522e-06, + "loss": 0.2639, + "step": 9168 + }, + { + "epoch": 2.943971745063413, + "grad_norm": 0.6462528109550476, + "learning_rate": 4.207725576637256e-06, + "loss": 0.2605, + "step": 9169 + }, + { + "epoch": 2.9442928238882646, + "grad_norm": 0.8757241368293762, + "learning_rate": 4.197712309804058e-06, + "loss": 0.2975, + "step": 9170 + }, + { + "epoch": 2.944613902713116, + "grad_norm": 0.9458180665969849, + "learning_rate": 4.187710716189042e-06, + "loss": 0.3727, + "step": 9171 + }, + { + "epoch": 2.9449349815379677, + "grad_norm": 0.7695515155792236, + "learning_rate": 4.177720797010831e-06, + "loss": 0.2814, + "step": 9172 + }, + { + "epoch": 2.9452560603628193, + "grad_norm": 1.1303104162216187, + "learning_rate": 4.167742553486675e-06, + "loss": 0.3553, + "step": 9173 + }, + { + "epoch": 2.9455771391876704, + "grad_norm": 0.6950247883796692, + "learning_rate": 4.1577759868324125e-06, + "loss": 0.2848, + "step": 9174 + }, + { + "epoch": 2.945898218012522, + "grad_norm": 0.5095248818397522, + "learning_rate": 4.147821098262405e-06, + "loss": 0.2629, + "step": 9175 + }, + { + "epoch": 2.9462192968373735, + "grad_norm": 0.6144112944602966, + "learning_rate": 4.137877888989672e-06, + "loss": 0.2893, + "step": 9176 + }, + { + "epoch": 2.946540375662225, + "grad_norm": 0.970645546913147, + "learning_rate": 4.127946360225721e-06, + "loss": 0.3173, + "step": 9177 + }, + { + "epoch": 2.9468614544870766, + "grad_norm": 0.5283989310264587, + "learning_rate": 4.118026513180695e-06, + "loss": 0.2829, + "step": 9178 + }, + { + "epoch": 2.947182533311928, + "grad_norm": 0.4269641935825348, + "learning_rate": 4.108118349063306e-06, + "loss": 0.2613, + "step": 9179 + }, + { + "epoch": 2.9475036121367797, + "grad_norm": 0.476938396692276, + "learning_rate": 4.09822186908082e-06, + "loss": 0.64, + "step": 9180 + }, + { + "epoch": 2.9478246909616312, + "grad_norm": 0.4373842477798462, + "learning_rate": 4.088337074439097e-06, + "loss": 0.4771, + "step": 9181 + }, + { + "epoch": 2.948145769786483, + "grad_norm": 0.4393393397331238, + "learning_rate": 4.078463966342571e-06, + "loss": 0.3074, + "step": 9182 + }, + { + "epoch": 2.948466848611334, + "grad_norm": 0.7229725122451782, + "learning_rate": 4.068602545994249e-06, + "loss": 0.4381, + "step": 9183 + }, + { + "epoch": 2.9487879274361855, + "grad_norm": 0.48776933550834656, + "learning_rate": 4.0587528145957235e-06, + "loss": 0.3591, + "step": 9184 + }, + { + "epoch": 2.949109006261037, + "grad_norm": 0.2947094738483429, + "learning_rate": 4.048914773347134e-06, + "loss": 0.1124, + "step": 9185 + }, + { + "epoch": 2.9494300850858886, + "grad_norm": 0.3417683243751526, + "learning_rate": 4.039088423447235e-06, + "loss": 0.1572, + "step": 9186 + }, + { + "epoch": 2.94975116391074, + "grad_norm": 0.2823491394519806, + "learning_rate": 4.029273766093333e-06, + "loss": 0.131, + "step": 9187 + }, + { + "epoch": 2.9500722427355917, + "grad_norm": 0.8070669770240784, + "learning_rate": 4.019470802481307e-06, + "loss": 0.437, + "step": 9188 + }, + { + "epoch": 2.950393321560443, + "grad_norm": 0.7624936699867249, + "learning_rate": 4.009679533805633e-06, + "loss": 0.3359, + "step": 9189 + }, + { + "epoch": 2.9507144003852948, + "grad_norm": 0.7309533357620239, + "learning_rate": 3.999899961259335e-06, + "loss": 0.3002, + "step": 9190 + }, + { + "epoch": 2.9510354792101463, + "grad_norm": 0.8740092515945435, + "learning_rate": 3.990132086034026e-06, + "loss": 0.4231, + "step": 9191 + }, + { + "epoch": 2.9513565580349974, + "grad_norm": 0.9942984580993652, + "learning_rate": 3.9803759093199e-06, + "loss": 0.4098, + "step": 9192 + }, + { + "epoch": 2.951677636859849, + "grad_norm": 0.6029365062713623, + "learning_rate": 3.970631432305694e-06, + "loss": 0.2482, + "step": 9193 + }, + { + "epoch": 2.9519987156847005, + "grad_norm": 0.9003799557685852, + "learning_rate": 3.96089865617878e-06, + "loss": 0.3181, + "step": 9194 + }, + { + "epoch": 2.952319794509552, + "grad_norm": 0.7407945990562439, + "learning_rate": 3.951177582125021e-06, + "loss": 0.2781, + "step": 9195 + }, + { + "epoch": 2.9526408733344036, + "grad_norm": 0.8002344965934753, + "learning_rate": 3.941468211328947e-06, + "loss": 0.3166, + "step": 9196 + }, + { + "epoch": 2.952961952159255, + "grad_norm": 0.7346023321151733, + "learning_rate": 3.931770544973601e-06, + "loss": 0.2666, + "step": 9197 + }, + { + "epoch": 2.9532830309841067, + "grad_norm": 0.8236327767372131, + "learning_rate": 3.922084584240582e-06, + "loss": 0.3224, + "step": 9198 + }, + { + "epoch": 2.953604109808958, + "grad_norm": 0.9807097315788269, + "learning_rate": 3.912410330310156e-06, + "loss": 0.4151, + "step": 9199 + }, + { + "epoch": 2.95392518863381, + "grad_norm": 0.6370247006416321, + "learning_rate": 3.902747784361038e-06, + "loss": 0.2875, + "step": 9200 + }, + { + "epoch": 2.954246267458661, + "grad_norm": 0.8275413513183594, + "learning_rate": 3.893096947570618e-06, + "loss": 0.3529, + "step": 9201 + }, + { + "epoch": 2.9545673462835125, + "grad_norm": 1.0060149431228638, + "learning_rate": 3.883457821114811e-06, + "loss": 0.3392, + "step": 9202 + }, + { + "epoch": 2.954888425108364, + "grad_norm": 0.9771203398704529, + "learning_rate": 3.873830406168111e-06, + "loss": 0.423, + "step": 9203 + }, + { + "epoch": 2.9552095039332156, + "grad_norm": 0.7501790523529053, + "learning_rate": 3.864214703903601e-06, + "loss": 0.2913, + "step": 9204 + }, + { + "epoch": 2.955530582758067, + "grad_norm": 1.0703070163726807, + "learning_rate": 3.8546107154929235e-06, + "loss": 0.4205, + "step": 9205 + }, + { + "epoch": 2.9558516615829187, + "grad_norm": 0.6897678971290588, + "learning_rate": 3.845018442106285e-06, + "loss": 0.2425, + "step": 9206 + }, + { + "epoch": 2.9561727404077702, + "grad_norm": 0.917190670967102, + "learning_rate": 3.835437884912474e-06, + "loss": 0.392, + "step": 9207 + }, + { + "epoch": 2.9564938192326213, + "grad_norm": 1.1160334348678589, + "learning_rate": 3.825869045078867e-06, + "loss": 0.4136, + "step": 9208 + }, + { + "epoch": 2.9568148980574733, + "grad_norm": 1.1579893827438354, + "learning_rate": 3.816311923771387e-06, + "loss": 0.4582, + "step": 9209 + }, + { + "epoch": 2.9571359768823244, + "grad_norm": 0.7560253739356995, + "learning_rate": 3.806766522154548e-06, + "loss": 0.3024, + "step": 9210 + }, + { + "epoch": 2.957457055707176, + "grad_norm": 1.1074497699737549, + "learning_rate": 3.797232841391407e-06, + "loss": 0.4429, + "step": 9211 + }, + { + "epoch": 2.9577781345320275, + "grad_norm": 0.941001832485199, + "learning_rate": 3.787710882643658e-06, + "loss": 0.3496, + "step": 9212 + }, + { + "epoch": 2.958099213356879, + "grad_norm": 0.8937235474586487, + "learning_rate": 3.7782006470714616e-06, + "loss": 0.325, + "step": 9213 + }, + { + "epoch": 2.9584202921817306, + "grad_norm": 1.424865961074829, + "learning_rate": 3.7687021358336683e-06, + "loss": 0.4345, + "step": 9214 + }, + { + "epoch": 2.958741371006582, + "grad_norm": 0.7509331703186035, + "learning_rate": 3.759215350087619e-06, + "loss": 0.2709, + "step": 9215 + }, + { + "epoch": 2.9590624498314337, + "grad_norm": 0.6785710453987122, + "learning_rate": 3.7497402909892342e-06, + "loss": 0.237, + "step": 9216 + }, + { + "epoch": 2.959383528656285, + "grad_norm": 0.7602850794792175, + "learning_rate": 3.7402769596930563e-06, + "loss": 0.3031, + "step": 9217 + }, + { + "epoch": 2.959704607481137, + "grad_norm": 1.575135588645935, + "learning_rate": 3.730825357352119e-06, + "loss": 0.3567, + "step": 9218 + }, + { + "epoch": 2.960025686305988, + "grad_norm": 0.6166884899139404, + "learning_rate": 3.721385485118123e-06, + "loss": 0.2792, + "step": 9219 + }, + { + "epoch": 2.9603467651308395, + "grad_norm": 0.6132175922393799, + "learning_rate": 3.711957344141237e-06, + "loss": 0.2788, + "step": 9220 + }, + { + "epoch": 2.960667843955691, + "grad_norm": 0.7397081255912781, + "learning_rate": 3.7025409355702976e-06, + "loss": 0.3096, + "step": 9221 + }, + { + "epoch": 2.9609889227805426, + "grad_norm": 0.7033382654190063, + "learning_rate": 3.693136260552632e-06, + "loss": 0.2897, + "step": 9222 + }, + { + "epoch": 2.961310001605394, + "grad_norm": 0.62275230884552, + "learning_rate": 3.68374332023419e-06, + "loss": 0.2907, + "step": 9223 + }, + { + "epoch": 2.9616310804302457, + "grad_norm": 0.716752827167511, + "learning_rate": 3.6743621157594555e-06, + "loss": 0.2795, + "step": 9224 + }, + { + "epoch": 2.9619521592550972, + "grad_norm": 1.4299006462097168, + "learning_rate": 3.664992648271526e-06, + "loss": 0.2909, + "step": 9225 + }, + { + "epoch": 2.9622732380799484, + "grad_norm": 0.8173677921295166, + "learning_rate": 3.6556349189120097e-06, + "loss": 0.3142, + "step": 9226 + }, + { + "epoch": 2.9625943169048004, + "grad_norm": 0.9796795845031738, + "learning_rate": 3.6462889288211512e-06, + "loss": 0.3166, + "step": 9227 + }, + { + "epoch": 2.9629153957296515, + "grad_norm": 0.45657387375831604, + "learning_rate": 3.6369546791377052e-06, + "loss": 0.2369, + "step": 9228 + }, + { + "epoch": 2.963236474554503, + "grad_norm": 0.5515535473823547, + "learning_rate": 3.627632170999029e-06, + "loss": 0.2845, + "step": 9229 + }, + { + "epoch": 2.9635575533793546, + "grad_norm": 0.4276106357574463, + "learning_rate": 3.6183214055410586e-06, + "loss": 0.3697, + "step": 9230 + }, + { + "epoch": 2.963878632204206, + "grad_norm": 0.45924896001815796, + "learning_rate": 3.6090223838982417e-06, + "loss": 0.4946, + "step": 9231 + }, + { + "epoch": 2.9641997110290577, + "grad_norm": 0.29669415950775146, + "learning_rate": 3.5997351072036945e-06, + "loss": 0.142, + "step": 9232 + }, + { + "epoch": 2.964520789853909, + "grad_norm": 0.5130549669265747, + "learning_rate": 3.5904595765890005e-06, + "loss": 0.2495, + "step": 9233 + }, + { + "epoch": 2.9648418686787608, + "grad_norm": 0.23706403374671936, + "learning_rate": 3.5811957931843554e-06, + "loss": 0.0929, + "step": 9234 + }, + { + "epoch": 2.965162947503612, + "grad_norm": 0.2668483853340149, + "learning_rate": 3.5719437581185454e-06, + "loss": 0.1031, + "step": 9235 + }, + { + "epoch": 2.965484026328464, + "grad_norm": 0.812019407749176, + "learning_rate": 3.562703472518869e-06, + "loss": 0.3787, + "step": 9236 + }, + { + "epoch": 2.965805105153315, + "grad_norm": 0.8627951741218567, + "learning_rate": 3.553474937511281e-06, + "loss": 0.372, + "step": 9237 + }, + { + "epoch": 2.9661261839781665, + "grad_norm": 0.92494797706604, + "learning_rate": 3.5442581542201923e-06, + "loss": 0.3974, + "step": 9238 + }, + { + "epoch": 2.966447262803018, + "grad_norm": 0.898228108882904, + "learning_rate": 3.5350531237686724e-06, + "loss": 0.2598, + "step": 9239 + }, + { + "epoch": 2.9667683416278696, + "grad_norm": 0.7887357473373413, + "learning_rate": 3.5258598472783233e-06, + "loss": 0.2791, + "step": 9240 + }, + { + "epoch": 2.967089420452721, + "grad_norm": 0.8121463060379028, + "learning_rate": 3.516678325869316e-06, + "loss": 0.3809, + "step": 9241 + }, + { + "epoch": 2.9674104992775727, + "grad_norm": 0.7240415215492249, + "learning_rate": 3.5075085606604e-06, + "loss": 0.3064, + "step": 9242 + }, + { + "epoch": 2.9677315781024243, + "grad_norm": 1.0837162733078003, + "learning_rate": 3.4983505527688586e-06, + "loss": 0.3971, + "step": 9243 + }, + { + "epoch": 2.9680526569272754, + "grad_norm": 0.7732176780700684, + "learning_rate": 3.489204303310578e-06, + "loss": 0.2647, + "step": 9244 + }, + { + "epoch": 2.9683737357521274, + "grad_norm": 0.777204155921936, + "learning_rate": 3.480069813400022e-06, + "loss": 0.3172, + "step": 9245 + }, + { + "epoch": 2.9686948145769785, + "grad_norm": 0.9173356294631958, + "learning_rate": 3.470947084150167e-06, + "loss": 0.3888, + "step": 9246 + }, + { + "epoch": 2.96901589340183, + "grad_norm": 0.6100045442581177, + "learning_rate": 3.461836116672612e-06, + "loss": 0.2627, + "step": 9247 + }, + { + "epoch": 2.9693369722266816, + "grad_norm": 0.8461219668388367, + "learning_rate": 3.452736912077503e-06, + "loss": 0.342, + "step": 9248 + }, + { + "epoch": 2.969658051051533, + "grad_norm": 0.9259991645812988, + "learning_rate": 3.443649471473531e-06, + "loss": 0.4195, + "step": 9249 + }, + { + "epoch": 2.9699791298763847, + "grad_norm": 1.0365054607391357, + "learning_rate": 3.434573795967988e-06, + "loss": 0.3616, + "step": 9250 + }, + { + "epoch": 2.9703002087012362, + "grad_norm": 1.170913815498352, + "learning_rate": 3.425509886666711e-06, + "loss": 0.4531, + "step": 9251 + }, + { + "epoch": 2.970621287526088, + "grad_norm": 0.9523593187332153, + "learning_rate": 3.4164577446741174e-06, + "loss": 0.3797, + "step": 9252 + }, + { + "epoch": 2.970942366350939, + "grad_norm": 0.5297471284866333, + "learning_rate": 3.40741737109318e-06, + "loss": 0.2028, + "step": 9253 + }, + { + "epoch": 2.971263445175791, + "grad_norm": 0.84348064661026, + "learning_rate": 3.3983887670254177e-06, + "loss": 0.3622, + "step": 9254 + }, + { + "epoch": 2.971584524000642, + "grad_norm": 0.9488373398780823, + "learning_rate": 3.389371933570995e-06, + "loss": 0.3887, + "step": 9255 + }, + { + "epoch": 2.9719056028254935, + "grad_norm": 0.7091740369796753, + "learning_rate": 3.380366871828522e-06, + "loss": 0.2702, + "step": 9256 + }, + { + "epoch": 2.972226681650345, + "grad_norm": 0.7415878176689148, + "learning_rate": 3.3713735828952987e-06, + "loss": 0.2945, + "step": 9257 + }, + { + "epoch": 2.9725477604751966, + "grad_norm": 0.7940482497215271, + "learning_rate": 3.3623920678670597e-06, + "loss": 0.2931, + "step": 9258 + }, + { + "epoch": 2.972868839300048, + "grad_norm": 0.745911717414856, + "learning_rate": 3.3534223278382405e-06, + "loss": 0.278, + "step": 9259 + }, + { + "epoch": 2.9731899181248997, + "grad_norm": 0.6489874124526978, + "learning_rate": 3.344464363901756e-06, + "loss": 0.2768, + "step": 9260 + }, + { + "epoch": 2.9735109969497513, + "grad_norm": 0.9306034445762634, + "learning_rate": 3.3355181771490772e-06, + "loss": 0.3243, + "step": 9261 + }, + { + "epoch": 2.9738320757746024, + "grad_norm": 0.9093040823936462, + "learning_rate": 3.3265837686703106e-06, + "loss": 0.3566, + "step": 9262 + }, + { + "epoch": 2.9741531545994544, + "grad_norm": 0.7288286685943604, + "learning_rate": 3.3176611395540626e-06, + "loss": 0.2532, + "step": 9263 + }, + { + "epoch": 2.9744742334243055, + "grad_norm": 1.2597209215164185, + "learning_rate": 3.3087502908875413e-06, + "loss": 0.4041, + "step": 9264 + }, + { + "epoch": 2.974795312249157, + "grad_norm": 0.8495983481407166, + "learning_rate": 3.2998512237565005e-06, + "loss": 0.3545, + "step": 9265 + }, + { + "epoch": 2.9751163910740086, + "grad_norm": 0.9554901719093323, + "learning_rate": 3.290963939245262e-06, + "loss": 0.3384, + "step": 9266 + }, + { + "epoch": 2.97543746989886, + "grad_norm": 0.5555322766304016, + "learning_rate": 3.2820884384367146e-06, + "loss": 0.2401, + "step": 9267 + }, + { + "epoch": 2.9757585487237117, + "grad_norm": 0.5823652744293213, + "learning_rate": 3.273224722412327e-06, + "loss": 0.2434, + "step": 9268 + }, + { + "epoch": 2.9760796275485633, + "grad_norm": 0.7519864439964294, + "learning_rate": 3.2643727922520906e-06, + "loss": 0.2569, + "step": 9269 + }, + { + "epoch": 2.976400706373415, + "grad_norm": 0.6338984966278076, + "learning_rate": 3.2555326490346095e-06, + "loss": 0.2871, + "step": 9270 + }, + { + "epoch": 2.976721785198266, + "grad_norm": 0.7331449389457703, + "learning_rate": 3.246704293837011e-06, + "loss": 0.2932, + "step": 9271 + }, + { + "epoch": 2.977042864023118, + "grad_norm": 0.7755066752433777, + "learning_rate": 3.2378877277350116e-06, + "loss": 0.3737, + "step": 9272 + }, + { + "epoch": 2.977363942847969, + "grad_norm": 1.2024750709533691, + "learning_rate": 3.2290829518028862e-06, + "loss": 0.2638, + "step": 9273 + }, + { + "epoch": 2.9776850216728206, + "grad_norm": 0.7302016019821167, + "learning_rate": 3.2202899671134546e-06, + "loss": 0.2853, + "step": 9274 + }, + { + "epoch": 2.978006100497672, + "grad_norm": 0.6680705547332764, + "learning_rate": 3.2115087747381366e-06, + "loss": 0.2578, + "step": 9275 + }, + { + "epoch": 2.9783271793225237, + "grad_norm": 0.6737269163131714, + "learning_rate": 3.2027393757468773e-06, + "loss": 0.308, + "step": 9276 + }, + { + "epoch": 2.978648258147375, + "grad_norm": 0.5532433986663818, + "learning_rate": 3.19398177120821e-06, + "loss": 0.2525, + "step": 9277 + }, + { + "epoch": 2.9789693369722268, + "grad_norm": 0.49847257137298584, + "learning_rate": 3.1852359621892367e-06, + "loss": 0.2842, + "step": 9278 + }, + { + "epoch": 2.9792904157970783, + "grad_norm": 0.4007408022880554, + "learning_rate": 3.1765019497555616e-06, + "loss": 0.2705, + "step": 9279 + }, + { + "epoch": 2.9796114946219294, + "grad_norm": 0.4380526542663574, + "learning_rate": 3.1677797349714544e-06, + "loss": 0.4338, + "step": 9280 + }, + { + "epoch": 2.9799325734467814, + "grad_norm": 0.45914846658706665, + "learning_rate": 3.1590693188996323e-06, + "loss": 0.33, + "step": 9281 + }, + { + "epoch": 2.9802536522716325, + "grad_norm": 0.43808650970458984, + "learning_rate": 3.1503707026014906e-06, + "loss": 0.3765, + "step": 9282 + }, + { + "epoch": 2.980574731096484, + "grad_norm": 0.36031296849250793, + "learning_rate": 3.1416838871368924e-06, + "loss": 0.1771, + "step": 9283 + }, + { + "epoch": 2.9808958099213356, + "grad_norm": 0.28183841705322266, + "learning_rate": 3.1330088735643025e-06, + "loss": 0.1466, + "step": 9284 + }, + { + "epoch": 2.981216888746187, + "grad_norm": 0.4093777537345886, + "learning_rate": 3.124345662940764e-06, + "loss": 0.1417, + "step": 9285 + }, + { + "epoch": 2.9815379675710387, + "grad_norm": 0.3650287687778473, + "learning_rate": 3.115694256321855e-06, + "loss": 0.1786, + "step": 9286 + }, + { + "epoch": 2.9818590463958903, + "grad_norm": 0.7063071131706238, + "learning_rate": 3.1070546547617095e-06, + "loss": 0.3095, + "step": 9287 + }, + { + "epoch": 2.982180125220742, + "grad_norm": 0.9610477685928345, + "learning_rate": 3.0984268593130528e-06, + "loss": 0.4365, + "step": 9288 + }, + { + "epoch": 2.982501204045593, + "grad_norm": 0.9653912782669067, + "learning_rate": 3.0898108710271434e-06, + "loss": 0.3772, + "step": 9289 + }, + { + "epoch": 2.982822282870445, + "grad_norm": 0.7585698962211609, + "learning_rate": 3.081206690953831e-06, + "loss": 0.314, + "step": 9290 + }, + { + "epoch": 2.983143361695296, + "grad_norm": 0.867253839969635, + "learning_rate": 3.072614320141487e-06, + "loss": 0.3468, + "step": 9291 + }, + { + "epoch": 2.9834644405201476, + "grad_norm": 0.6894168257713318, + "learning_rate": 3.064033759637064e-06, + "loss": 0.2712, + "step": 9292 + }, + { + "epoch": 2.983785519344999, + "grad_norm": 0.5496251583099365, + "learning_rate": 3.0554650104861136e-06, + "loss": 0.2125, + "step": 9293 + }, + { + "epoch": 2.9841065981698507, + "grad_norm": 0.7446759343147278, + "learning_rate": 3.046908073732668e-06, + "loss": 0.3081, + "step": 9294 + }, + { + "epoch": 2.9844276769947022, + "grad_norm": 0.8124382495880127, + "learning_rate": 3.0383629504194046e-06, + "loss": 0.281, + "step": 9295 + }, + { + "epoch": 2.984748755819554, + "grad_norm": 1.0698877573013306, + "learning_rate": 3.0298296415874894e-06, + "loss": 0.3884, + "step": 9296 + }, + { + "epoch": 2.9850698346444053, + "grad_norm": 0.9363170862197876, + "learning_rate": 3.0213081482766805e-06, + "loss": 0.3386, + "step": 9297 + }, + { + "epoch": 2.9853909134692564, + "grad_norm": 0.909624457359314, + "learning_rate": 3.012798471525324e-06, + "loss": 0.3767, + "step": 9298 + }, + { + "epoch": 2.9857119922941084, + "grad_norm": 0.8160519003868103, + "learning_rate": 3.0043006123702697e-06, + "loss": 0.4078, + "step": 9299 + }, + { + "epoch": 2.9860330711189595, + "grad_norm": 0.7235801219940186, + "learning_rate": 2.9958145718469777e-06, + "loss": 0.3223, + "step": 9300 + }, + { + "epoch": 2.986354149943811, + "grad_norm": 0.9839329719543457, + "learning_rate": 2.9873403509894203e-06, + "loss": 0.4005, + "step": 9301 + }, + { + "epoch": 2.9866752287686626, + "grad_norm": 0.7384074926376343, + "learning_rate": 2.978877950830172e-06, + "loss": 0.2616, + "step": 9302 + }, + { + "epoch": 2.986996307593514, + "grad_norm": 0.7919383645057678, + "learning_rate": 2.970427372400353e-06, + "loss": 0.3024, + "step": 9303 + }, + { + "epoch": 2.9873173864183657, + "grad_norm": 0.8400102257728577, + "learning_rate": 2.9619886167296384e-06, + "loss": 0.3372, + "step": 9304 + }, + { + "epoch": 2.9876384652432173, + "grad_norm": 0.610448956489563, + "learning_rate": 2.953561684846262e-06, + "loss": 0.2516, + "step": 9305 + }, + { + "epoch": 2.987959544068069, + "grad_norm": 0.9138132333755493, + "learning_rate": 2.9451465777770247e-06, + "loss": 0.3427, + "step": 9306 + }, + { + "epoch": 2.98828062289292, + "grad_norm": 1.5538538694381714, + "learning_rate": 2.936743296547273e-06, + "loss": 0.4386, + "step": 9307 + }, + { + "epoch": 2.988601701717772, + "grad_norm": 0.7500302195549011, + "learning_rate": 2.928351842180921e-06, + "loss": 0.2692, + "step": 9308 + }, + { + "epoch": 2.988922780542623, + "grad_norm": 0.8792135119438171, + "learning_rate": 2.9199722157004616e-06, + "loss": 0.3637, + "step": 9309 + }, + { + "epoch": 2.9892438593674746, + "grad_norm": 0.4652367830276489, + "learning_rate": 2.9116044181269007e-06, + "loss": 0.2135, + "step": 9310 + }, + { + "epoch": 2.989564938192326, + "grad_norm": 0.9699134826660156, + "learning_rate": 2.9032484504798452e-06, + "loss": 0.3932, + "step": 9311 + }, + { + "epoch": 2.9898860170171777, + "grad_norm": 0.7483917474746704, + "learning_rate": 2.8949043137774355e-06, + "loss": 0.3142, + "step": 9312 + }, + { + "epoch": 2.9902070958420293, + "grad_norm": 0.6890571117401123, + "learning_rate": 2.8865720090364034e-06, + "loss": 0.3233, + "step": 9313 + }, + { + "epoch": 2.990528174666881, + "grad_norm": 0.7219890356063843, + "learning_rate": 2.878251537271981e-06, + "loss": 0.254, + "step": 9314 + }, + { + "epoch": 2.9908492534917324, + "grad_norm": 0.8167535662651062, + "learning_rate": 2.8699428994980017e-06, + "loss": 0.2907, + "step": 9315 + }, + { + "epoch": 2.9911703323165835, + "grad_norm": 0.8420015573501587, + "learning_rate": 2.8616460967268667e-06, + "loss": 0.3101, + "step": 9316 + }, + { + "epoch": 2.9914914111414355, + "grad_norm": 0.5992588996887207, + "learning_rate": 2.8533611299694783e-06, + "loss": 0.2537, + "step": 9317 + }, + { + "epoch": 2.9918124899662866, + "grad_norm": 0.9275438189506531, + "learning_rate": 2.845088000235396e-06, + "loss": 0.2914, + "step": 9318 + }, + { + "epoch": 2.992133568791138, + "grad_norm": 0.6742895245552063, + "learning_rate": 2.836826708532603e-06, + "loss": 0.2999, + "step": 9319 + }, + { + "epoch": 2.9924546476159897, + "grad_norm": 0.9248641729354858, + "learning_rate": 2.8285772558677705e-06, + "loss": 0.3232, + "step": 9320 + }, + { + "epoch": 2.992775726440841, + "grad_norm": 0.8363873362541199, + "learning_rate": 2.8203396432460506e-06, + "loss": 0.3208, + "step": 9321 + }, + { + "epoch": 2.9930968052656928, + "grad_norm": 0.5395601987838745, + "learning_rate": 2.8121138716711404e-06, + "loss": 0.2489, + "step": 9322 + }, + { + "epoch": 2.9934178840905443, + "grad_norm": 0.5238479375839233, + "learning_rate": 2.8038999421453826e-06, + "loss": 0.2793, + "step": 9323 + }, + { + "epoch": 2.993738962915396, + "grad_norm": 0.9257795214653015, + "learning_rate": 2.7956978556695767e-06, + "loss": 0.3502, + "step": 9324 + }, + { + "epoch": 2.994060041740247, + "grad_norm": 0.7510598301887512, + "learning_rate": 2.7875076132431344e-06, + "loss": 0.3333, + "step": 9325 + }, + { + "epoch": 2.994381120565099, + "grad_norm": 0.5098982453346252, + "learning_rate": 2.7793292158640126e-06, + "loss": 0.2552, + "step": 9326 + }, + { + "epoch": 2.99470219938995, + "grad_norm": 0.6713955402374268, + "learning_rate": 2.771162664528726e-06, + "loss": 0.2947, + "step": 9327 + }, + { + "epoch": 2.9950232782148016, + "grad_norm": 0.7259739637374878, + "learning_rate": 2.7630079602323442e-06, + "loss": 0.3072, + "step": 9328 + }, + { + "epoch": 2.995344357039653, + "grad_norm": 0.29132696986198425, + "learning_rate": 2.7548651039684846e-06, + "loss": 0.247, + "step": 9329 + }, + { + "epoch": 2.9956654358645047, + "grad_norm": 0.37000009417533875, + "learning_rate": 2.746734096729342e-06, + "loss": 0.2994, + "step": 9330 + }, + { + "epoch": 2.9959865146893563, + "grad_norm": 0.6013002395629883, + "learning_rate": 2.738614939505646e-06, + "loss": 0.2101, + "step": 9331 + }, + { + "epoch": 2.996307593514208, + "grad_norm": 0.7807649970054626, + "learning_rate": 2.7305076332867054e-06, + "loss": 0.3249, + "step": 9332 + }, + { + "epoch": 2.9966286723390594, + "grad_norm": 0.8677118420600891, + "learning_rate": 2.7224121790603517e-06, + "loss": 0.3319, + "step": 9333 + }, + { + "epoch": 2.9969497511639105, + "grad_norm": 0.7280771136283875, + "learning_rate": 2.714328577812997e-06, + "loss": 0.3257, + "step": 9334 + }, + { + "epoch": 2.9972708299887625, + "grad_norm": 1.6487222909927368, + "learning_rate": 2.7062568305295965e-06, + "loss": 0.2733, + "step": 9335 + }, + { + "epoch": 2.9975919088136136, + "grad_norm": 0.6912175416946411, + "learning_rate": 2.6981969381936978e-06, + "loss": 0.2817, + "step": 9336 + }, + { + "epoch": 2.997912987638465, + "grad_norm": 0.8025344610214233, + "learning_rate": 2.690148901787337e-06, + "loss": 0.327, + "step": 9337 + }, + { + "epoch": 2.9982340664633167, + "grad_norm": 0.6244665384292603, + "learning_rate": 2.6821127222911857e-06, + "loss": 0.2552, + "step": 9338 + }, + { + "epoch": 2.9985551452881682, + "grad_norm": 0.8349512219429016, + "learning_rate": 2.6740884006843825e-06, + "loss": 0.3398, + "step": 9339 + }, + { + "epoch": 2.99887622411302, + "grad_norm": 0.9518395066261292, + "learning_rate": 2.66607593794469e-06, + "loss": 0.3202, + "step": 9340 + }, + { + "epoch": 2.9991973029378713, + "grad_norm": 0.6690359115600586, + "learning_rate": 2.6580753350484046e-06, + "loss": 0.2847, + "step": 9341 + }, + { + "epoch": 2.999518381762723, + "grad_norm": 0.6814465522766113, + "learning_rate": 2.650086592970358e-06, + "loss": 0.28, + "step": 9342 + }, + { + "epoch": 2.999839460587574, + "grad_norm": 1.4310189485549927, + "learning_rate": 2.6421097126839712e-06, + "loss": 0.4217, + "step": 9343 + }, + { + "epoch": 3.0001605394124256, + "grad_norm": 0.43879497051239014, + "learning_rate": 2.6341446951612005e-06, + "loss": 0.3707, + "step": 9344 + }, + { + "epoch": 3.000481618237277, + "grad_norm": 0.40804460644721985, + "learning_rate": 2.6261915413725578e-06, + "loss": 0.4423, + "step": 9345 + }, + { + "epoch": 3.0008026970621287, + "grad_norm": 0.38268548250198364, + "learning_rate": 2.618250252287113e-06, + "loss": 0.2687, + "step": 9346 + }, + { + "epoch": 3.00112377588698, + "grad_norm": 0.2906617224216461, + "learning_rate": 2.610320828872481e-06, + "loss": 0.1438, + "step": 9347 + }, + { + "epoch": 3.0014448547118318, + "grad_norm": 0.2549183964729309, + "learning_rate": 2.6024032720948443e-06, + "loss": 0.1358, + "step": 9348 + }, + { + "epoch": 3.0017659335366833, + "grad_norm": 0.3139497637748718, + "learning_rate": 2.59449758291892e-06, + "loss": 0.1061, + "step": 9349 + }, + { + "epoch": 3.002087012361535, + "grad_norm": 0.11320850253105164, + "learning_rate": 2.5866037623080153e-06, + "loss": 0.0611, + "step": 9350 + }, + { + "epoch": 3.0024080911863864, + "grad_norm": 0.5909608602523804, + "learning_rate": 2.5787218112239496e-06, + "loss": 0.3883, + "step": 9351 + }, + { + "epoch": 3.002729170011238, + "grad_norm": 0.5687462687492371, + "learning_rate": 2.570851730627122e-06, + "loss": 0.2654, + "step": 9352 + }, + { + "epoch": 3.003050248836089, + "grad_norm": 0.6400092840194702, + "learning_rate": 2.5629935214764865e-06, + "loss": 0.302, + "step": 9353 + }, + { + "epoch": 3.0033713276609406, + "grad_norm": 0.46738189458847046, + "learning_rate": 2.5551471847295228e-06, + "loss": 0.1982, + "step": 9354 + }, + { + "epoch": 3.003692406485792, + "grad_norm": 0.5949174165725708, + "learning_rate": 2.5473127213422763e-06, + "loss": 0.2598, + "step": 9355 + }, + { + "epoch": 3.0040134853106437, + "grad_norm": 1.0250693559646606, + "learning_rate": 2.5394901322694067e-06, + "loss": 0.2021, + "step": 9356 + }, + { + "epoch": 3.0043345641354953, + "grad_norm": 0.5591093897819519, + "learning_rate": 2.531679418464006e-06, + "loss": 0.2286, + "step": 9357 + }, + { + "epoch": 3.004655642960347, + "grad_norm": 0.5650901198387146, + "learning_rate": 2.5238805808778242e-06, + "loss": 0.2416, + "step": 9358 + }, + { + "epoch": 3.0049767217851984, + "grad_norm": 0.5326758027076721, + "learning_rate": 2.516093620461124e-06, + "loss": 0.2221, + "step": 9359 + }, + { + "epoch": 3.00529780061005, + "grad_norm": 0.6107519865036011, + "learning_rate": 2.508318538162702e-06, + "loss": 0.2568, + "step": 9360 + }, + { + "epoch": 3.0056188794349015, + "grad_norm": 0.5060980319976807, + "learning_rate": 2.5005553349299547e-06, + "loss": 0.228, + "step": 9361 + }, + { + "epoch": 3.0059399582597526, + "grad_norm": 0.5858919024467468, + "learning_rate": 2.4928040117087827e-06, + "loss": 0.2552, + "step": 9362 + }, + { + "epoch": 3.006261037084604, + "grad_norm": 0.5963205099105835, + "learning_rate": 2.4850645694436736e-06, + "loss": 0.2336, + "step": 9363 + }, + { + "epoch": 3.0065821159094557, + "grad_norm": 0.5636211633682251, + "learning_rate": 2.4773370090776626e-06, + "loss": 0.1921, + "step": 9364 + }, + { + "epoch": 3.0069031947343072, + "grad_norm": 0.5564295649528503, + "learning_rate": 2.4696213315523074e-06, + "loss": 0.2321, + "step": 9365 + }, + { + "epoch": 3.007224273559159, + "grad_norm": 0.5773066282272339, + "learning_rate": 2.4619175378077565e-06, + "loss": 0.2308, + "step": 9366 + }, + { + "epoch": 3.0075453523840103, + "grad_norm": 0.6271586418151855, + "learning_rate": 2.4542256287826914e-06, + "loss": 0.2407, + "step": 9367 + }, + { + "epoch": 3.007866431208862, + "grad_norm": 0.5038358569145203, + "learning_rate": 2.446545605414341e-06, + "loss": 0.2377, + "step": 9368 + }, + { + "epoch": 3.0081875100337134, + "grad_norm": 0.6960586905479431, + "learning_rate": 2.4388774686385007e-06, + "loss": 0.2911, + "step": 9369 + }, + { + "epoch": 3.008508588858565, + "grad_norm": 0.5753246545791626, + "learning_rate": 2.4312212193895125e-06, + "loss": 0.2706, + "step": 9370 + }, + { + "epoch": 3.008829667683416, + "grad_norm": 0.6433031558990479, + "learning_rate": 2.4235768586002517e-06, + "loss": 0.2616, + "step": 9371 + }, + { + "epoch": 3.0091507465082676, + "grad_norm": 0.615394115447998, + "learning_rate": 2.415944387202174e-06, + "loss": 0.2654, + "step": 9372 + }, + { + "epoch": 3.009471825333119, + "grad_norm": 0.7627741694450378, + "learning_rate": 2.4083238061252567e-06, + "loss": 0.318, + "step": 9373 + }, + { + "epoch": 3.0097929041579707, + "grad_norm": 0.6070566773414612, + "learning_rate": 2.40071511629808e-06, + "loss": 0.2499, + "step": 9374 + }, + { + "epoch": 3.0101139829828223, + "grad_norm": 0.8189682364463806, + "learning_rate": 2.3931183186477026e-06, + "loss": 0.2997, + "step": 9375 + }, + { + "epoch": 3.010435061807674, + "grad_norm": 0.6750493049621582, + "learning_rate": 2.385533414099783e-06, + "loss": 0.2817, + "step": 9376 + }, + { + "epoch": 3.0107561406325254, + "grad_norm": 0.7114454507827759, + "learning_rate": 2.3779604035785273e-06, + "loss": 0.3113, + "step": 9377 + }, + { + "epoch": 3.011077219457377, + "grad_norm": 0.6532655954360962, + "learning_rate": 2.3703992880066638e-06, + "loss": 0.2617, + "step": 9378 + }, + { + "epoch": 3.0113982982822285, + "grad_norm": 0.44464078545570374, + "learning_rate": 2.3628500683055222e-06, + "loss": 0.2033, + "step": 9379 + }, + { + "epoch": 3.0117193771070796, + "grad_norm": 0.6115095615386963, + "learning_rate": 2.355312745394922e-06, + "loss": 0.2694, + "step": 9380 + }, + { + "epoch": 3.012040455931931, + "grad_norm": 0.3214268982410431, + "learning_rate": 2.3477873201932734e-06, + "loss": 0.1775, + "step": 9381 + }, + { + "epoch": 3.0123615347567827, + "grad_norm": 0.6672443151473999, + "learning_rate": 2.3402737936175425e-06, + "loss": 0.2725, + "step": 9382 + }, + { + "epoch": 3.0126826135816343, + "grad_norm": 0.5313708782196045, + "learning_rate": 2.332772166583208e-06, + "loss": 0.2661, + "step": 9383 + }, + { + "epoch": 3.013003692406486, + "grad_norm": 0.7770663499832153, + "learning_rate": 2.325282440004339e-06, + "loss": 0.3296, + "step": 9384 + }, + { + "epoch": 3.0133247712313374, + "grad_norm": 0.4733831584453583, + "learning_rate": 2.3178046147935175e-06, + "loss": 0.2151, + "step": 9385 + }, + { + "epoch": 3.013645850056189, + "grad_norm": 0.5805613398551941, + "learning_rate": 2.3103386918619018e-06, + "loss": 0.2395, + "step": 9386 + }, + { + "epoch": 3.0139669288810405, + "grad_norm": 0.6747123003005981, + "learning_rate": 2.3028846721191876e-06, + "loss": 0.3138, + "step": 9387 + }, + { + "epoch": 3.014288007705892, + "grad_norm": 0.4454551637172699, + "learning_rate": 2.295442556473637e-06, + "loss": 0.231, + "step": 9388 + }, + { + "epoch": 3.014609086530743, + "grad_norm": 0.5723088979721069, + "learning_rate": 2.288012345832047e-06, + "loss": 0.2234, + "step": 9389 + }, + { + "epoch": 3.0149301653555947, + "grad_norm": 0.5944519639015198, + "learning_rate": 2.2805940410997484e-06, + "loss": 0.2433, + "step": 9390 + }, + { + "epoch": 3.015251244180446, + "grad_norm": 0.32701706886291504, + "learning_rate": 2.273187643180652e-06, + "loss": 0.2194, + "step": 9391 + }, + { + "epoch": 3.0155723230052978, + "grad_norm": 0.37137144804000854, + "learning_rate": 2.2657931529772136e-06, + "loss": 0.2192, + "step": 9392 + }, + { + "epoch": 3.0158934018301493, + "grad_norm": 0.4273146092891693, + "learning_rate": 2.2584105713904125e-06, + "loss": 0.2554, + "step": 9393 + }, + { + "epoch": 3.016214480655001, + "grad_norm": 0.357513964176178, + "learning_rate": 2.2510398993198067e-06, + "loss": 0.393, + "step": 9394 + }, + { + "epoch": 3.0165355594798524, + "grad_norm": 0.5867639183998108, + "learning_rate": 2.2436811376634893e-06, + "loss": 0.7159, + "step": 9395 + }, + { + "epoch": 3.016856638304704, + "grad_norm": 0.3277376890182495, + "learning_rate": 2.2363342873180757e-06, + "loss": 0.2004, + "step": 9396 + }, + { + "epoch": 3.0171777171295555, + "grad_norm": 0.2729596793651581, + "learning_rate": 2.2289993491788064e-06, + "loss": 0.1336, + "step": 9397 + }, + { + "epoch": 3.0174987959544066, + "grad_norm": 0.2100820392370224, + "learning_rate": 2.2216763241393767e-06, + "loss": 0.092, + "step": 9398 + }, + { + "epoch": 3.017819874779258, + "grad_norm": 0.24260498583316803, + "learning_rate": 2.2143652130921176e-06, + "loss": 0.0919, + "step": 9399 + }, + { + "epoch": 3.0181409536041097, + "grad_norm": 0.4891859292984009, + "learning_rate": 2.2070660169278166e-06, + "loss": 0.2427, + "step": 9400 + }, + { + "epoch": 3.0184620324289613, + "grad_norm": 0.6580247282981873, + "learning_rate": 2.1997787365358958e-06, + "loss": 0.3484, + "step": 9401 + }, + { + "epoch": 3.018783111253813, + "grad_norm": 0.6362596154212952, + "learning_rate": 2.192503372804278e-06, + "loss": 0.2941, + "step": 9402 + }, + { + "epoch": 3.0191041900786644, + "grad_norm": 0.7072274088859558, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.3115, + "step": 9403 + }, + { + "epoch": 3.019425268903516, + "grad_norm": 0.5164889097213745, + "learning_rate": 2.177988398866415e-06, + "loss": 0.2195, + "step": 9404 + }, + { + "epoch": 3.0197463477283675, + "grad_norm": 0.5418469905853271, + "learning_rate": 2.1707487904287672e-06, + "loss": 0.2464, + "step": 9405 + }, + { + "epoch": 3.020067426553219, + "grad_norm": 0.6452725529670715, + "learning_rate": 2.163521102188648e-06, + "loss": 0.2784, + "step": 9406 + }, + { + "epoch": 3.02038850537807, + "grad_norm": 0.5608880519866943, + "learning_rate": 2.156305335026698e-06, + "loss": 0.2553, + "step": 9407 + }, + { + "epoch": 3.0207095842029217, + "grad_norm": 0.4648946225643158, + "learning_rate": 2.1491014898221582e-06, + "loss": 0.1944, + "step": 9408 + }, + { + "epoch": 3.0210306630277732, + "grad_norm": 0.5879479050636292, + "learning_rate": 2.141909567452793e-06, + "loss": 0.2373, + "step": 9409 + }, + { + "epoch": 3.021351741852625, + "grad_norm": 0.6517860889434814, + "learning_rate": 2.134729568794902e-06, + "loss": 0.2796, + "step": 9410 + }, + { + "epoch": 3.0216728206774763, + "grad_norm": 1.0598267316818237, + "learning_rate": 2.1275614947233624e-06, + "loss": 0.249, + "step": 9411 + }, + { + "epoch": 3.021993899502328, + "grad_norm": 0.6839745044708252, + "learning_rate": 2.120405346111576e-06, + "loss": 0.269, + "step": 9412 + }, + { + "epoch": 3.0223149783271794, + "grad_norm": 0.6479743123054504, + "learning_rate": 2.1132611238315003e-06, + "loss": 0.2993, + "step": 9413 + }, + { + "epoch": 3.022636057152031, + "grad_norm": 0.5197052955627441, + "learning_rate": 2.1061288287536285e-06, + "loss": 0.2089, + "step": 9414 + }, + { + "epoch": 3.0229571359768825, + "grad_norm": 0.5962384939193726, + "learning_rate": 2.0990084617470206e-06, + "loss": 0.2327, + "step": 9415 + }, + { + "epoch": 3.0232782148017336, + "grad_norm": 1.2533619403839111, + "learning_rate": 2.0919000236792607e-06, + "loss": 0.3158, + "step": 9416 + }, + { + "epoch": 3.023599293626585, + "grad_norm": 0.5709370374679565, + "learning_rate": 2.084803515416511e-06, + "loss": 0.2074, + "step": 9417 + }, + { + "epoch": 3.0239203724514367, + "grad_norm": 0.8780946731567383, + "learning_rate": 2.0777189378234143e-06, + "loss": 0.356, + "step": 9418 + }, + { + "epoch": 3.0242414512762883, + "grad_norm": 0.6024912595748901, + "learning_rate": 2.0706462917632673e-06, + "loss": 0.2511, + "step": 9419 + }, + { + "epoch": 3.02456253010114, + "grad_norm": 0.6366291046142578, + "learning_rate": 2.0635855780978044e-06, + "loss": 0.2302, + "step": 9420 + }, + { + "epoch": 3.0248836089259914, + "grad_norm": 0.6438530683517456, + "learning_rate": 2.0565367976873584e-06, + "loss": 0.2584, + "step": 9421 + }, + { + "epoch": 3.025204687750843, + "grad_norm": 0.5729436874389648, + "learning_rate": 2.049499951390832e-06, + "loss": 0.237, + "step": 9422 + }, + { + "epoch": 3.0255257665756945, + "grad_norm": 0.7282575964927673, + "learning_rate": 2.0424750400655947e-06, + "loss": 0.2925, + "step": 9423 + }, + { + "epoch": 3.025846845400546, + "grad_norm": 0.5626926422119141, + "learning_rate": 2.0354620645676504e-06, + "loss": 0.2095, + "step": 9424 + }, + { + "epoch": 3.026167924225397, + "grad_norm": 0.5786295533180237, + "learning_rate": 2.0284610257514937e-06, + "loss": 0.2728, + "step": 9425 + }, + { + "epoch": 3.0264890030502487, + "grad_norm": 0.48927032947540283, + "learning_rate": 2.021471924470175e-06, + "loss": 0.201, + "step": 9426 + }, + { + "epoch": 3.0268100818751003, + "grad_norm": 0.5696834921836853, + "learning_rate": 2.014494761575314e-06, + "loss": 0.2512, + "step": 9427 + }, + { + "epoch": 3.027131160699952, + "grad_norm": 0.32591769099235535, + "learning_rate": 2.0075295379170412e-06, + "loss": 0.1816, + "step": 9428 + }, + { + "epoch": 3.0274522395248034, + "grad_norm": 0.4733005464076996, + "learning_rate": 2.0005762543440445e-06, + "loss": 0.2298, + "step": 9429 + }, + { + "epoch": 3.027773318349655, + "grad_norm": 0.42047736048698425, + "learning_rate": 1.993634911703579e-06, + "loss": 0.1959, + "step": 9430 + }, + { + "epoch": 3.0280943971745065, + "grad_norm": 0.5498672127723694, + "learning_rate": 1.986705510841402e-06, + "loss": 0.2253, + "step": 9431 + }, + { + "epoch": 3.028415475999358, + "grad_norm": 0.5200008153915405, + "learning_rate": 1.979788052601861e-06, + "loss": 0.2527, + "step": 9432 + }, + { + "epoch": 3.0287365548242096, + "grad_norm": 0.5989855527877808, + "learning_rate": 1.9728825378278246e-06, + "loss": 0.2299, + "step": 9433 + }, + { + "epoch": 3.0290576336490607, + "grad_norm": 0.48008453845977783, + "learning_rate": 1.965988967360688e-06, + "loss": 0.2221, + "step": 9434 + }, + { + "epoch": 3.029378712473912, + "grad_norm": 0.6560676097869873, + "learning_rate": 1.9591073420404337e-06, + "loss": 0.2343, + "step": 9435 + }, + { + "epoch": 3.0296997912987638, + "grad_norm": 0.5400110483169556, + "learning_rate": 1.9522376627055583e-06, + "loss": 0.2449, + "step": 9436 + }, + { + "epoch": 3.0300208701236153, + "grad_norm": 0.4094994366168976, + "learning_rate": 1.945379930193125e-06, + "loss": 0.2177, + "step": 9437 + }, + { + "epoch": 3.030341948948467, + "grad_norm": 0.37665337324142456, + "learning_rate": 1.9385341453386995e-06, + "loss": 0.2032, + "step": 9438 + }, + { + "epoch": 3.0306630277733184, + "grad_norm": 0.6781096458435059, + "learning_rate": 1.931700308976436e-06, + "loss": 0.2793, + "step": 9439 + }, + { + "epoch": 3.03098410659817, + "grad_norm": 0.43711480498313904, + "learning_rate": 1.924878421939036e-06, + "loss": 0.2372, + "step": 9440 + }, + { + "epoch": 3.0313051854230215, + "grad_norm": 0.8434777855873108, + "learning_rate": 1.918068485057689e-06, + "loss": 0.2299, + "step": 9441 + }, + { + "epoch": 3.031626264247873, + "grad_norm": 0.38669654726982117, + "learning_rate": 1.911270499162199e-06, + "loss": 0.2403, + "step": 9442 + }, + { + "epoch": 3.031947343072724, + "grad_norm": 0.29554906487464905, + "learning_rate": 1.904484465080847e-06, + "loss": 0.2357, + "step": 9443 + }, + { + "epoch": 3.0322684218975757, + "grad_norm": 0.5543558597564697, + "learning_rate": 1.8977103836405053e-06, + "loss": 0.3659, + "step": 9444 + }, + { + "epoch": 3.0325895007224273, + "grad_norm": 0.2872603237628937, + "learning_rate": 1.8909482556666024e-06, + "loss": 0.2276, + "step": 9445 + }, + { + "epoch": 3.032910579547279, + "grad_norm": 0.3142801523208618, + "learning_rate": 1.8841980819830351e-06, + "loss": 0.1512, + "step": 9446 + }, + { + "epoch": 3.0332316583721304, + "grad_norm": 0.301505446434021, + "learning_rate": 1.8774598634123232e-06, + "loss": 0.1045, + "step": 9447 + }, + { + "epoch": 3.033552737196982, + "grad_norm": 0.5997371077537537, + "learning_rate": 1.8707336007754873e-06, + "loss": 0.2805, + "step": 9448 + }, + { + "epoch": 3.0338738160218335, + "grad_norm": 0.7243043184280396, + "learning_rate": 1.8640192948921054e-06, + "loss": 0.3329, + "step": 9449 + }, + { + "epoch": 3.034194894846685, + "grad_norm": 0.6885005831718445, + "learning_rate": 1.8573169465802898e-06, + "loss": 0.3327, + "step": 9450 + }, + { + "epoch": 3.034515973671536, + "grad_norm": 0.6366089582443237, + "learning_rate": 1.8506265566567094e-06, + "loss": 0.2606, + "step": 9451 + }, + { + "epoch": 3.0348370524963877, + "grad_norm": 0.5336918234825134, + "learning_rate": 1.8439481259365675e-06, + "loss": 0.2271, + "step": 9452 + }, + { + "epoch": 3.0351581313212392, + "grad_norm": 0.4865361154079437, + "learning_rate": 1.8372816552336026e-06, + "loss": 0.1939, + "step": 9453 + }, + { + "epoch": 3.035479210146091, + "grad_norm": 0.6160674095153809, + "learning_rate": 1.8306271453601199e-06, + "loss": 0.2288, + "step": 9454 + }, + { + "epoch": 3.0358002889709423, + "grad_norm": 0.49999356269836426, + "learning_rate": 1.8239845971269266e-06, + "loss": 0.1962, + "step": 9455 + }, + { + "epoch": 3.036121367795794, + "grad_norm": 0.8584988713264465, + "learning_rate": 1.8173540113434194e-06, + "loss": 0.3474, + "step": 9456 + }, + { + "epoch": 3.0364424466206454, + "grad_norm": 0.5525678396224976, + "learning_rate": 1.8107353888175083e-06, + "loss": 0.2213, + "step": 9457 + }, + { + "epoch": 3.036763525445497, + "grad_norm": 0.5510037541389465, + "learning_rate": 1.8041287303556364e-06, + "loss": 0.2106, + "step": 9458 + }, + { + "epoch": 3.0370846042703485, + "grad_norm": 0.8154999613761902, + "learning_rate": 1.7975340367628268e-06, + "loss": 0.2556, + "step": 9459 + }, + { + "epoch": 3.0374056830951996, + "grad_norm": 0.7554807662963867, + "learning_rate": 1.7909513088426255e-06, + "loss": 0.3069, + "step": 9460 + }, + { + "epoch": 3.037726761920051, + "grad_norm": 0.6828257441520691, + "learning_rate": 1.7843805473970798e-06, + "loss": 0.277, + "step": 9461 + }, + { + "epoch": 3.0380478407449028, + "grad_norm": 1.2714136838912964, + "learning_rate": 1.7778217532268714e-06, + "loss": 0.2337, + "step": 9462 + }, + { + "epoch": 3.0383689195697543, + "grad_norm": 1.116326093673706, + "learning_rate": 1.771274927131139e-06, + "loss": 0.3795, + "step": 9463 + }, + { + "epoch": 3.038689998394606, + "grad_norm": 0.7132155299186707, + "learning_rate": 1.7647400699075888e-06, + "loss": 0.2681, + "step": 9464 + }, + { + "epoch": 3.0390110772194574, + "grad_norm": 0.750069797039032, + "learning_rate": 1.7582171823524951e-06, + "loss": 0.2974, + "step": 9465 + }, + { + "epoch": 3.039332156044309, + "grad_norm": 0.6962805986404419, + "learning_rate": 1.751706265260611e-06, + "loss": 0.246, + "step": 9466 + }, + { + "epoch": 3.0396532348691605, + "grad_norm": 0.6339988708496094, + "learning_rate": 1.7452073194253238e-06, + "loss": 0.2297, + "step": 9467 + }, + { + "epoch": 3.039974313694012, + "grad_norm": 0.7461841702461243, + "learning_rate": 1.7387203456384782e-06, + "loss": 0.3137, + "step": 9468 + }, + { + "epoch": 3.040295392518863, + "grad_norm": 0.6014420986175537, + "learning_rate": 1.7322453446905085e-06, + "loss": 0.2633, + "step": 9469 + }, + { + "epoch": 3.0406164713437147, + "grad_norm": 1.0465008020401, + "learning_rate": 1.7257823173703503e-06, + "loss": 0.352, + "step": 9470 + }, + { + "epoch": 3.0409375501685663, + "grad_norm": 0.4762720465660095, + "learning_rate": 1.719331264465529e-06, + "loss": 0.224, + "step": 9471 + }, + { + "epoch": 3.041258628993418, + "grad_norm": 0.6413834095001221, + "learning_rate": 1.712892186762083e-06, + "loss": 0.25, + "step": 9472 + }, + { + "epoch": 3.0415797078182694, + "grad_norm": 0.46483296155929565, + "learning_rate": 1.706465085044584e-06, + "loss": 0.2117, + "step": 9473 + }, + { + "epoch": 3.041900786643121, + "grad_norm": 0.49583905935287476, + "learning_rate": 1.7000499600961505e-06, + "loss": 0.198, + "step": 9474 + }, + { + "epoch": 3.0422218654679725, + "grad_norm": 0.6456876993179321, + "learning_rate": 1.6936468126984572e-06, + "loss": 0.2849, + "step": 9475 + }, + { + "epoch": 3.042542944292824, + "grad_norm": 0.589216947555542, + "learning_rate": 1.6872556436317022e-06, + "loss": 0.228, + "step": 9476 + }, + { + "epoch": 3.0428640231176756, + "grad_norm": 0.5428295135498047, + "learning_rate": 1.680876453674629e-06, + "loss": 0.2151, + "step": 9477 + }, + { + "epoch": 3.0431851019425267, + "grad_norm": 0.5795981884002686, + "learning_rate": 1.6745092436045494e-06, + "loss": 0.2326, + "step": 9478 + }, + { + "epoch": 3.043506180767378, + "grad_norm": 0.6216808557510376, + "learning_rate": 1.6681540141972429e-06, + "loss": 0.2701, + "step": 9479 + }, + { + "epoch": 3.0438272595922298, + "grad_norm": 0.5757028460502625, + "learning_rate": 1.661810766227112e-06, + "loss": 0.2414, + "step": 9480 + }, + { + "epoch": 3.0441483384170813, + "grad_norm": 0.731472373008728, + "learning_rate": 1.6554795004670388e-06, + "loss": 0.259, + "step": 9481 + }, + { + "epoch": 3.044469417241933, + "grad_norm": 0.6411800384521484, + "learning_rate": 1.6491602176884724e-06, + "loss": 0.2675, + "step": 9482 + }, + { + "epoch": 3.0447904960667844, + "grad_norm": 0.40002959966659546, + "learning_rate": 1.6428529186614195e-06, + "loss": 0.2122, + "step": 9483 + }, + { + "epoch": 3.045111574891636, + "grad_norm": 0.9253550171852112, + "learning_rate": 1.636557604154365e-06, + "loss": 0.3639, + "step": 9484 + }, + { + "epoch": 3.0454326537164875, + "grad_norm": 0.5021322965621948, + "learning_rate": 1.6302742749344291e-06, + "loss": 0.2332, + "step": 9485 + }, + { + "epoch": 3.045753732541339, + "grad_norm": 0.3761097192764282, + "learning_rate": 1.6240029317671658e-06, + "loss": 0.2103, + "step": 9486 + }, + { + "epoch": 3.04607481136619, + "grad_norm": 0.6944969892501831, + "learning_rate": 1.6177435754167415e-06, + "loss": 0.2706, + "step": 9487 + }, + { + "epoch": 3.0463958901910417, + "grad_norm": 0.43862295150756836, + "learning_rate": 1.611496206645835e-06, + "loss": 0.2505, + "step": 9488 + }, + { + "epoch": 3.0467169690158933, + "grad_norm": 0.36449775099754333, + "learning_rate": 1.605260826215682e-06, + "loss": 0.2377, + "step": 9489 + }, + { + "epoch": 3.047038047840745, + "grad_norm": 0.4346637725830078, + "learning_rate": 1.5990374348860305e-06, + "loss": 0.2281, + "step": 9490 + }, + { + "epoch": 3.0473591266655964, + "grad_norm": 0.623367190361023, + "learning_rate": 1.5928260334151845e-06, + "loss": 0.2755, + "step": 9491 + }, + { + "epoch": 3.047680205490448, + "grad_norm": 0.8898089528083801, + "learning_rate": 1.5866266225599834e-06, + "loss": 0.2915, + "step": 9492 + }, + { + "epoch": 3.0480012843152995, + "grad_norm": 0.3457051217556, + "learning_rate": 1.580439203075812e-06, + "loss": 0.2286, + "step": 9493 + }, + { + "epoch": 3.048322363140151, + "grad_norm": 0.48671117424964905, + "learning_rate": 1.574263775716578e-06, + "loss": 0.3948, + "step": 9494 + }, + { + "epoch": 3.0486434419650026, + "grad_norm": 0.5143343210220337, + "learning_rate": 1.5681003412347572e-06, + "loss": 0.4827, + "step": 9495 + }, + { + "epoch": 3.0489645207898537, + "grad_norm": 0.37674856185913086, + "learning_rate": 1.561948900381327e-06, + "loss": 0.2705, + "step": 9496 + }, + { + "epoch": 3.0492855996147052, + "grad_norm": 0.3917763829231262, + "learning_rate": 1.555809453905821e-06, + "loss": 0.2283, + "step": 9497 + }, + { + "epoch": 3.049606678439557, + "grad_norm": 0.4415786564350128, + "learning_rate": 1.5496820025563409e-06, + "loss": 0.2775, + "step": 9498 + }, + { + "epoch": 3.0499277572644083, + "grad_norm": 0.28412747383117676, + "learning_rate": 1.543566547079467e-06, + "loss": 0.134, + "step": 9499 + }, + { + "epoch": 3.05024883608926, + "grad_norm": 0.32522836327552795, + "learning_rate": 1.5374630882203588e-06, + "loss": 0.1553, + "step": 9500 + }, + { + "epoch": 3.0505699149141114, + "grad_norm": 0.37719276547431946, + "learning_rate": 1.5313716267226996e-06, + "loss": 0.1436, + "step": 9501 + }, + { + "epoch": 3.050890993738963, + "grad_norm": 0.7100079655647278, + "learning_rate": 1.5252921633287177e-06, + "loss": 0.423, + "step": 9502 + }, + { + "epoch": 3.0512120725638145, + "grad_norm": 0.6729628443717957, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.2616, + "step": 9503 + }, + { + "epoch": 3.051533151388666, + "grad_norm": 0.7652589082717896, + "learning_rate": 1.5131692338134052e-06, + "loss": 0.2771, + "step": 9504 + }, + { + "epoch": 3.051854230213517, + "grad_norm": 0.6066986918449402, + "learning_rate": 1.5071257691692153e-06, + "loss": 0.2357, + "step": 9505 + }, + { + "epoch": 3.0521753090383688, + "grad_norm": 0.580855131149292, + "learning_rate": 1.501094305582984e-06, + "loss": 0.2318, + "step": 9506 + }, + { + "epoch": 3.0524963878632203, + "grad_norm": 0.5671026110649109, + "learning_rate": 1.4950748437896234e-06, + "loss": 0.211, + "step": 9507 + }, + { + "epoch": 3.052817466688072, + "grad_norm": 0.6296615600585938, + "learning_rate": 1.4890673845226133e-06, + "loss": 0.2119, + "step": 9508 + }, + { + "epoch": 3.0531385455129234, + "grad_norm": 0.6201547980308533, + "learning_rate": 1.4830719285139127e-06, + "loss": 0.2538, + "step": 9509 + }, + { + "epoch": 3.053459624337775, + "grad_norm": 0.9474941492080688, + "learning_rate": 1.4770884764940706e-06, + "loss": 0.242, + "step": 9510 + }, + { + "epoch": 3.0537807031626265, + "grad_norm": 0.6287795901298523, + "learning_rate": 1.4711170291921484e-06, + "loss": 0.2053, + "step": 9511 + }, + { + "epoch": 3.054101781987478, + "grad_norm": 0.6485587358474731, + "learning_rate": 1.4651575873357416e-06, + "loss": 0.2242, + "step": 9512 + }, + { + "epoch": 3.0544228608123296, + "grad_norm": 0.6445651054382324, + "learning_rate": 1.4592101516509914e-06, + "loss": 0.2214, + "step": 9513 + }, + { + "epoch": 3.0547439396371807, + "grad_norm": 0.6522722244262695, + "learning_rate": 1.4532747228625854e-06, + "loss": 0.2488, + "step": 9514 + }, + { + "epoch": 3.0550650184620323, + "grad_norm": 0.6885031461715698, + "learning_rate": 1.4473513016937223e-06, + "loss": 0.2459, + "step": 9515 + }, + { + "epoch": 3.055386097286884, + "grad_norm": 0.6756393909454346, + "learning_rate": 1.4414398888661695e-06, + "loss": 0.2131, + "step": 9516 + }, + { + "epoch": 3.0557071761117354, + "grad_norm": 0.5485737919807434, + "learning_rate": 1.4355404851001952e-06, + "loss": 0.2285, + "step": 9517 + }, + { + "epoch": 3.056028254936587, + "grad_norm": 0.3958723247051239, + "learning_rate": 1.4296530911146466e-06, + "loss": 0.1763, + "step": 9518 + }, + { + "epoch": 3.0563493337614385, + "grad_norm": 0.5892900824546814, + "learning_rate": 1.423777707626872e-06, + "loss": 0.1842, + "step": 9519 + }, + { + "epoch": 3.05667041258629, + "grad_norm": 0.6213316321372986, + "learning_rate": 1.4179143353527547e-06, + "loss": 0.26, + "step": 9520 + }, + { + "epoch": 3.0569914914111416, + "grad_norm": 0.6566430330276489, + "learning_rate": 1.412062975006767e-06, + "loss": 0.2532, + "step": 9521 + }, + { + "epoch": 3.057312570235993, + "grad_norm": 0.5408963561058044, + "learning_rate": 1.4062236273018392e-06, + "loss": 0.2406, + "step": 9522 + }, + { + "epoch": 3.0576336490608442, + "grad_norm": 0.552483856678009, + "learning_rate": 1.400396292949513e-06, + "loss": 0.2385, + "step": 9523 + }, + { + "epoch": 3.057954727885696, + "grad_norm": 0.6494653820991516, + "learning_rate": 1.394580972659798e-06, + "loss": 0.215, + "step": 9524 + }, + { + "epoch": 3.0582758067105473, + "grad_norm": 0.5467716455459595, + "learning_rate": 1.3887776671412943e-06, + "loss": 0.2344, + "step": 9525 + }, + { + "epoch": 3.058596885535399, + "grad_norm": 0.6003792881965637, + "learning_rate": 1.3829863771011253e-06, + "loss": 0.2197, + "step": 9526 + }, + { + "epoch": 3.0589179643602504, + "grad_norm": 0.4577672481536865, + "learning_rate": 1.377207103244904e-06, + "loss": 0.2218, + "step": 9527 + }, + { + "epoch": 3.059239043185102, + "grad_norm": 0.6770538687705994, + "learning_rate": 1.3714398462768563e-06, + "loss": 0.239, + "step": 9528 + }, + { + "epoch": 3.0595601220099535, + "grad_norm": 0.5816624164581299, + "learning_rate": 1.3656846068996976e-06, + "loss": 0.2556, + "step": 9529 + }, + { + "epoch": 3.059881200834805, + "grad_norm": 0.7855013012886047, + "learning_rate": 1.359941385814667e-06, + "loss": 0.2844, + "step": 9530 + }, + { + "epoch": 3.0602022796596566, + "grad_norm": 0.26091650128364563, + "learning_rate": 1.3542101837215826e-06, + "loss": 0.1777, + "step": 9531 + }, + { + "epoch": 3.0605233584845077, + "grad_norm": 0.46976807713508606, + "learning_rate": 1.3484910013187524e-06, + "loss": 0.2338, + "step": 9532 + }, + { + "epoch": 3.0608444373093593, + "grad_norm": 0.46279844641685486, + "learning_rate": 1.3427838393030633e-06, + "loss": 0.2305, + "step": 9533 + }, + { + "epoch": 3.061165516134211, + "grad_norm": 0.6863793730735779, + "learning_rate": 1.3370886983698927e-06, + "loss": 0.2443, + "step": 9534 + }, + { + "epoch": 3.0614865949590624, + "grad_norm": 0.6161332130432129, + "learning_rate": 1.3314055792131964e-06, + "loss": 0.2431, + "step": 9535 + }, + { + "epoch": 3.061807673783914, + "grad_norm": 0.4130920171737671, + "learning_rate": 1.3257344825254315e-06, + "loss": 0.2002, + "step": 9536 + }, + { + "epoch": 3.0621287526087655, + "grad_norm": 0.7032944560050964, + "learning_rate": 1.320075408997612e-06, + "loss": 0.261, + "step": 9537 + }, + { + "epoch": 3.062449831433617, + "grad_norm": 0.563331127166748, + "learning_rate": 1.3144283593192752e-06, + "loss": 0.2508, + "step": 9538 + }, + { + "epoch": 3.0627709102584686, + "grad_norm": 0.2759096026420593, + "learning_rate": 1.308793334178493e-06, + "loss": 0.2001, + "step": 9539 + }, + { + "epoch": 3.06309198908332, + "grad_norm": 0.3822631537914276, + "learning_rate": 1.303170334261883e-06, + "loss": 0.2032, + "step": 9540 + }, + { + "epoch": 3.0634130679081713, + "grad_norm": 0.397834837436676, + "learning_rate": 1.2975593602545965e-06, + "loss": 0.2111, + "step": 9541 + }, + { + "epoch": 3.063734146733023, + "grad_norm": 0.3753139078617096, + "learning_rate": 1.2919604128402874e-06, + "loss": 0.2241, + "step": 9542 + }, + { + "epoch": 3.0640552255578744, + "grad_norm": 0.4659748673439026, + "learning_rate": 1.2863734927012095e-06, + "loss": 0.2573, + "step": 9543 + }, + { + "epoch": 3.064376304382726, + "grad_norm": 0.35863274335861206, + "learning_rate": 1.280798600518085e-06, + "loss": 0.33, + "step": 9544 + }, + { + "epoch": 3.0646973832075775, + "grad_norm": 0.48905041813850403, + "learning_rate": 1.275235736970193e-06, + "loss": 0.5968, + "step": 9545 + }, + { + "epoch": 3.065018462032429, + "grad_norm": 0.3606785237789154, + "learning_rate": 1.2696849027353796e-06, + "loss": 0.2948, + "step": 9546 + }, + { + "epoch": 3.0653395408572806, + "grad_norm": 0.32427453994750977, + "learning_rate": 1.26414609848996e-06, + "loss": 0.1369, + "step": 9547 + }, + { + "epoch": 3.065660619682132, + "grad_norm": 0.3010817766189575, + "learning_rate": 1.2586193249088608e-06, + "loss": 0.146, + "step": 9548 + }, + { + "epoch": 3.0659816985069837, + "grad_norm": 0.3775825500488281, + "learning_rate": 1.2531045826654653e-06, + "loss": 0.1995, + "step": 9549 + }, + { + "epoch": 3.0663027773318348, + "grad_norm": 0.1418655514717102, + "learning_rate": 1.2476018724317585e-06, + "loss": 0.0591, + "step": 9550 + }, + { + "epoch": 3.0666238561566863, + "grad_norm": 0.36233529448509216, + "learning_rate": 1.2421111948782149e-06, + "loss": 0.1744, + "step": 9551 + }, + { + "epoch": 3.066944934981538, + "grad_norm": 0.6624466180801392, + "learning_rate": 1.236632550673844e-06, + "loss": 0.3614, + "step": 9552 + }, + { + "epoch": 3.0672660138063894, + "grad_norm": 0.6916109919548035, + "learning_rate": 1.231165940486234e-06, + "loss": 0.2959, + "step": 9553 + }, + { + "epoch": 3.067587092631241, + "grad_norm": 0.7930703163146973, + "learning_rate": 1.225711364981441e-06, + "loss": 0.3368, + "step": 9554 + }, + { + "epoch": 3.0679081714560925, + "grad_norm": 0.636942982673645, + "learning_rate": 1.2202688248241112e-06, + "loss": 0.2359, + "step": 9555 + }, + { + "epoch": 3.068229250280944, + "grad_norm": 0.5777634978294373, + "learning_rate": 1.2148383206773916e-06, + "loss": 0.2183, + "step": 9556 + }, + { + "epoch": 3.0685503291057956, + "grad_norm": 0.6497059464454651, + "learning_rate": 1.2094198532029755e-06, + "loss": 0.2373, + "step": 9557 + }, + { + "epoch": 3.068871407930647, + "grad_norm": 0.6847310066223145, + "learning_rate": 1.20401342306109e-06, + "loss": 0.2691, + "step": 9558 + }, + { + "epoch": 3.0691924867554983, + "grad_norm": 0.6610705256462097, + "learning_rate": 1.1986190309104861e-06, + "loss": 0.2512, + "step": 9559 + }, + { + "epoch": 3.06951356558035, + "grad_norm": 0.5450610518455505, + "learning_rate": 1.193236677408449e-06, + "loss": 0.1912, + "step": 9560 + }, + { + "epoch": 3.0698346444052014, + "grad_norm": 0.5279379487037659, + "learning_rate": 1.1878663632108322e-06, + "loss": 0.1994, + "step": 9561 + }, + { + "epoch": 3.070155723230053, + "grad_norm": 0.5520129799842834, + "learning_rate": 1.1825080889719563e-06, + "loss": 0.2005, + "step": 9562 + }, + { + "epoch": 3.0704768020549045, + "grad_norm": 0.5607929825782776, + "learning_rate": 1.1771618553447216e-06, + "loss": 0.1891, + "step": 9563 + }, + { + "epoch": 3.070797880879756, + "grad_norm": 0.9774504899978638, + "learning_rate": 1.1718276629805625e-06, + "loss": 0.3626, + "step": 9564 + }, + { + "epoch": 3.0711189597046076, + "grad_norm": 0.7361037135124207, + "learning_rate": 1.1665055125294033e-06, + "loss": 0.2507, + "step": 9565 + }, + { + "epoch": 3.071440038529459, + "grad_norm": 0.8793400526046753, + "learning_rate": 1.16119540463977e-06, + "loss": 0.2958, + "step": 9566 + }, + { + "epoch": 3.0717611173543107, + "grad_norm": 0.8101871609687805, + "learning_rate": 1.155897339958667e-06, + "loss": 0.2885, + "step": 9567 + }, + { + "epoch": 3.072082196179162, + "grad_norm": 1.022641658782959, + "learning_rate": 1.1506113191316447e-06, + "loss": 0.4274, + "step": 9568 + }, + { + "epoch": 3.0724032750040133, + "grad_norm": 0.7252495288848877, + "learning_rate": 1.1453373428027992e-06, + "loss": 0.2599, + "step": 9569 + }, + { + "epoch": 3.072724353828865, + "grad_norm": 0.4876370429992676, + "learning_rate": 1.1400754116147271e-06, + "loss": 0.2188, + "step": 9570 + }, + { + "epoch": 3.0730454326537164, + "grad_norm": 0.6673651933670044, + "learning_rate": 1.134825526208605e-06, + "loss": 0.3235, + "step": 9571 + }, + { + "epoch": 3.073366511478568, + "grad_norm": 0.7081831693649292, + "learning_rate": 1.1295876872240873e-06, + "loss": 0.2824, + "step": 9572 + }, + { + "epoch": 3.0736875903034195, + "grad_norm": 0.8749157786369324, + "learning_rate": 1.1243618952994195e-06, + "loss": 0.3039, + "step": 9573 + }, + { + "epoch": 3.074008669128271, + "grad_norm": 0.5778833031654358, + "learning_rate": 1.1191481510713253e-06, + "loss": 0.2355, + "step": 9574 + }, + { + "epoch": 3.0743297479531226, + "grad_norm": 0.482704758644104, + "learning_rate": 1.1139464551750856e-06, + "loss": 0.2225, + "step": 9575 + }, + { + "epoch": 3.074650826777974, + "grad_norm": 0.5122305750846863, + "learning_rate": 1.1087568082445264e-06, + "loss": 0.2251, + "step": 9576 + }, + { + "epoch": 3.0749719056028253, + "grad_norm": 0.6916018128395081, + "learning_rate": 1.103579210911976e-06, + "loss": 0.2501, + "step": 9577 + }, + { + "epoch": 3.075292984427677, + "grad_norm": 0.5738651156425476, + "learning_rate": 1.0984136638083177e-06, + "loss": 0.2508, + "step": 9578 + }, + { + "epoch": 3.0756140632525284, + "grad_norm": 0.3557813763618469, + "learning_rate": 1.0932601675629595e-06, + "loss": 0.1737, + "step": 9579 + }, + { + "epoch": 3.07593514207738, + "grad_norm": 0.7535400986671448, + "learning_rate": 1.0881187228038215e-06, + "loss": 0.2763, + "step": 9580 + }, + { + "epoch": 3.0762562209022315, + "grad_norm": 0.4553007185459137, + "learning_rate": 1.0829893301573913e-06, + "loss": 0.226, + "step": 9581 + }, + { + "epoch": 3.076577299727083, + "grad_norm": 0.3509460389614105, + "learning_rate": 1.0778719902486689e-06, + "loss": 0.1899, + "step": 9582 + }, + { + "epoch": 3.0768983785519346, + "grad_norm": 0.7953051924705505, + "learning_rate": 1.0727667037011668e-06, + "loss": 0.3081, + "step": 9583 + }, + { + "epoch": 3.077219457376786, + "grad_norm": 0.563300609588623, + "learning_rate": 1.0676734711369762e-06, + "loss": 0.2307, + "step": 9584 + }, + { + "epoch": 3.0775405362016377, + "grad_norm": 0.5179484486579895, + "learning_rate": 1.0625922931766785e-06, + "loss": 0.2405, + "step": 9585 + }, + { + "epoch": 3.077861615026489, + "grad_norm": 0.8609539270401001, + "learning_rate": 1.0575231704393895e-06, + "loss": 0.2623, + "step": 9586 + }, + { + "epoch": 3.0781826938513404, + "grad_norm": 0.571172833442688, + "learning_rate": 1.052466103542793e-06, + "loss": 0.2477, + "step": 9587 + }, + { + "epoch": 3.078503772676192, + "grad_norm": 0.396445095539093, + "learning_rate": 1.0474210931030514e-06, + "loss": 0.2204, + "step": 9588 + }, + { + "epoch": 3.0788248515010435, + "grad_norm": 0.43316546082496643, + "learning_rate": 1.0423881397349068e-06, + "loss": 0.2267, + "step": 9589 + }, + { + "epoch": 3.079145930325895, + "grad_norm": 0.4050913453102112, + "learning_rate": 1.0373672440515902e-06, + "loss": 0.221, + "step": 9590 + }, + { + "epoch": 3.0794670091507466, + "grad_norm": 0.2422318160533905, + "learning_rate": 1.0323584066648795e-06, + "loss": 0.2087, + "step": 9591 + }, + { + "epoch": 3.079788087975598, + "grad_norm": 0.34273797273635864, + "learning_rate": 1.0273616281851083e-06, + "loss": 0.2479, + "step": 9592 + }, + { + "epoch": 3.0801091668004497, + "grad_norm": 0.35393643379211426, + "learning_rate": 1.0223769092211012e-06, + "loss": 0.2539, + "step": 9593 + }, + { + "epoch": 3.080430245625301, + "grad_norm": 0.4257902503013611, + "learning_rate": 1.0174042503802493e-06, + "loss": 0.4868, + "step": 9594 + }, + { + "epoch": 3.0807513244501523, + "grad_norm": 0.4722660779953003, + "learning_rate": 1.0124436522684243e-06, + "loss": 0.4076, + "step": 9595 + }, + { + "epoch": 3.081072403275004, + "grad_norm": 0.2358543872833252, + "learning_rate": 1.007495115490087e-06, + "loss": 0.0826, + "step": 9596 + }, + { + "epoch": 3.0813934820998554, + "grad_norm": 0.2644473612308502, + "learning_rate": 1.002558640648199e-06, + "loss": 0.102, + "step": 9597 + }, + { + "epoch": 3.081714560924707, + "grad_norm": 0.3907872438430786, + "learning_rate": 9.976342283442463e-07, + "loss": 0.1718, + "step": 9598 + }, + { + "epoch": 3.0820356397495585, + "grad_norm": 0.35824838280677795, + "learning_rate": 9.927218791782599e-07, + "loss": 0.1511, + "step": 9599 + }, + { + "epoch": 3.08235671857441, + "grad_norm": 0.6002468466758728, + "learning_rate": 9.878215937487834e-07, + "loss": 0.3288, + "step": 9600 + }, + { + "epoch": 3.0826777973992616, + "grad_norm": 0.827461302280426, + "learning_rate": 9.829333726529056e-07, + "loss": 0.3523, + "step": 9601 + }, + { + "epoch": 3.082998876224113, + "grad_norm": 0.5873133540153503, + "learning_rate": 9.78057216486261e-07, + "loss": 0.2328, + "step": 9602 + }, + { + "epoch": 3.0833199550489647, + "grad_norm": 0.5725942254066467, + "learning_rate": 9.731931258429638e-07, + "loss": 0.2127, + "step": 9603 + }, + { + "epoch": 3.083641033873816, + "grad_norm": 0.584507405757904, + "learning_rate": 9.683411013157174e-07, + "loss": 0.1985, + "step": 9604 + }, + { + "epoch": 3.0839621126986674, + "grad_norm": 0.7739067077636719, + "learning_rate": 9.635011434957152e-07, + "loss": 0.277, + "step": 9605 + }, + { + "epoch": 3.084283191523519, + "grad_norm": 0.5025599598884583, + "learning_rate": 9.58673252972675e-07, + "loss": 0.1996, + "step": 9606 + }, + { + "epoch": 3.0846042703483705, + "grad_norm": 0.6031046509742737, + "learning_rate": 9.538574303348813e-07, + "loss": 0.2324, + "step": 9607 + }, + { + "epoch": 3.084925349173222, + "grad_norm": 0.8453519344329834, + "learning_rate": 9.490536761691204e-07, + "loss": 0.2988, + "step": 9608 + }, + { + "epoch": 3.0852464279980736, + "grad_norm": 0.5942166447639465, + "learning_rate": 9.442619910607131e-07, + "loss": 0.2195, + "step": 9609 + }, + { + "epoch": 3.085567506822925, + "grad_norm": 0.5165850520133972, + "learning_rate": 9.394823755935145e-07, + "loss": 0.2168, + "step": 9610 + }, + { + "epoch": 3.0858885856477767, + "grad_norm": 0.4023417532444, + "learning_rate": 9.347148303499142e-07, + "loss": 0.1666, + "step": 9611 + }, + { + "epoch": 3.0862096644726282, + "grad_norm": 0.44202151894569397, + "learning_rate": 9.299593559108033e-07, + "loss": 0.1773, + "step": 9612 + }, + { + "epoch": 3.0865307432974793, + "grad_norm": 0.6434639096260071, + "learning_rate": 9.252159528556403e-07, + "loss": 0.2229, + "step": 9613 + }, + { + "epoch": 3.086851822122331, + "grad_norm": 0.5662235021591187, + "learning_rate": 9.204846217623853e-07, + "loss": 0.2579, + "step": 9614 + }, + { + "epoch": 3.0871729009471824, + "grad_norm": 0.7555933594703674, + "learning_rate": 9.157653632075436e-07, + "loss": 0.2733, + "step": 9615 + }, + { + "epoch": 3.087493979772034, + "grad_norm": 0.5656288862228394, + "learning_rate": 9.110581777661331e-07, + "loss": 0.2362, + "step": 9616 + }, + { + "epoch": 3.0878150585968855, + "grad_norm": 0.6788967251777649, + "learning_rate": 9.063630660117173e-07, + "loss": 0.2549, + "step": 9617 + }, + { + "epoch": 3.088136137421737, + "grad_norm": 0.45415976643562317, + "learning_rate": 9.016800285163718e-07, + "loss": 0.1989, + "step": 9618 + }, + { + "epoch": 3.0884572162465886, + "grad_norm": 0.4443502128124237, + "learning_rate": 8.970090658507291e-07, + "loss": 0.1886, + "step": 9619 + }, + { + "epoch": 3.08877829507144, + "grad_norm": 0.5720863938331604, + "learning_rate": 8.923501785839117e-07, + "loss": 0.2065, + "step": 9620 + }, + { + "epoch": 3.0890993738962917, + "grad_norm": 0.7446249723434448, + "learning_rate": 8.877033672835988e-07, + "loss": 0.3329, + "step": 9621 + }, + { + "epoch": 3.089420452721143, + "grad_norm": 0.7151133418083191, + "learning_rate": 8.830686325160042e-07, + "loss": 0.3169, + "step": 9622 + }, + { + "epoch": 3.0897415315459944, + "grad_norm": 0.5628482699394226, + "learning_rate": 8.784459748458318e-07, + "loss": 0.2448, + "step": 9623 + }, + { + "epoch": 3.090062610370846, + "grad_norm": 0.7645835280418396, + "learning_rate": 8.73835394836342e-07, + "loss": 0.2118, + "step": 9624 + }, + { + "epoch": 3.0903836891956975, + "grad_norm": 0.9074884653091431, + "learning_rate": 8.692368930493521e-07, + "loss": 0.2708, + "step": 9625 + }, + { + "epoch": 3.090704768020549, + "grad_norm": 0.8181541562080383, + "learning_rate": 8.646504700451252e-07, + "loss": 0.2557, + "step": 9626 + }, + { + "epoch": 3.0910258468454006, + "grad_norm": 0.39952725172042847, + "learning_rate": 8.600761263825474e-07, + "loss": 0.1937, + "step": 9627 + }, + { + "epoch": 3.091346925670252, + "grad_norm": 0.505699634552002, + "learning_rate": 8.555138626189618e-07, + "loss": 0.2237, + "step": 9628 + }, + { + "epoch": 3.0916680044951037, + "grad_norm": 0.8317124843597412, + "learning_rate": 8.509636793102682e-07, + "loss": 0.316, + "step": 9629 + }, + { + "epoch": 3.0919890833199553, + "grad_norm": 0.4344399571418762, + "learning_rate": 8.46425577010912e-07, + "loss": 0.1989, + "step": 9630 + }, + { + "epoch": 3.0923101621448064, + "grad_norm": 0.7013100385665894, + "learning_rate": 8.418995562738285e-07, + "loss": 0.2483, + "step": 9631 + }, + { + "epoch": 3.092631240969658, + "grad_norm": 0.40041857957839966, + "learning_rate": 8.373856176505101e-07, + "loss": 0.2176, + "step": 9632 + }, + { + "epoch": 3.0929523197945095, + "grad_norm": 0.4556305408477783, + "learning_rate": 8.328837616909613e-07, + "loss": 0.2162, + "step": 9633 + }, + { + "epoch": 3.093273398619361, + "grad_norm": 2.067694664001465, + "learning_rate": 8.283939889437209e-07, + "loss": 0.2219, + "step": 9634 + }, + { + "epoch": 3.0935944774442126, + "grad_norm": 0.48048630356788635, + "learning_rate": 8.239162999558403e-07, + "loss": 0.2146, + "step": 9635 + }, + { + "epoch": 3.093915556269064, + "grad_norm": 0.8311915993690491, + "learning_rate": 8.194506952729386e-07, + "loss": 0.3099, + "step": 9636 + }, + { + "epoch": 3.0942366350939157, + "grad_norm": 0.5806686282157898, + "learning_rate": 8.14997175439125e-07, + "loss": 0.2564, + "step": 9637 + }, + { + "epoch": 3.094557713918767, + "grad_norm": 0.674216628074646, + "learning_rate": 8.105557409970432e-07, + "loss": 0.254, + "step": 9638 + }, + { + "epoch": 3.0948787927436188, + "grad_norm": 0.826112687587738, + "learning_rate": 8.061263924878604e-07, + "loss": 0.2399, + "step": 9639 + }, + { + "epoch": 3.09519987156847, + "grad_norm": 0.5507524013519287, + "learning_rate": 8.017091304513003e-07, + "loss": 0.257, + "step": 9640 + }, + { + "epoch": 3.0955209503933214, + "grad_norm": 0.3682684302330017, + "learning_rate": 7.973039554255768e-07, + "loss": 0.216, + "step": 9641 + }, + { + "epoch": 3.095842029218173, + "grad_norm": 0.49590498208999634, + "learning_rate": 7.929108679474607e-07, + "loss": 0.2547, + "step": 9642 + }, + { + "epoch": 3.0961631080430245, + "grad_norm": 0.3100004494190216, + "learning_rate": 7.885298685522235e-07, + "loss": 0.2364, + "step": 9643 + }, + { + "epoch": 3.096484186867876, + "grad_norm": 0.41891974210739136, + "learning_rate": 7.841609577736719e-07, + "loss": 0.404, + "step": 9644 + }, + { + "epoch": 3.0968052656927276, + "grad_norm": 0.5358453989028931, + "learning_rate": 7.798041361441688e-07, + "loss": 0.6396, + "step": 9645 + }, + { + "epoch": 3.097126344517579, + "grad_norm": 0.5173894166946411, + "learning_rate": 7.754594041945562e-07, + "loss": 0.5229, + "step": 9646 + }, + { + "epoch": 3.0974474233424307, + "grad_norm": 0.4027728736400604, + "learning_rate": 7.711267624542329e-07, + "loss": 0.2377, + "step": 9647 + }, + { + "epoch": 3.0977685021672823, + "grad_norm": 0.3841489851474762, + "learning_rate": 7.668062114511321e-07, + "loss": 0.214, + "step": 9648 + }, + { + "epoch": 3.0980895809921334, + "grad_norm": 0.29058584570884705, + "learning_rate": 7.624977517116772e-07, + "loss": 0.1148, + "step": 9649 + }, + { + "epoch": 3.098410659816985, + "grad_norm": 0.2613859474658966, + "learning_rate": 7.582013837608592e-07, + "loss": 0.0923, + "step": 9650 + }, + { + "epoch": 3.0987317386418365, + "grad_norm": 0.5930027365684509, + "learning_rate": 7.539171081221596e-07, + "loss": 0.2433, + "step": 9651 + }, + { + "epoch": 3.099052817466688, + "grad_norm": 0.6712236404418945, + "learning_rate": 7.496449253176274e-07, + "loss": 0.2822, + "step": 9652 + }, + { + "epoch": 3.0993738962915396, + "grad_norm": 0.6282926797866821, + "learning_rate": 7.453848358678017e-07, + "loss": 0.2422, + "step": 9653 + }, + { + "epoch": 3.099694975116391, + "grad_norm": 0.8251811265945435, + "learning_rate": 7.411368402917563e-07, + "loss": 0.2839, + "step": 9654 + }, + { + "epoch": 3.1000160539412427, + "grad_norm": 0.5989024043083191, + "learning_rate": 7.369009391070992e-07, + "loss": 0.2299, + "step": 9655 + }, + { + "epoch": 3.1003371327660942, + "grad_norm": 0.42122694849967957, + "learning_rate": 7.326771328299731e-07, + "loss": 0.1556, + "step": 9656 + }, + { + "epoch": 3.100658211590946, + "grad_norm": 0.7038974761962891, + "learning_rate": 7.284654219750331e-07, + "loss": 0.2316, + "step": 9657 + }, + { + "epoch": 3.100979290415797, + "grad_norm": 0.5096376538276672, + "learning_rate": 7.242658070554464e-07, + "loss": 0.2069, + "step": 9658 + }, + { + "epoch": 3.1013003692406484, + "grad_norm": 0.8389344811439514, + "learning_rate": 7.200782885829482e-07, + "loss": 0.2828, + "step": 9659 + }, + { + "epoch": 3.1016214480655, + "grad_norm": 0.551834762096405, + "learning_rate": 7.159028670677526e-07, + "loss": 0.1995, + "step": 9660 + }, + { + "epoch": 3.1019425268903515, + "grad_norm": 0.7695516347885132, + "learning_rate": 7.117395430186414e-07, + "loss": 0.2759, + "step": 9661 + }, + { + "epoch": 3.102263605715203, + "grad_norm": 0.6411928534507751, + "learning_rate": 7.075883169428754e-07, + "loss": 0.2384, + "step": 9662 + }, + { + "epoch": 3.1025846845400546, + "grad_norm": 0.8904174566268921, + "learning_rate": 7.034491893463058e-07, + "loss": 0.286, + "step": 9663 + }, + { + "epoch": 3.102905763364906, + "grad_norm": 0.7250691056251526, + "learning_rate": 6.9932216073324e-07, + "loss": 0.2928, + "step": 9664 + }, + { + "epoch": 3.1032268421897578, + "grad_norm": 0.5427330136299133, + "learning_rate": 6.952072316065761e-07, + "loss": 0.2105, + "step": 9665 + }, + { + "epoch": 3.1035479210146093, + "grad_norm": 0.7631176710128784, + "learning_rate": 6.911044024676683e-07, + "loss": 0.2677, + "step": 9666 + }, + { + "epoch": 3.1038689998394604, + "grad_norm": 0.6781275272369385, + "learning_rate": 6.870136738164612e-07, + "loss": 0.2665, + "step": 9667 + }, + { + "epoch": 3.104190078664312, + "grad_norm": 0.7830384969711304, + "learning_rate": 6.829350461514006e-07, + "loss": 0.2469, + "step": 9668 + }, + { + "epoch": 3.1045111574891635, + "grad_norm": 0.836956799030304, + "learning_rate": 6.788685199694222e-07, + "loss": 0.2681, + "step": 9669 + }, + { + "epoch": 3.104832236314015, + "grad_norm": 0.6083877086639404, + "learning_rate": 6.748140957660631e-07, + "loss": 0.2515, + "step": 9670 + }, + { + "epoch": 3.1051533151388666, + "grad_norm": 0.5534031987190247, + "learning_rate": 6.707717740353059e-07, + "loss": 0.2078, + "step": 9671 + }, + { + "epoch": 3.105474393963718, + "grad_norm": 0.42929285764694214, + "learning_rate": 6.66741555269712e-07, + "loss": 0.2037, + "step": 9672 + }, + { + "epoch": 3.1057954727885697, + "grad_norm": 0.559162437915802, + "learning_rate": 6.627234399603555e-07, + "loss": 0.1936, + "step": 9673 + }, + { + "epoch": 3.1061165516134213, + "grad_norm": 0.687892735004425, + "learning_rate": 6.587174285968223e-07, + "loss": 0.2577, + "step": 9674 + }, + { + "epoch": 3.106437630438273, + "grad_norm": 0.499457448720932, + "learning_rate": 6.547235216672443e-07, + "loss": 0.211, + "step": 9675 + }, + { + "epoch": 3.106758709263124, + "grad_norm": 0.4725257456302643, + "learning_rate": 6.507417196582544e-07, + "loss": 0.2347, + "step": 9676 + }, + { + "epoch": 3.1070797880879755, + "grad_norm": 0.48947420716285706, + "learning_rate": 6.4677202305502e-07, + "loss": 0.1987, + "step": 9677 + }, + { + "epoch": 3.107400866912827, + "grad_norm": 0.6859459280967712, + "learning_rate": 6.428144323412544e-07, + "loss": 0.2539, + "step": 9678 + }, + { + "epoch": 3.1077219457376786, + "grad_norm": 0.5311869978904724, + "learning_rate": 6.388689479991605e-07, + "loss": 0.226, + "step": 9679 + }, + { + "epoch": 3.10804302456253, + "grad_norm": 0.668393611907959, + "learning_rate": 6.349355705094984e-07, + "loss": 0.2531, + "step": 9680 + }, + { + "epoch": 3.1083641033873817, + "grad_norm": 0.5925018787384033, + "learning_rate": 6.310143003515179e-07, + "loss": 0.2251, + "step": 9681 + }, + { + "epoch": 3.1086851822122332, + "grad_norm": 0.76288902759552, + "learning_rate": 6.271051380030368e-07, + "loss": 0.2765, + "step": 9682 + }, + { + "epoch": 3.1090062610370848, + "grad_norm": 0.8070435523986816, + "learning_rate": 6.232080839403631e-07, + "loss": 0.3033, + "step": 9683 + }, + { + "epoch": 3.1093273398619363, + "grad_norm": 0.6173917055130005, + "learning_rate": 6.193231386383391e-07, + "loss": 0.2695, + "step": 9684 + }, + { + "epoch": 3.1096484186867874, + "grad_norm": 0.8195832967758179, + "learning_rate": 6.154503025703417e-07, + "loss": 0.2724, + "step": 9685 + }, + { + "epoch": 3.109969497511639, + "grad_norm": 0.433586061000824, + "learning_rate": 6.115895762082602e-07, + "loss": 0.2257, + "step": 9686 + }, + { + "epoch": 3.1102905763364905, + "grad_norm": 0.4982379972934723, + "learning_rate": 6.07740960022507e-07, + "loss": 0.2394, + "step": 9687 + }, + { + "epoch": 3.110611655161342, + "grad_norm": 0.7917426824569702, + "learning_rate": 6.039044544820404e-07, + "loss": 0.2325, + "step": 9688 + }, + { + "epoch": 3.1109327339861936, + "grad_norm": 0.38256534934043884, + "learning_rate": 6.000800600542977e-07, + "loss": 0.2142, + "step": 9689 + }, + { + "epoch": 3.111253812811045, + "grad_norm": 0.7258543372154236, + "learning_rate": 5.96267777205295e-07, + "loss": 0.246, + "step": 9690 + }, + { + "epoch": 3.1115748916358967, + "grad_norm": 0.38089191913604736, + "learning_rate": 5.924676063995382e-07, + "loss": 0.2342, + "step": 9691 + }, + { + "epoch": 3.1118959704607483, + "grad_norm": 0.4571894407272339, + "learning_rate": 5.886795481000795e-07, + "loss": 0.2346, + "step": 9692 + }, + { + "epoch": 3.1122170492856, + "grad_norm": 0.3728485405445099, + "learning_rate": 5.849036027684606e-07, + "loss": 0.2318, + "step": 9693 + }, + { + "epoch": 3.112538128110451, + "grad_norm": 0.40570032596588135, + "learning_rate": 5.811397708647803e-07, + "loss": 0.3118, + "step": 9694 + }, + { + "epoch": 3.1128592069353025, + "grad_norm": 0.4947049617767334, + "learning_rate": 5.773880528476494e-07, + "loss": 0.5981, + "step": 9695 + }, + { + "epoch": 3.113180285760154, + "grad_norm": 0.4747079014778137, + "learning_rate": 5.736484491742134e-07, + "loss": 0.5451, + "step": 9696 + }, + { + "epoch": 3.1135013645850056, + "grad_norm": 0.23416991531848907, + "learning_rate": 5.699209603001076e-07, + "loss": 0.0842, + "step": 9697 + }, + { + "epoch": 3.113822443409857, + "grad_norm": 0.40521523356437683, + "learning_rate": 5.662055866795357e-07, + "loss": 0.2264, + "step": 9698 + }, + { + "epoch": 3.1141435222347087, + "grad_norm": 0.13257575035095215, + "learning_rate": 5.62502328765202e-07, + "loss": 0.055, + "step": 9699 + }, + { + "epoch": 3.1144646010595602, + "grad_norm": 0.25109007954597473, + "learning_rate": 5.588111870083346e-07, + "loss": 0.1009, + "step": 9700 + }, + { + "epoch": 3.114785679884412, + "grad_norm": 0.286777526140213, + "learning_rate": 5.551321618586736e-07, + "loss": 0.1089, + "step": 9701 + }, + { + "epoch": 3.1151067587092633, + "grad_norm": 0.3400633633136749, + "learning_rate": 5.514652537645271e-07, + "loss": 0.1317, + "step": 9702 + }, + { + "epoch": 3.1154278375341145, + "grad_norm": 0.6121630072593689, + "learning_rate": 5.478104631726711e-07, + "loss": 0.2507, + "step": 9703 + }, + { + "epoch": 3.115748916358966, + "grad_norm": 0.6167968511581421, + "learning_rate": 5.441677905284381e-07, + "loss": 0.2289, + "step": 9704 + }, + { + "epoch": 3.1160699951838176, + "grad_norm": 0.735968828201294, + "learning_rate": 5.405372362756734e-07, + "loss": 0.2501, + "step": 9705 + }, + { + "epoch": 3.116391074008669, + "grad_norm": 0.7218773365020752, + "learning_rate": 5.369188008567672e-07, + "loss": 0.2835, + "step": 9706 + }, + { + "epoch": 3.1167121528335207, + "grad_norm": 0.6471768617630005, + "learning_rate": 5.333124847125892e-07, + "loss": 0.1762, + "step": 9707 + }, + { + "epoch": 3.117033231658372, + "grad_norm": 0.7177468538284302, + "learning_rate": 5.297182882825879e-07, + "loss": 0.2868, + "step": 9708 + }, + { + "epoch": 3.1173543104832238, + "grad_norm": 0.540264904499054, + "learning_rate": 5.261362120046686e-07, + "loss": 0.2281, + "step": 9709 + }, + { + "epoch": 3.1176753893080753, + "grad_norm": 0.5251516103744507, + "learning_rate": 5.225662563153266e-07, + "loss": 0.1886, + "step": 9710 + }, + { + "epoch": 3.117996468132927, + "grad_norm": 0.5654820799827576, + "learning_rate": 5.190084216495361e-07, + "loss": 0.2386, + "step": 9711 + }, + { + "epoch": 3.118317546957778, + "grad_norm": 0.5759437680244446, + "learning_rate": 5.154627084408059e-07, + "loss": 0.1868, + "step": 9712 + }, + { + "epoch": 3.1186386257826295, + "grad_norm": 0.6524972319602966, + "learning_rate": 5.119291171211793e-07, + "loss": 0.2155, + "step": 9713 + }, + { + "epoch": 3.118959704607481, + "grad_norm": 0.5860231518745422, + "learning_rate": 5.084076481212119e-07, + "loss": 0.2216, + "step": 9714 + }, + { + "epoch": 3.1192807834323326, + "grad_norm": 0.7092858552932739, + "learning_rate": 5.048983018699827e-07, + "loss": 0.2829, + "step": 9715 + }, + { + "epoch": 3.119601862257184, + "grad_norm": 0.585176944732666, + "learning_rate": 5.01401078795094e-07, + "loss": 0.196, + "step": 9716 + }, + { + "epoch": 3.1199229410820357, + "grad_norm": 0.7186732888221741, + "learning_rate": 4.979159793226718e-07, + "loss": 0.2327, + "step": 9717 + }, + { + "epoch": 3.1202440199068873, + "grad_norm": 0.9595154523849487, + "learning_rate": 4.944430038773762e-07, + "loss": 0.2656, + "step": 9718 + }, + { + "epoch": 3.120565098731739, + "grad_norm": 0.4469728171825409, + "learning_rate": 4.909821528823577e-07, + "loss": 0.1912, + "step": 9719 + }, + { + "epoch": 3.1208861775565904, + "grad_norm": 0.7011276483535767, + "learning_rate": 4.875334267593234e-07, + "loss": 0.2695, + "step": 9720 + }, + { + "epoch": 3.1212072563814415, + "grad_norm": 0.49139729142189026, + "learning_rate": 4.840968259284817e-07, + "loss": 0.1871, + "step": 9721 + }, + { + "epoch": 3.121528335206293, + "grad_norm": 0.7045285105705261, + "learning_rate": 4.806723508085864e-07, + "loss": 0.241, + "step": 9722 + }, + { + "epoch": 3.1218494140311446, + "grad_norm": 0.6836577653884888, + "learning_rate": 4.772600018168816e-07, + "loss": 0.2571, + "step": 9723 + }, + { + "epoch": 3.122170492855996, + "grad_norm": 0.9169760346412659, + "learning_rate": 4.738597793691679e-07, + "loss": 0.3147, + "step": 9724 + }, + { + "epoch": 3.1224915716808477, + "grad_norm": 0.5703161954879761, + "learning_rate": 4.704716838797363e-07, + "loss": 0.2181, + "step": 9725 + }, + { + "epoch": 3.1228126505056992, + "grad_norm": 0.8328591585159302, + "learning_rate": 4.670957157614453e-07, + "loss": 0.2938, + "step": 9726 + }, + { + "epoch": 3.123133729330551, + "grad_norm": 0.545899510383606, + "learning_rate": 4.6373187542561035e-07, + "loss": 0.2104, + "step": 9727 + }, + { + "epoch": 3.1234548081554023, + "grad_norm": 0.6301508545875549, + "learning_rate": 4.6038016328211476e-07, + "loss": 0.2477, + "step": 9728 + }, + { + "epoch": 3.123775886980254, + "grad_norm": 0.7413309216499329, + "learning_rate": 4.570405797393762e-07, + "loss": 0.2591, + "step": 9729 + }, + { + "epoch": 3.124096965805105, + "grad_norm": 0.5572231411933899, + "learning_rate": 4.5371312520429144e-07, + "loss": 0.2159, + "step": 9730 + }, + { + "epoch": 3.1244180446299565, + "grad_norm": 0.4630489647388458, + "learning_rate": 4.503978000823028e-07, + "loss": 0.1875, + "step": 9731 + }, + { + "epoch": 3.124739123454808, + "grad_norm": 0.5202605128288269, + "learning_rate": 4.4709460477737607e-07, + "loss": 0.2429, + "step": 9732 + }, + { + "epoch": 3.1250602022796596, + "grad_norm": 0.5044448971748352, + "learning_rate": 4.438035396920004e-07, + "loss": 0.2206, + "step": 9733 + }, + { + "epoch": 3.125381281104511, + "grad_norm": 0.48041826486587524, + "learning_rate": 4.405246052271772e-07, + "loss": 0.2322, + "step": 9734 + }, + { + "epoch": 3.1257023599293627, + "grad_norm": 0.5572899580001831, + "learning_rate": 4.372578017824314e-07, + "loss": 0.232, + "step": 9735 + }, + { + "epoch": 3.1260234387542143, + "grad_norm": 0.6565282344818115, + "learning_rate": 4.3400312975581117e-07, + "loss": 0.2185, + "step": 9736 + }, + { + "epoch": 3.126344517579066, + "grad_norm": 0.484170526266098, + "learning_rate": 4.307605895439104e-07, + "loss": 0.2358, + "step": 9737 + }, + { + "epoch": 3.1266655964039174, + "grad_norm": 0.6327326893806458, + "learning_rate": 4.275301815417909e-07, + "loss": 0.2278, + "step": 9738 + }, + { + "epoch": 3.1269866752287685, + "grad_norm": 0.8204041719436646, + "learning_rate": 4.2431190614309335e-07, + "loss": 0.2654, + "step": 9739 + }, + { + "epoch": 3.12730775405362, + "grad_norm": 0.3902547359466553, + "learning_rate": 4.2110576373993736e-07, + "loss": 0.2428, + "step": 9740 + }, + { + "epoch": 3.1276288328784716, + "grad_norm": 0.44226837158203125, + "learning_rate": 4.179117547229883e-07, + "loss": 0.2411, + "step": 9741 + }, + { + "epoch": 3.127949911703323, + "grad_norm": 0.4273797273635864, + "learning_rate": 4.1472987948143473e-07, + "loss": 0.2463, + "step": 9742 + }, + { + "epoch": 3.1282709905281747, + "grad_norm": 0.29561299085617065, + "learning_rate": 4.115601384029666e-07, + "loss": 0.2297, + "step": 9743 + }, + { + "epoch": 3.1285920693530263, + "grad_norm": 0.5465667247772217, + "learning_rate": 4.084025318738083e-07, + "loss": 0.6041, + "step": 9744 + }, + { + "epoch": 3.128913148177878, + "grad_norm": 0.5179415941238403, + "learning_rate": 4.0525706027870756e-07, + "loss": 0.7336, + "step": 9745 + }, + { + "epoch": 3.1292342270027294, + "grad_norm": 0.45659175515174866, + "learning_rate": 4.021237240009468e-07, + "loss": 0.3653, + "step": 9746 + }, + { + "epoch": 3.129555305827581, + "grad_norm": 0.45041006803512573, + "learning_rate": 3.9900252342228717e-07, + "loss": 0.3496, + "step": 9747 + }, + { + "epoch": 3.129876384652432, + "grad_norm": 0.29173073172569275, + "learning_rate": 3.958934589230467e-07, + "loss": 0.1509, + "step": 9748 + }, + { + "epoch": 3.1301974634772836, + "grad_norm": 0.362936407327652, + "learning_rate": 3.9279653088205584e-07, + "loss": 0.1642, + "step": 9749 + }, + { + "epoch": 3.130518542302135, + "grad_norm": 0.23348329961299896, + "learning_rate": 3.8971173967666807e-07, + "loss": 0.0935, + "step": 9750 + }, + { + "epoch": 3.1308396211269867, + "grad_norm": 0.5271691083908081, + "learning_rate": 3.866390856827495e-07, + "loss": 0.2844, + "step": 9751 + }, + { + "epoch": 3.131160699951838, + "grad_norm": 0.5729438662528992, + "learning_rate": 3.835785692747118e-07, + "loss": 0.233, + "step": 9752 + }, + { + "epoch": 3.1314817787766898, + "grad_norm": 0.7787706851959229, + "learning_rate": 3.805301908254455e-07, + "loss": 0.3528, + "step": 9753 + }, + { + "epoch": 3.1318028576015413, + "grad_norm": 0.6220112442970276, + "learning_rate": 3.774939507063979e-07, + "loss": 0.2668, + "step": 9754 + }, + { + "epoch": 3.132123936426393, + "grad_norm": 0.5777502059936523, + "learning_rate": 3.744698492875398e-07, + "loss": 0.2084, + "step": 9755 + }, + { + "epoch": 3.1324450152512444, + "grad_norm": 0.6816080808639526, + "learning_rate": 3.7145788693732086e-07, + "loss": 0.2497, + "step": 9756 + }, + { + "epoch": 3.1327660940760955, + "grad_norm": 0.6858428120613098, + "learning_rate": 3.6845806402275863e-07, + "loss": 0.2512, + "step": 9757 + }, + { + "epoch": 3.133087172900947, + "grad_norm": 0.8424202799797058, + "learning_rate": 3.654703809093607e-07, + "loss": 0.2316, + "step": 9758 + }, + { + "epoch": 3.1334082517257986, + "grad_norm": 0.6304720640182495, + "learning_rate": 3.6249483796116924e-07, + "loss": 0.232, + "step": 9759 + }, + { + "epoch": 3.13372933055065, + "grad_norm": 0.9778458476066589, + "learning_rate": 3.595314355407609e-07, + "loss": 0.3145, + "step": 9760 + }, + { + "epoch": 3.1340504093755017, + "grad_norm": 0.593328058719635, + "learning_rate": 3.565801740092023e-07, + "loss": 0.2213, + "step": 9761 + }, + { + "epoch": 3.1343714882003533, + "grad_norm": 0.8587982058525085, + "learning_rate": 3.536410537260948e-07, + "loss": 0.2974, + "step": 9762 + }, + { + "epoch": 3.134692567025205, + "grad_norm": 0.6776645183563232, + "learning_rate": 3.50714075049563e-07, + "loss": 0.2801, + "step": 9763 + }, + { + "epoch": 3.1350136458500564, + "grad_norm": 0.8762674331665039, + "learning_rate": 3.4779923833626606e-07, + "loss": 0.3066, + "step": 9764 + }, + { + "epoch": 3.135334724674908, + "grad_norm": 0.6403717994689941, + "learning_rate": 3.4489654394134205e-07, + "loss": 0.2094, + "step": 9765 + }, + { + "epoch": 3.135655803499759, + "grad_norm": 0.4854280352592468, + "learning_rate": 3.4200599221848595e-07, + "loss": 0.1921, + "step": 9766 + }, + { + "epoch": 3.1359768823246106, + "grad_norm": 0.9062448143959045, + "learning_rate": 3.3912758351991593e-07, + "loss": 0.3486, + "step": 9767 + }, + { + "epoch": 3.136297961149462, + "grad_norm": 0.7571354508399963, + "learning_rate": 3.362613181963403e-07, + "loss": 0.2229, + "step": 9768 + }, + { + "epoch": 3.1366190399743137, + "grad_norm": 0.585010826587677, + "learning_rate": 3.3340719659701313e-07, + "loss": 0.2403, + "step": 9769 + }, + { + "epoch": 3.1369401187991652, + "grad_norm": 0.6055962443351746, + "learning_rate": 3.305652190696895e-07, + "loss": 0.2285, + "step": 9770 + }, + { + "epoch": 3.137261197624017, + "grad_norm": 0.49409618973731995, + "learning_rate": 3.277353859606813e-07, + "loss": 0.1914, + "step": 9771 + }, + { + "epoch": 3.1375822764488683, + "grad_norm": 0.39980348944664, + "learning_rate": 3.249176976147683e-07, + "loss": 0.1664, + "step": 9772 + }, + { + "epoch": 3.13790335527372, + "grad_norm": 0.574813187122345, + "learning_rate": 3.2211215437528694e-07, + "loss": 0.2273, + "step": 9773 + }, + { + "epoch": 3.138224434098571, + "grad_norm": 0.5716437101364136, + "learning_rate": 3.1931875658408604e-07, + "loss": 0.2453, + "step": 9774 + }, + { + "epoch": 3.1385455129234225, + "grad_norm": 0.40409037470817566, + "learning_rate": 3.1653750458152666e-07, + "loss": 0.1933, + "step": 9775 + }, + { + "epoch": 3.138866591748274, + "grad_norm": 0.6817755103111267, + "learning_rate": 3.137683987065043e-07, + "loss": 0.2179, + "step": 9776 + }, + { + "epoch": 3.1391876705731256, + "grad_norm": 0.6137420535087585, + "learning_rate": 3.1101143929641585e-07, + "loss": 0.2484, + "step": 9777 + }, + { + "epoch": 3.139508749397977, + "grad_norm": 0.7133082747459412, + "learning_rate": 3.0826662668720364e-07, + "loss": 0.2384, + "step": 9778 + }, + { + "epoch": 3.1398298282228287, + "grad_norm": 0.6078048348426819, + "learning_rate": 3.0553396121330013e-07, + "loss": 0.2337, + "step": 9779 + }, + { + "epoch": 3.1401509070476803, + "grad_norm": 0.325295627117157, + "learning_rate": 3.0281344320768347e-07, + "loss": 0.1829, + "step": 9780 + }, + { + "epoch": 3.140471985872532, + "grad_norm": 0.5655336976051331, + "learning_rate": 3.001050730018218e-07, + "loss": 0.2414, + "step": 9781 + }, + { + "epoch": 3.1407930646973834, + "grad_norm": 0.6369791626930237, + "learning_rate": 2.974088509257511e-07, + "loss": 0.2344, + "step": 9782 + }, + { + "epoch": 3.1411141435222345, + "grad_norm": 0.42372626066207886, + "learning_rate": 2.947247773079753e-07, + "loss": 0.2057, + "step": 9783 + }, + { + "epoch": 3.141435222347086, + "grad_norm": 0.550973653793335, + "learning_rate": 2.9205285247555505e-07, + "loss": 0.2484, + "step": 9784 + }, + { + "epoch": 3.1417563011719376, + "grad_norm": 0.5076101422309875, + "learning_rate": 2.893930767540298e-07, + "loss": 0.2296, + "step": 9785 + }, + { + "epoch": 3.142077379996789, + "grad_norm": 0.4814938008785248, + "learning_rate": 2.867454504675182e-07, + "loss": 0.2261, + "step": 9786 + }, + { + "epoch": 3.1423984588216407, + "grad_norm": 0.3789692521095276, + "learning_rate": 2.841099739386066e-07, + "loss": 0.2236, + "step": 9787 + }, + { + "epoch": 3.1427195376464923, + "grad_norm": 0.3235059976577759, + "learning_rate": 2.81486647488427e-07, + "loss": 0.2032, + "step": 9788 + }, + { + "epoch": 3.143040616471344, + "grad_norm": 0.4682113826274872, + "learning_rate": 2.7887547143662373e-07, + "loss": 0.2508, + "step": 9789 + }, + { + "epoch": 3.1433616952961954, + "grad_norm": 0.48588764667510986, + "learning_rate": 2.762764461013423e-07, + "loss": 0.2209, + "step": 9790 + }, + { + "epoch": 3.143682774121047, + "grad_norm": 0.4631730914115906, + "learning_rate": 2.73689571799296e-07, + "loss": 0.2206, + "step": 9791 + }, + { + "epoch": 3.144003852945898, + "grad_norm": 0.3348900377750397, + "learning_rate": 2.7111484884567717e-07, + "loss": 0.2183, + "step": 9792 + }, + { + "epoch": 3.1443249317707496, + "grad_norm": 0.2803027927875519, + "learning_rate": 2.685522775541904e-07, + "loss": 0.2277, + "step": 9793 + }, + { + "epoch": 3.144646010595601, + "grad_norm": 0.18889014422893524, + "learning_rate": 2.660018582370971e-07, + "loss": 0.1372, + "step": 9794 + }, + { + "epoch": 3.1449670894204527, + "grad_norm": 0.5592251420021057, + "learning_rate": 2.6346359120514863e-07, + "loss": 0.627, + "step": 9795 + }, + { + "epoch": 3.145288168245304, + "grad_norm": 0.42517322301864624, + "learning_rate": 2.609374767676309e-07, + "loss": 0.4038, + "step": 9796 + }, + { + "epoch": 3.1456092470701558, + "grad_norm": 0.48470279574394226, + "learning_rate": 2.584235152323422e-07, + "loss": 0.3099, + "step": 9797 + }, + { + "epoch": 3.1459303258950073, + "grad_norm": 0.23615846037864685, + "learning_rate": 2.5592170690560414e-07, + "loss": 0.0814, + "step": 9798 + }, + { + "epoch": 3.146251404719859, + "grad_norm": 0.24394987523555756, + "learning_rate": 2.534320520922506e-07, + "loss": 0.0938, + "step": 9799 + }, + { + "epoch": 3.1465724835447104, + "grad_norm": 0.1944095492362976, + "learning_rate": 2.5095455109562795e-07, + "loss": 0.0707, + "step": 9800 + }, + { + "epoch": 3.1468935623695615, + "grad_norm": 0.470803827047348, + "learning_rate": 2.484892042176279e-07, + "loss": 0.2102, + "step": 9801 + }, + { + "epoch": 3.147214641194413, + "grad_norm": 0.5542094111442566, + "learning_rate": 2.4603601175864356e-07, + "loss": 0.3056, + "step": 9802 + }, + { + "epoch": 3.1475357200192646, + "grad_norm": 0.6499555110931396, + "learning_rate": 2.4359497401758024e-07, + "loss": 0.287, + "step": 9803 + }, + { + "epoch": 3.147856798844116, + "grad_norm": 0.690673291683197, + "learning_rate": 2.4116609129187786e-07, + "loss": 0.2237, + "step": 9804 + }, + { + "epoch": 3.1481778776689677, + "grad_norm": 0.6088657975196838, + "learning_rate": 2.387493638774774e-07, + "loss": 0.2442, + "step": 9805 + }, + { + "epoch": 3.1484989564938193, + "grad_norm": 0.6405919194221497, + "learning_rate": 2.363447920688655e-07, + "loss": 0.248, + "step": 9806 + }, + { + "epoch": 3.148820035318671, + "grad_norm": 0.5600770711898804, + "learning_rate": 2.339523761590301e-07, + "loss": 0.2073, + "step": 9807 + }, + { + "epoch": 3.1491411141435224, + "grad_norm": 0.6620354652404785, + "learning_rate": 2.315721164394713e-07, + "loss": 0.2274, + "step": 9808 + }, + { + "epoch": 3.149462192968374, + "grad_norm": 0.8232807517051697, + "learning_rate": 2.2920401320022378e-07, + "loss": 0.2655, + "step": 9809 + }, + { + "epoch": 3.149783271793225, + "grad_norm": 0.48089903593063354, + "learning_rate": 2.2684806672982338e-07, + "loss": 0.197, + "step": 9810 + }, + { + "epoch": 3.1501043506180766, + "grad_norm": 0.6705470085144043, + "learning_rate": 2.2450427731534053e-07, + "loss": 0.2304, + "step": 9811 + }, + { + "epoch": 3.150425429442928, + "grad_norm": 0.7051324248313904, + "learning_rate": 2.2217264524236892e-07, + "loss": 0.2445, + "step": 9812 + }, + { + "epoch": 3.1507465082677797, + "grad_norm": 0.6257637143135071, + "learning_rate": 2.1985317079500356e-07, + "loss": 0.2012, + "step": 9813 + }, + { + "epoch": 3.1510675870926312, + "grad_norm": 0.9062849879264832, + "learning_rate": 2.175458542558517e-07, + "loss": 0.286, + "step": 9814 + }, + { + "epoch": 3.151388665917483, + "grad_norm": 0.5609592795372009, + "learning_rate": 2.1525069590607737e-07, + "loss": 0.185, + "step": 9815 + }, + { + "epoch": 3.1517097447423343, + "grad_norm": 0.5486966371536255, + "learning_rate": 2.1296769602532352e-07, + "loss": 0.2412, + "step": 9816 + }, + { + "epoch": 3.152030823567186, + "grad_norm": 0.7732871770858765, + "learning_rate": 2.106968548917676e-07, + "loss": 0.2621, + "step": 9817 + }, + { + "epoch": 3.1523519023920374, + "grad_norm": 0.5511267781257629, + "learning_rate": 2.0843817278209942e-07, + "loss": 0.2065, + "step": 9818 + }, + { + "epoch": 3.1526729812168885, + "grad_norm": 0.5723970532417297, + "learning_rate": 2.0619164997155438e-07, + "loss": 0.2028, + "step": 9819 + }, + { + "epoch": 3.15299406004174, + "grad_norm": 0.7419432997703552, + "learning_rate": 2.0395728673383574e-07, + "loss": 0.2796, + "step": 9820 + }, + { + "epoch": 3.1533151388665916, + "grad_norm": 0.7188732624053955, + "learning_rate": 2.017350833412146e-07, + "loss": 0.1864, + "step": 9821 + }, + { + "epoch": 3.153636217691443, + "grad_norm": 0.7375403642654419, + "learning_rate": 1.9952504006446325e-07, + "loss": 0.2474, + "step": 9822 + }, + { + "epoch": 3.1539572965162948, + "grad_norm": 0.5861619114875793, + "learning_rate": 1.973271571728441e-07, + "loss": 0.2157, + "step": 9823 + }, + { + "epoch": 3.1542783753411463, + "grad_norm": 0.42802998423576355, + "learning_rate": 1.9514143493417625e-07, + "loss": 0.2182, + "step": 9824 + }, + { + "epoch": 3.154599454165998, + "grad_norm": 0.6591411828994751, + "learning_rate": 1.9296787361480216e-07, + "loss": 0.246, + "step": 9825 + }, + { + "epoch": 3.1549205329908494, + "grad_norm": 0.8120198845863342, + "learning_rate": 1.908064734795323e-07, + "loss": 0.3286, + "step": 9826 + }, + { + "epoch": 3.155241611815701, + "grad_norm": 0.4934924840927124, + "learning_rate": 1.8865723479173368e-07, + "loss": 0.1993, + "step": 9827 + }, + { + "epoch": 3.155562690640552, + "grad_norm": 0.558791995048523, + "learning_rate": 1.86520157813308e-07, + "loss": 0.2377, + "step": 9828 + }, + { + "epoch": 3.1558837694654036, + "grad_norm": 0.8275998830795288, + "learning_rate": 1.8439524280462472e-07, + "loss": 0.2888, + "step": 9829 + }, + { + "epoch": 3.156204848290255, + "grad_norm": 0.5782402157783508, + "learning_rate": 1.8228249002461006e-07, + "loss": 0.2376, + "step": 9830 + }, + { + "epoch": 3.1565259271151067, + "grad_norm": 0.5709177255630493, + "learning_rate": 1.8018189973069143e-07, + "loss": 0.2141, + "step": 9831 + }, + { + "epoch": 3.1568470059399583, + "grad_norm": 0.6545057892799377, + "learning_rate": 1.7809347217881966e-07, + "loss": 0.2365, + "step": 9832 + }, + { + "epoch": 3.15716808476481, + "grad_norm": 0.7144107222557068, + "learning_rate": 1.7601720762346897e-07, + "loss": 0.2655, + "step": 9833 + }, + { + "epoch": 3.1574891635896614, + "grad_norm": 0.4486263394355774, + "learning_rate": 1.7395310631762585e-07, + "loss": 0.1869, + "step": 9834 + }, + { + "epoch": 3.157810242414513, + "grad_norm": 0.5604619979858398, + "learning_rate": 1.7190116851280026e-07, + "loss": 0.2374, + "step": 9835 + }, + { + "epoch": 3.1581313212393645, + "grad_norm": 0.5720604658126831, + "learning_rate": 1.698613944589922e-07, + "loss": 0.2534, + "step": 9836 + }, + { + "epoch": 3.1584524000642156, + "grad_norm": 0.5002235174179077, + "learning_rate": 1.678337844047695e-07, + "loss": 0.2297, + "step": 9837 + }, + { + "epoch": 3.158773478889067, + "grad_norm": 0.519655168056488, + "learning_rate": 1.6581833859716788e-07, + "loss": 0.249, + "step": 9838 + }, + { + "epoch": 3.1590945577139187, + "grad_norm": 0.4107975959777832, + "learning_rate": 1.6381505728176872e-07, + "loss": 0.2233, + "step": 9839 + }, + { + "epoch": 3.1594156365387702, + "grad_norm": 0.30553480982780457, + "learning_rate": 1.618239407026767e-07, + "loss": 0.2119, + "step": 9840 + }, + { + "epoch": 3.1597367153636218, + "grad_norm": 0.3645618259906769, + "learning_rate": 1.598449891024978e-07, + "loss": 0.2261, + "step": 9841 + }, + { + "epoch": 3.1600577941884733, + "grad_norm": 0.3786523938179016, + "learning_rate": 1.578782027223502e-07, + "loss": 0.2326, + "step": 9842 + }, + { + "epoch": 3.160378873013325, + "grad_norm": 0.4503689110279083, + "learning_rate": 1.5592358180189782e-07, + "loss": 0.2413, + "step": 9843 + }, + { + "epoch": 3.1606999518381764, + "grad_norm": 0.5784024000167847, + "learning_rate": 1.5398112657929453e-07, + "loss": 0.5337, + "step": 9844 + }, + { + "epoch": 3.161021030663028, + "grad_norm": 0.42213118076324463, + "learning_rate": 1.520508372912288e-07, + "loss": 0.4703, + "step": 9845 + }, + { + "epoch": 3.161342109487879, + "grad_norm": 0.3703862428665161, + "learning_rate": 1.5013271417290143e-07, + "loss": 0.2417, + "step": 9846 + }, + { + "epoch": 3.1616631883127306, + "grad_norm": 0.321639746427536, + "learning_rate": 1.4822675745801429e-07, + "loss": 0.1601, + "step": 9847 + }, + { + "epoch": 3.161984267137582, + "grad_norm": 0.1442064493894577, + "learning_rate": 1.4633296737882607e-07, + "loss": 0.0541, + "step": 9848 + }, + { + "epoch": 3.1623053459624337, + "grad_norm": 0.10288975387811661, + "learning_rate": 1.4445134416607442e-07, + "loss": 0.0532, + "step": 9849 + }, + { + "epoch": 3.1626264247872853, + "grad_norm": 0.21865668892860413, + "learning_rate": 1.425818880490315e-07, + "loss": 0.0609, + "step": 9850 + }, + { + "epoch": 3.162947503612137, + "grad_norm": 0.6108369827270508, + "learning_rate": 1.4072459925548177e-07, + "loss": 0.2515, + "step": 9851 + }, + { + "epoch": 3.1632685824369884, + "grad_norm": 0.9449090361595154, + "learning_rate": 1.3887947801173307e-07, + "loss": 0.2917, + "step": 9852 + }, + { + "epoch": 3.16358966126184, + "grad_norm": 0.6836480498313904, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.231, + "step": 9853 + }, + { + "epoch": 3.1639107400866915, + "grad_norm": 0.5700660943984985, + "learning_rate": 1.3522573907145398e-07, + "loss": 0.2352, + "step": 9854 + }, + { + "epoch": 3.1642318189115426, + "grad_norm": 0.645478367805481, + "learning_rate": 1.33417121820123e-07, + "loss": 0.2189, + "step": 9855 + }, + { + "epoch": 3.164552897736394, + "grad_norm": 0.7627379894256592, + "learning_rate": 1.3162067300898084e-07, + "loss": 0.236, + "step": 9856 + }, + { + "epoch": 3.1648739765612457, + "grad_norm": 0.6092104911804199, + "learning_rate": 1.2983639285693017e-07, + "loss": 0.2363, + "step": 9857 + }, + { + "epoch": 3.1651950553860972, + "grad_norm": 0.7562824487686157, + "learning_rate": 1.2806428158138596e-07, + "loss": 0.2824, + "step": 9858 + }, + { + "epoch": 3.165516134210949, + "grad_norm": 0.7711445093154907, + "learning_rate": 1.2630433939825327e-07, + "loss": 0.2944, + "step": 9859 + }, + { + "epoch": 3.1658372130358003, + "grad_norm": 0.7503237724304199, + "learning_rate": 1.2455656652198277e-07, + "loss": 0.2468, + "step": 9860 + }, + { + "epoch": 3.166158291860652, + "grad_norm": 0.6290993690490723, + "learning_rate": 1.2282096316554858e-07, + "loss": 0.2132, + "step": 9861 + }, + { + "epoch": 3.1664793706855034, + "grad_norm": 0.7252655625343323, + "learning_rate": 1.2109752954042597e-07, + "loss": 0.2505, + "step": 9862 + }, + { + "epoch": 3.166800449510355, + "grad_norm": 0.5677751898765564, + "learning_rate": 1.193862658566025e-07, + "loss": 0.2001, + "step": 9863 + }, + { + "epoch": 3.167121528335206, + "grad_norm": 0.5677169561386108, + "learning_rate": 1.1768717232257809e-07, + "loss": 0.205, + "step": 9864 + }, + { + "epoch": 3.1674426071600577, + "grad_norm": 0.8806698322296143, + "learning_rate": 1.160002491454093e-07, + "loss": 0.3299, + "step": 9865 + }, + { + "epoch": 3.167763685984909, + "grad_norm": 0.4428505003452301, + "learning_rate": 1.1432549653063174e-07, + "loss": 0.1925, + "step": 9866 + }, + { + "epoch": 3.1680847648097608, + "grad_norm": 0.6543468236923218, + "learning_rate": 1.1266291468229328e-07, + "loss": 0.2174, + "step": 9867 + }, + { + "epoch": 3.1684058436346123, + "grad_norm": 0.6415979862213135, + "learning_rate": 1.1101250380300965e-07, + "loss": 0.243, + "step": 9868 + }, + { + "epoch": 3.168726922459464, + "grad_norm": 0.7214950323104858, + "learning_rate": 1.0937426409384221e-07, + "loss": 0.2707, + "step": 9869 + }, + { + "epoch": 3.1690480012843154, + "grad_norm": 0.7399327754974365, + "learning_rate": 1.0774819575442019e-07, + "loss": 0.2824, + "step": 9870 + }, + { + "epoch": 3.169369080109167, + "grad_norm": 0.6261020302772522, + "learning_rate": 1.0613429898287398e-07, + "loss": 0.2368, + "step": 9871 + }, + { + "epoch": 3.1696901589340185, + "grad_norm": 0.507150411605835, + "learning_rate": 1.0453257397585737e-07, + "loss": 0.2091, + "step": 9872 + }, + { + "epoch": 3.1700112377588696, + "grad_norm": 0.5187735557556152, + "learning_rate": 1.0294302092853647e-07, + "loss": 0.2175, + "step": 9873 + }, + { + "epoch": 3.170332316583721, + "grad_norm": 0.9378407001495361, + "learning_rate": 1.013656400345786e-07, + "loss": 0.2481, + "step": 9874 + }, + { + "epoch": 3.1706533954085727, + "grad_norm": 0.5098487734794617, + "learning_rate": 9.980043148619667e-08, + "loss": 0.2186, + "step": 9875 + }, + { + "epoch": 3.1709744742334243, + "grad_norm": 0.4638996422290802, + "learning_rate": 9.824739547410477e-08, + "loss": 0.2072, + "step": 9876 + }, + { + "epoch": 3.171295553058276, + "grad_norm": 0.7582216858863831, + "learning_rate": 9.670653218752934e-08, + "loss": 0.3402, + "step": 9877 + }, + { + "epoch": 3.1716166318831274, + "grad_norm": 0.42839518189430237, + "learning_rate": 9.517784181422019e-08, + "loss": 0.2069, + "step": 9878 + }, + { + "epoch": 3.171937710707979, + "grad_norm": 0.4199906885623932, + "learning_rate": 9.366132454046162e-08, + "loss": 0.2111, + "step": 9879 + }, + { + "epoch": 3.1722587895328305, + "grad_norm": 0.6500821113586426, + "learning_rate": 9.215698055100585e-08, + "loss": 0.2415, + "step": 9880 + }, + { + "epoch": 3.1725798683576816, + "grad_norm": 1.0988209247589111, + "learning_rate": 9.066481002918403e-08, + "loss": 0.2793, + "step": 9881 + }, + { + "epoch": 3.172900947182533, + "grad_norm": 0.4966912567615509, + "learning_rate": 8.918481315678406e-08, + "loss": 0.2166, + "step": 9882 + }, + { + "epoch": 3.1732220260073847, + "grad_norm": 0.4506468176841736, + "learning_rate": 8.771699011416168e-08, + "loss": 0.2084, + "step": 9883 + }, + { + "epoch": 3.1735431048322362, + "grad_norm": 0.5450963973999023, + "learning_rate": 8.626134108016271e-08, + "loss": 0.2226, + "step": 9884 + }, + { + "epoch": 3.173864183657088, + "grad_norm": 0.6653562784194946, + "learning_rate": 8.481786623214528e-08, + "loss": 0.2526, + "step": 9885 + }, + { + "epoch": 3.1741852624819393, + "grad_norm": 0.3943636417388916, + "learning_rate": 8.33865657459909e-08, + "loss": 0.2209, + "step": 9886 + }, + { + "epoch": 3.174506341306791, + "grad_norm": 0.6992161870002747, + "learning_rate": 8.196743979610455e-08, + "loss": 0.2377, + "step": 9887 + }, + { + "epoch": 3.1748274201316424, + "grad_norm": 0.40512463450431824, + "learning_rate": 8.056048855540343e-08, + "loss": 0.245, + "step": 9888 + }, + { + "epoch": 3.175148498956494, + "grad_norm": 0.5136418342590332, + "learning_rate": 7.91657121953171e-08, + "loss": 0.2392, + "step": 9889 + }, + { + "epoch": 3.175469577781345, + "grad_norm": 0.6193544864654541, + "learning_rate": 7.778311088579849e-08, + "loss": 0.255, + "step": 9890 + }, + { + "epoch": 3.1757906566061966, + "grad_norm": 0.4072697162628174, + "learning_rate": 7.641268479531283e-08, + "loss": 0.2205, + "step": 9891 + }, + { + "epoch": 3.176111735431048, + "grad_norm": 0.3897491693496704, + "learning_rate": 7.505443409083768e-08, + "loss": 0.2429, + "step": 9892 + }, + { + "epoch": 3.1764328142558997, + "grad_norm": 0.46470972895622253, + "learning_rate": 7.370835893788508e-08, + "loss": 0.2422, + "step": 9893 + }, + { + "epoch": 3.1767538930807513, + "grad_norm": 0.39090585708618164, + "learning_rate": 7.237445950044608e-08, + "loss": 0.3282, + "step": 9894 + }, + { + "epoch": 3.177074971905603, + "grad_norm": 0.5572707056999207, + "learning_rate": 7.105273594107953e-08, + "loss": 0.6177, + "step": 9895 + }, + { + "epoch": 3.1773960507304544, + "grad_norm": 0.3608555793762207, + "learning_rate": 6.974318842081217e-08, + "loss": 0.2933, + "step": 9896 + }, + { + "epoch": 3.177717129555306, + "grad_norm": 0.28094372153282166, + "learning_rate": 6.844581709921638e-08, + "loss": 0.16, + "step": 9897 + }, + { + "epoch": 3.1780382083801575, + "grad_norm": 0.4451110363006592, + "learning_rate": 6.71606221343768e-08, + "loss": 0.261, + "step": 9898 + }, + { + "epoch": 3.1783592872050086, + "grad_norm": 0.2656843364238739, + "learning_rate": 6.588760368287928e-08, + "loss": 0.0902, + "step": 9899 + }, + { + "epoch": 3.17868036602986, + "grad_norm": 0.17016752064228058, + "learning_rate": 6.462676189985528e-08, + "loss": 0.061, + "step": 9900 + }, + { + "epoch": 3.1790014448547117, + "grad_norm": 0.3899795413017273, + "learning_rate": 6.337809693891527e-08, + "loss": 0.2108, + "step": 9901 + }, + { + "epoch": 3.1793225236795633, + "grad_norm": 0.6329613924026489, + "learning_rate": 6.214160895222643e-08, + "loss": 0.2528, + "step": 9902 + }, + { + "epoch": 3.179643602504415, + "grad_norm": 0.6943358778953552, + "learning_rate": 6.09172980904238e-08, + "loss": 0.2953, + "step": 9903 + }, + { + "epoch": 3.1799646813292664, + "grad_norm": 0.8991124629974365, + "learning_rate": 5.970516450271025e-08, + "loss": 0.2841, + "step": 9904 + }, + { + "epoch": 3.180285760154118, + "grad_norm": 0.6497676968574524, + "learning_rate": 5.850520833676765e-08, + "loss": 0.2275, + "step": 9905 + }, + { + "epoch": 3.1806068389789695, + "grad_norm": 0.6367692947387695, + "learning_rate": 5.7317429738812376e-08, + "loss": 0.2095, + "step": 9906 + }, + { + "epoch": 3.180927917803821, + "grad_norm": 0.6393100619316101, + "learning_rate": 5.6141828853573106e-08, + "loss": 0.2308, + "step": 9907 + }, + { + "epoch": 3.181248996628672, + "grad_norm": 0.8095628023147583, + "learning_rate": 5.497840582429081e-08, + "loss": 0.3126, + "step": 9908 + }, + { + "epoch": 3.1815700754535237, + "grad_norm": 0.6111844182014465, + "learning_rate": 5.382716079271877e-08, + "loss": 0.2296, + "step": 9909 + }, + { + "epoch": 3.181891154278375, + "grad_norm": 0.4887303113937378, + "learning_rate": 5.268809389913365e-08, + "loss": 0.1672, + "step": 9910 + }, + { + "epoch": 3.1822122331032268, + "grad_norm": 0.6326609253883362, + "learning_rate": 5.1561205282335547e-08, + "loss": 0.2229, + "step": 9911 + }, + { + "epoch": 3.1825333119280783, + "grad_norm": 0.6799615025520325, + "learning_rate": 5.0446495079636836e-08, + "loss": 0.2754, + "step": 9912 + }, + { + "epoch": 3.18285439075293, + "grad_norm": 0.4907290041446686, + "learning_rate": 4.934396342684e-08, + "loss": 0.1849, + "step": 9913 + }, + { + "epoch": 3.1831754695777814, + "grad_norm": 0.7006059288978577, + "learning_rate": 4.825361045831534e-08, + "loss": 0.2467, + "step": 9914 + }, + { + "epoch": 3.183496548402633, + "grad_norm": 0.6643416285514832, + "learning_rate": 4.717543630688992e-08, + "loss": 0.2323, + "step": 9915 + }, + { + "epoch": 3.1838176272274845, + "grad_norm": 0.7749451994895935, + "learning_rate": 4.610944110394755e-08, + "loss": 0.2579, + "step": 9916 + }, + { + "epoch": 3.1841387060523356, + "grad_norm": 0.8002907633781433, + "learning_rate": 4.5055624979384316e-08, + "loss": 0.2735, + "step": 9917 + }, + { + "epoch": 3.184459784877187, + "grad_norm": 0.7704986929893494, + "learning_rate": 4.401398806159751e-08, + "loss": 0.2775, + "step": 9918 + }, + { + "epoch": 3.1847808637020387, + "grad_norm": 0.7502204775810242, + "learning_rate": 4.298453047749673e-08, + "loss": 0.255, + "step": 9919 + }, + { + "epoch": 3.1851019425268903, + "grad_norm": 0.4624794125556946, + "learning_rate": 4.196725235253718e-08, + "loss": 0.2155, + "step": 9920 + }, + { + "epoch": 3.185423021351742, + "grad_norm": 0.8697096705436707, + "learning_rate": 4.096215381066415e-08, + "loss": 0.3674, + "step": 9921 + }, + { + "epoch": 3.1857441001765934, + "grad_norm": 0.8463855385780334, + "learning_rate": 3.996923497434635e-08, + "loss": 0.3001, + "step": 9922 + }, + { + "epoch": 3.186065179001445, + "grad_norm": 0.7046919465065002, + "learning_rate": 3.898849596456478e-08, + "loss": 0.2382, + "step": 9923 + }, + { + "epoch": 3.1863862578262965, + "grad_norm": 0.7259396314620972, + "learning_rate": 3.8019936900812735e-08, + "loss": 0.2166, + "step": 9924 + }, + { + "epoch": 3.186707336651148, + "grad_norm": 0.8264093399047852, + "learning_rate": 3.7063557901129144e-08, + "loss": 0.2728, + "step": 9925 + }, + { + "epoch": 3.187028415475999, + "grad_norm": 0.5185015201568604, + "learning_rate": 3.61193590820208e-08, + "loss": 0.2227, + "step": 9926 + }, + { + "epoch": 3.1873494943008507, + "grad_norm": 0.9947241544723511, + "learning_rate": 3.518734055855122e-08, + "loss": 0.3701, + "step": 9927 + }, + { + "epoch": 3.1876705731257022, + "grad_norm": 0.5121382474899292, + "learning_rate": 3.4267502444274015e-08, + "loss": 0.1992, + "step": 9928 + }, + { + "epoch": 3.187991651950554, + "grad_norm": 0.5972121953964233, + "learning_rate": 3.33598448512773e-08, + "loss": 0.2321, + "step": 9929 + }, + { + "epoch": 3.1883127307754053, + "grad_norm": 0.543809175491333, + "learning_rate": 3.246436789015039e-08, + "loss": 0.2201, + "step": 9930 + }, + { + "epoch": 3.188633809600257, + "grad_norm": 0.8629385232925415, + "learning_rate": 3.1581071670006015e-08, + "loss": 0.2955, + "step": 9931 + }, + { + "epoch": 3.1889548884251084, + "grad_norm": 0.6378942728042603, + "learning_rate": 3.070995629846918e-08, + "loss": 0.2141, + "step": 9932 + }, + { + "epoch": 3.18927596724996, + "grad_norm": 0.5101456642150879, + "learning_rate": 2.985102188168831e-08, + "loss": 0.2327, + "step": 9933 + }, + { + "epoch": 3.1895970460748115, + "grad_norm": 0.6790507435798645, + "learning_rate": 2.900426852431304e-08, + "loss": 0.2728, + "step": 9934 + }, + { + "epoch": 3.1899181248996626, + "grad_norm": 0.4711005985736847, + "learning_rate": 2.8169696329527483e-08, + "loss": 0.2497, + "step": 9935 + }, + { + "epoch": 3.190239203724514, + "grad_norm": 1.078393816947937, + "learning_rate": 2.7347305399016977e-08, + "loss": 0.2378, + "step": 9936 + }, + { + "epoch": 3.1905602825493657, + "grad_norm": 0.6660891175270081, + "learning_rate": 2.6537095832990245e-08, + "loss": 0.2562, + "step": 9937 + }, + { + "epoch": 3.1908813613742173, + "grad_norm": 0.5800371170043945, + "learning_rate": 2.573906773016832e-08, + "loss": 0.2481, + "step": 9938 + }, + { + "epoch": 3.191202440199069, + "grad_norm": 0.47498077154159546, + "learning_rate": 2.4953221187784537e-08, + "loss": 0.2451, + "step": 9939 + }, + { + "epoch": 3.1915235190239204, + "grad_norm": 0.5962786078453064, + "learning_rate": 2.417955630159563e-08, + "loss": 0.2374, + "step": 9940 + }, + { + "epoch": 3.191844597848772, + "grad_norm": 0.3102143108844757, + "learning_rate": 2.3418073165870636e-08, + "loss": 0.246, + "step": 9941 + }, + { + "epoch": 3.1921656766736235, + "grad_norm": 0.6328103542327881, + "learning_rate": 2.266877187339089e-08, + "loss": 0.2762, + "step": 9942 + }, + { + "epoch": 3.192486755498475, + "grad_norm": 0.34649330377578735, + "learning_rate": 2.193165251545004e-08, + "loss": 0.237, + "step": 9943 + }, + { + "epoch": 3.192807834323326, + "grad_norm": 0.34423136711120605, + "learning_rate": 2.1206715181876226e-08, + "loss": 0.3348, + "step": 9944 + }, + { + "epoch": 3.1931289131481777, + "grad_norm": 0.44818225502967834, + "learning_rate": 2.0493959960998787e-08, + "loss": 0.4411, + "step": 9945 + }, + { + "epoch": 3.1934499919730293, + "grad_norm": 0.2298501431941986, + "learning_rate": 1.9793386939659376e-08, + "loss": 0.1117, + "step": 9946 + }, + { + "epoch": 3.193771070797881, + "grad_norm": 0.3223446011543274, + "learning_rate": 1.9104996203223037e-08, + "loss": 0.167, + "step": 9947 + }, + { + "epoch": 3.1940921496227324, + "grad_norm": 0.4785112738609314, + "learning_rate": 1.842878783557822e-08, + "loss": 0.2142, + "step": 9948 + }, + { + "epoch": 3.194413228447584, + "grad_norm": 0.2509891092777252, + "learning_rate": 1.7764761919103477e-08, + "loss": 0.1088, + "step": 9949 + }, + { + "epoch": 3.1947343072724355, + "grad_norm": 0.5531440377235413, + "learning_rate": 1.7112918534711865e-08, + "loss": 0.2591, + "step": 9950 + }, + { + "epoch": 3.195055386097287, + "grad_norm": 0.7335166931152344, + "learning_rate": 1.6473257761828732e-08, + "loss": 0.3958, + "step": 9951 + }, + { + "epoch": 3.1953764649221386, + "grad_norm": 0.8385985493659973, + "learning_rate": 1.584577967840284e-08, + "loss": 0.305, + "step": 9952 + }, + { + "epoch": 3.1956975437469897, + "grad_norm": 0.6894627809524536, + "learning_rate": 1.5230484360873044e-08, + "loss": 0.2275, + "step": 9953 + }, + { + "epoch": 3.196018622571841, + "grad_norm": 0.570214569568634, + "learning_rate": 1.4627371884234909e-08, + "loss": 0.2196, + "step": 9954 + }, + { + "epoch": 3.1963397013966928, + "grad_norm": 0.8872095346450806, + "learning_rate": 1.4036442321962995e-08, + "loss": 0.3162, + "step": 9955 + }, + { + "epoch": 3.1966607802215443, + "grad_norm": 0.6017862558364868, + "learning_rate": 1.3457695746055265e-08, + "loss": 0.2273, + "step": 9956 + }, + { + "epoch": 3.196981859046396, + "grad_norm": 0.6372138857841492, + "learning_rate": 1.2891132227033087e-08, + "loss": 0.2273, + "step": 9957 + }, + { + "epoch": 3.1973029378712474, + "grad_norm": 0.5612054467201233, + "learning_rate": 1.2336751833941229e-08, + "loss": 0.2014, + "step": 9958 + }, + { + "epoch": 3.197624016696099, + "grad_norm": 0.604500949382782, + "learning_rate": 1.1794554634314558e-08, + "loss": 0.2463, + "step": 9959 + }, + { + "epoch": 3.1979450955209505, + "grad_norm": 0.7940091490745544, + "learning_rate": 1.126454069423355e-08, + "loss": 0.3033, + "step": 9960 + }, + { + "epoch": 3.198266174345802, + "grad_norm": 0.7244638204574585, + "learning_rate": 1.074671007825767e-08, + "loss": 0.2531, + "step": 9961 + }, + { + "epoch": 3.198587253170653, + "grad_norm": 0.6201990842819214, + "learning_rate": 1.0241062849503103e-08, + "loss": 0.2504, + "step": 9962 + }, + { + "epoch": 3.1989083319955047, + "grad_norm": 0.5652727484703064, + "learning_rate": 9.747599069576119e-09, + "loss": 0.2249, + "step": 9963 + }, + { + "epoch": 3.1992294108203563, + "grad_norm": 0.810356616973877, + "learning_rate": 9.2663187986064e-09, + "loss": 0.3258, + "step": 9964 + }, + { + "epoch": 3.199550489645208, + "grad_norm": 0.8009127378463745, + "learning_rate": 8.797222095224823e-09, + "loss": 0.279, + "step": 9965 + }, + { + "epoch": 3.1998715684700594, + "grad_norm": 0.5440275073051453, + "learning_rate": 8.340309016585668e-09, + "loss": 0.2299, + "step": 9966 + }, + { + "epoch": 3.200192647294911, + "grad_norm": 0.7199362516403198, + "learning_rate": 7.895579618388827e-09, + "loss": 0.2572, + "step": 9967 + }, + { + "epoch": 3.2005137261197625, + "grad_norm": 0.5794646739959717, + "learning_rate": 7.463033954802078e-09, + "loss": 0.2327, + "step": 9968 + }, + { + "epoch": 3.200834804944614, + "grad_norm": 0.7608715295791626, + "learning_rate": 7.042672078527713e-09, + "loss": 0.284, + "step": 9969 + }, + { + "epoch": 3.2011558837694656, + "grad_norm": 0.6678571701049805, + "learning_rate": 6.634494040802519e-09, + "loss": 0.2936, + "step": 9970 + }, + { + "epoch": 3.2014769625943167, + "grad_norm": 0.7207247614860535, + "learning_rate": 6.238499891353389e-09, + "loss": 0.2576, + "step": 9971 + }, + { + "epoch": 3.2017980414191682, + "grad_norm": 0.7166668176651001, + "learning_rate": 5.854689678419512e-09, + "loss": 0.3074, + "step": 9972 + }, + { + "epoch": 3.20211912024402, + "grad_norm": 0.7045150399208069, + "learning_rate": 5.483063448785686e-09, + "loss": 0.2643, + "step": 9973 + }, + { + "epoch": 3.2024401990688713, + "grad_norm": 0.8350734114646912, + "learning_rate": 5.123621247726806e-09, + "loss": 0.2232, + "step": 9974 + }, + { + "epoch": 3.202761277893723, + "grad_norm": 0.7668434381484985, + "learning_rate": 4.776363119030069e-09, + "loss": 0.2897, + "step": 9975 + }, + { + "epoch": 3.2030823567185744, + "grad_norm": 0.4828073978424072, + "learning_rate": 4.4412891050171765e-09, + "loss": 0.2087, + "step": 9976 + }, + { + "epoch": 3.203403435543426, + "grad_norm": 0.5834342241287231, + "learning_rate": 4.1183992465221315e-09, + "loss": 0.238, + "step": 9977 + }, + { + "epoch": 3.2037245143682775, + "grad_norm": 0.6065945625305176, + "learning_rate": 3.807693582869032e-09, + "loss": 0.2669, + "step": 9978 + }, + { + "epoch": 3.204045593193129, + "grad_norm": 1.0055813789367676, + "learning_rate": 3.509172151938689e-09, + "loss": 0.3039, + "step": 9979 + }, + { + "epoch": 3.20436667201798, + "grad_norm": 0.6315884590148926, + "learning_rate": 3.222834990090906e-09, + "loss": 0.2177, + "step": 9980 + }, + { + "epoch": 3.2046877508428318, + "grad_norm": 0.5141793489456177, + "learning_rate": 2.948682132208891e-09, + "loss": 0.2139, + "step": 9981 + }, + { + "epoch": 3.2050088296676833, + "grad_norm": 0.8965477347373962, + "learning_rate": 2.6867136117214587e-09, + "loss": 0.2669, + "step": 9982 + }, + { + "epoch": 3.205329908492535, + "grad_norm": 0.4767134487628937, + "learning_rate": 2.4369294605253166e-09, + "loss": 0.2415, + "step": 9983 + }, + { + "epoch": 3.2056509873173864, + "grad_norm": 0.5410603880882263, + "learning_rate": 2.1993297090627806e-09, + "loss": 0.2093, + "step": 9984 + }, + { + "epoch": 3.205972066142238, + "grad_norm": 0.5572628378868103, + "learning_rate": 1.973914386288467e-09, + "loss": 0.2392, + "step": 9985 + }, + { + "epoch": 3.2062931449670895, + "grad_norm": 0.5035524964332581, + "learning_rate": 1.7606835196692927e-09, + "loss": 0.2171, + "step": 9986 + }, + { + "epoch": 3.206614223791941, + "grad_norm": 0.4417857229709625, + "learning_rate": 1.5596371351733752e-09, + "loss": 0.2107, + "step": 9987 + }, + { + "epoch": 3.2069353026167926, + "grad_norm": 0.646651029586792, + "learning_rate": 1.3707752573255405e-09, + "loss": 0.2278, + "step": 9988 + }, + { + "epoch": 3.2072563814416437, + "grad_norm": 0.7533266544342041, + "learning_rate": 1.1940979091074056e-09, + "loss": 0.3198, + "step": 9989 + }, + { + "epoch": 3.2075774602664953, + "grad_norm": 0.35283178091049194, + "learning_rate": 1.029605112068399e-09, + "loss": 0.2025, + "step": 9990 + }, + { + "epoch": 3.207898539091347, + "grad_norm": 0.4045920670032501, + "learning_rate": 8.772968862369446e-10, + "loss": 0.2284, + "step": 9991 + }, + { + "epoch": 3.2082196179161984, + "grad_norm": 0.4130226969718933, + "learning_rate": 7.371732501759709e-10, + "loss": 0.2453, + "step": 9992 + }, + { + "epoch": 3.20854069674105, + "grad_norm": 0.35095956921577454, + "learning_rate": 6.092342209607083e-10, + "loss": 0.2329, + "step": 9993 + }, + { + "epoch": 3.2088617755659015, + "grad_norm": 0.28531062602996826, + "learning_rate": 4.934798141786879e-10, + "loss": 0.1969, + "step": 9994 + }, + { + "epoch": 3.209182854390753, + "grad_norm": 0.4474446475505829, + "learning_rate": 3.899100439408443e-10, + "loss": 0.5176, + "step": 9995 + }, + { + "epoch": 3.2095039332156046, + "grad_norm": 0.36056339740753174, + "learning_rate": 2.985249228593112e-10, + "loss": 0.213, + "step": 9996 + }, + { + "epoch": 3.209825012040456, + "grad_norm": 0.27942565083503723, + "learning_rate": 2.1932446206962553e-10, + "loss": 0.1295, + "step": 9997 + }, + { + "epoch": 3.2101460908653072, + "grad_norm": 0.34559550881385803, + "learning_rate": 1.5230867123072756e-10, + "loss": 0.169, + "step": 9998 + }, + { + "epoch": 3.2104671696901588, + "grad_norm": 0.1161576583981514, + "learning_rate": 9.74775584916543e-11, + "loss": 0.0541, + "step": 9999 + }, + { + "epoch": 3.2107882485150103, + "grad_norm": 0.46221232414245605, + "learning_rate": 5.483113054705058e-11, + "loss": 0.1606, + "step": 10000 + } + ], + "logging_steps": 1, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.492217387409408e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}