diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13587 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.989025177533893, + "eval_steps": 500, + "global_step": 1935, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025823111684958036, + "grad_norm": 18.261793337291987, + "learning_rate": 1.0309278350515465e-07, + "loss": 2.5303, + "step": 1 + }, + { + "epoch": 0.005164622336991607, + "grad_norm": 17.801147060406052, + "learning_rate": 2.061855670103093e-07, + "loss": 2.5624, + "step": 2 + }, + { + "epoch": 0.007746933505487412, + "grad_norm": 18.16995107477998, + "learning_rate": 3.0927835051546394e-07, + "loss": 2.5315, + "step": 3 + }, + { + "epoch": 0.010329244673983214, + "grad_norm": 18.01568136556937, + "learning_rate": 4.123711340206186e-07, + "loss": 2.4765, + "step": 4 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 17.515246452891258, + "learning_rate": 5.154639175257732e-07, + "loss": 2.5302, + "step": 5 + }, + { + "epoch": 0.015493867010974823, + "grad_norm": 17.590449970545567, + "learning_rate": 6.185567010309279e-07, + "loss": 2.5035, + "step": 6 + }, + { + "epoch": 0.018076178179470628, + "grad_norm": 17.860290443777686, + "learning_rate": 7.216494845360824e-07, + "loss": 2.4856, + "step": 7 + }, + { + "epoch": 0.02065848934796643, + "grad_norm": 16.457931144604892, + "learning_rate": 8.247422680412372e-07, + "loss": 2.5034, + "step": 8 + }, + { + "epoch": 0.023240800516462233, + "grad_norm": 15.470780109900105, + "learning_rate": 9.278350515463919e-07, + "loss": 2.4852, + "step": 9 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 15.512665472720686, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.4699, + "step": 10 + }, + { + "epoch": 0.028405422853453842, + "grad_norm": 11.58357921033743, + "learning_rate": 1.134020618556701e-06, + "loss": 2.4699, + "step": 11 + }, + { + "epoch": 0.030987734021949646, + "grad_norm": 10.860729184020308, + "learning_rate": 1.2371134020618557e-06, + "loss": 2.4071, + "step": 12 + }, + { + "epoch": 0.03357004519044545, + "grad_norm": 9.989819033444546, + "learning_rate": 1.3402061855670104e-06, + "loss": 2.3805, + "step": 13 + }, + { + "epoch": 0.036152356358941255, + "grad_norm": 4.5174463153202415, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.2546, + "step": 14 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 4.393268427169014, + "learning_rate": 1.5463917525773197e-06, + "loss": 2.2604, + "step": 15 + }, + { + "epoch": 0.04131697869593286, + "grad_norm": 4.427841764828794, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.2923, + "step": 16 + }, + { + "epoch": 0.04389928986442866, + "grad_norm": 4.116274907311869, + "learning_rate": 1.7525773195876288e-06, + "loss": 2.2216, + "step": 17 + }, + { + "epoch": 0.046481601032924466, + "grad_norm": 4.133640482984593, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.2106, + "step": 18 + }, + { + "epoch": 0.04906391220142027, + "grad_norm": 4.395953110017593, + "learning_rate": 1.9587628865979384e-06, + "loss": 2.0858, + "step": 19 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 4.474700218464569, + "learning_rate": 2.061855670103093e-06, + "loss": 2.0397, + "step": 20 + }, + { + "epoch": 0.05422853453841188, + "grad_norm": 4.4282827611154545, + "learning_rate": 2.1649484536082477e-06, + "loss": 1.9898, + "step": 21 + }, + { + "epoch": 0.056810845706907684, + "grad_norm": 4.327539626304312, + "learning_rate": 2.268041237113402e-06, + "loss": 1.9951, + "step": 22 + }, + { + "epoch": 0.05939315687540349, + "grad_norm": 4.120482191834098, + "learning_rate": 2.3711340206185566e-06, + "loss": 1.9155, + "step": 23 + }, + { + "epoch": 0.06197546804389929, + "grad_norm": 3.8070009573204358, + "learning_rate": 2.4742268041237115e-06, + "loss": 1.8995, + "step": 24 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 4.212221981103606, + "learning_rate": 2.577319587628866e-06, + "loss": 1.8671, + "step": 25 + }, + { + "epoch": 0.0671400903808909, + "grad_norm": 4.058515787932264, + "learning_rate": 2.680412371134021e-06, + "loss": 1.624, + "step": 26 + }, + { + "epoch": 0.0697224015493867, + "grad_norm": 2.6837920364601398, + "learning_rate": 2.7835051546391757e-06, + "loss": 1.6006, + "step": 27 + }, + { + "epoch": 0.07230471271788251, + "grad_norm": 2.3134143191011924, + "learning_rate": 2.8865979381443297e-06, + "loss": 1.5962, + "step": 28 + }, + { + "epoch": 0.07488702388637831, + "grad_norm": 2.10252758773887, + "learning_rate": 2.9896907216494846e-06, + "loss": 1.5671, + "step": 29 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 1.5299035293940784, + "learning_rate": 3.0927835051546395e-06, + "loss": 1.5472, + "step": 30 + }, + { + "epoch": 0.08005164622336991, + "grad_norm": 1.2378467992780195, + "learning_rate": 3.195876288659794e-06, + "loss": 1.5826, + "step": 31 + }, + { + "epoch": 0.08263395739186571, + "grad_norm": 1.1693886031886611, + "learning_rate": 3.298969072164949e-06, + "loss": 1.4891, + "step": 32 + }, + { + "epoch": 0.08521626856036152, + "grad_norm": 1.1401616730246562, + "learning_rate": 3.4020618556701037e-06, + "loss": 1.4982, + "step": 33 + }, + { + "epoch": 0.08779857972885732, + "grad_norm": 1.0954194107096582, + "learning_rate": 3.5051546391752577e-06, + "loss": 1.5325, + "step": 34 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 1.0579698812847145, + "learning_rate": 3.6082474226804126e-06, + "loss": 1.4814, + "step": 35 + }, + { + "epoch": 0.09296320206584893, + "grad_norm": 0.9653972853589035, + "learning_rate": 3.7113402061855674e-06, + "loss": 1.4494, + "step": 36 + }, + { + "epoch": 0.09554551323434474, + "grad_norm": 0.9645799739429384, + "learning_rate": 3.814432989690722e-06, + "loss": 1.5063, + "step": 37 + }, + { + "epoch": 0.09812782440284054, + "grad_norm": 1.00902463704413, + "learning_rate": 3.917525773195877e-06, + "loss": 1.4683, + "step": 38 + }, + { + "epoch": 0.10071013557133635, + "grad_norm": 0.8783590838731776, + "learning_rate": 4.020618556701032e-06, + "loss": 1.4453, + "step": 39 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 0.8107745608541321, + "learning_rate": 4.123711340206186e-06, + "loss": 1.4753, + "step": 40 + }, + { + "epoch": 0.10587475790832795, + "grad_norm": 0.789499449431658, + "learning_rate": 4.2268041237113405e-06, + "loss": 1.4376, + "step": 41 + }, + { + "epoch": 0.10845706907682376, + "grad_norm": 0.8025862979836251, + "learning_rate": 4.329896907216495e-06, + "loss": 1.4394, + "step": 42 + }, + { + "epoch": 0.11103938024531956, + "grad_norm": 0.7735255000850332, + "learning_rate": 4.4329896907216494e-06, + "loss": 1.4252, + "step": 43 + }, + { + "epoch": 0.11362169141381537, + "grad_norm": 0.655685351955956, + "learning_rate": 4.536082474226804e-06, + "loss": 1.4297, + "step": 44 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.6434305676734384, + "learning_rate": 4.639175257731959e-06, + "loss": 1.4067, + "step": 45 + }, + { + "epoch": 0.11878631375080698, + "grad_norm": 0.6261090972732297, + "learning_rate": 4.742268041237113e-06, + "loss": 1.4101, + "step": 46 + }, + { + "epoch": 0.12136862491930278, + "grad_norm": 0.6073212568723447, + "learning_rate": 4.845360824742268e-06, + "loss": 1.4172, + "step": 47 + }, + { + "epoch": 0.12395093608779859, + "grad_norm": 0.5593992251138936, + "learning_rate": 4.948453608247423e-06, + "loss": 1.3798, + "step": 48 + }, + { + "epoch": 0.1265332472562944, + "grad_norm": 0.5922984047206602, + "learning_rate": 5.051546391752578e-06, + "loss": 1.3858, + "step": 49 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.6051805656066229, + "learning_rate": 5.154639175257732e-06, + "loss": 1.3836, + "step": 50 + }, + { + "epoch": 0.131697869593286, + "grad_norm": 0.5691467307958845, + "learning_rate": 5.257731958762888e-06, + "loss": 1.3841, + "step": 51 + }, + { + "epoch": 0.1342801807617818, + "grad_norm": 0.5693752131599139, + "learning_rate": 5.360824742268042e-06, + "loss": 1.3529, + "step": 52 + }, + { + "epoch": 0.1368624919302776, + "grad_norm": 0.5695922879694713, + "learning_rate": 5.463917525773196e-06, + "loss": 1.3599, + "step": 53 + }, + { + "epoch": 0.1394448030987734, + "grad_norm": 0.5653409792135605, + "learning_rate": 5.567010309278351e-06, + "loss": 1.359, + "step": 54 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.5181379607950953, + "learning_rate": 5.670103092783505e-06, + "loss": 1.358, + "step": 55 + }, + { + "epoch": 0.14460942543576502, + "grad_norm": 0.5616919575863237, + "learning_rate": 5.7731958762886594e-06, + "loss": 1.3925, + "step": 56 + }, + { + "epoch": 0.14719173660426083, + "grad_norm": 0.5785012034706822, + "learning_rate": 5.876288659793815e-06, + "loss": 1.3902, + "step": 57 + }, + { + "epoch": 0.14977404777275663, + "grad_norm": 0.5304602501774517, + "learning_rate": 5.979381443298969e-06, + "loss": 1.3516, + "step": 58 + }, + { + "epoch": 0.15235635894125243, + "grad_norm": 0.530893892325567, + "learning_rate": 6.082474226804124e-06, + "loss": 1.3556, + "step": 59 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.5578192633717619, + "learning_rate": 6.185567010309279e-06, + "loss": 1.3609, + "step": 60 + }, + { + "epoch": 0.15752098127824402, + "grad_norm": 0.5513788256965562, + "learning_rate": 6.288659793814433e-06, + "loss": 1.3494, + "step": 61 + }, + { + "epoch": 0.16010329244673982, + "grad_norm": 0.5357427336170907, + "learning_rate": 6.391752577319588e-06, + "loss": 1.3338, + "step": 62 + }, + { + "epoch": 0.16268560361523562, + "grad_norm": 0.5023717761905121, + "learning_rate": 6.494845360824743e-06, + "loss": 1.3329, + "step": 63 + }, + { + "epoch": 0.16526791478373143, + "grad_norm": 0.5667763878793689, + "learning_rate": 6.597938144329898e-06, + "loss": 1.3091, + "step": 64 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5321682781083598, + "learning_rate": 6.701030927835052e-06, + "loss": 1.4019, + "step": 65 + }, + { + "epoch": 0.17043253712072304, + "grad_norm": 0.5281078850382545, + "learning_rate": 6.804123711340207e-06, + "loss": 1.2957, + "step": 66 + }, + { + "epoch": 0.17301484828921884, + "grad_norm": 0.5517071277006239, + "learning_rate": 6.907216494845361e-06, + "loss": 1.3387, + "step": 67 + }, + { + "epoch": 0.17559715945771465, + "grad_norm": 0.49767879157016104, + "learning_rate": 7.010309278350515e-06, + "loss": 1.3428, + "step": 68 + }, + { + "epoch": 0.17817947062621045, + "grad_norm": 0.5333338011710018, + "learning_rate": 7.113402061855671e-06, + "loss": 1.3275, + "step": 69 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.5436738511723471, + "learning_rate": 7.216494845360825e-06, + "loss": 1.323, + "step": 70 + }, + { + "epoch": 0.18334409296320206, + "grad_norm": 0.510166704780262, + "learning_rate": 7.319587628865979e-06, + "loss": 1.3337, + "step": 71 + }, + { + "epoch": 0.18592640413169786, + "grad_norm": 0.5367005388657456, + "learning_rate": 7.422680412371135e-06, + "loss": 1.3157, + "step": 72 + }, + { + "epoch": 0.18850871530019367, + "grad_norm": 0.5448486484799029, + "learning_rate": 7.525773195876289e-06, + "loss": 1.3016, + "step": 73 + }, + { + "epoch": 0.19109102646868947, + "grad_norm": 0.5049919202829503, + "learning_rate": 7.628865979381444e-06, + "loss": 1.314, + "step": 74 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.5203343239140137, + "learning_rate": 7.731958762886599e-06, + "loss": 1.3402, + "step": 75 + }, + { + "epoch": 0.19625564880568108, + "grad_norm": 0.5295426487296165, + "learning_rate": 7.835051546391754e-06, + "loss": 1.2937, + "step": 76 + }, + { + "epoch": 0.1988379599741769, + "grad_norm": 0.516845033941673, + "learning_rate": 7.938144329896907e-06, + "loss": 1.3334, + "step": 77 + }, + { + "epoch": 0.2014202711426727, + "grad_norm": 0.5195834990483513, + "learning_rate": 8.041237113402063e-06, + "loss": 1.3398, + "step": 78 + }, + { + "epoch": 0.2040025823111685, + "grad_norm": 0.5437220214849503, + "learning_rate": 8.144329896907216e-06, + "loss": 1.3328, + "step": 79 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.5316348277354109, + "learning_rate": 8.247422680412371e-06, + "loss": 1.2668, + "step": 80 + }, + { + "epoch": 0.2091672046481601, + "grad_norm": 0.5367859922800738, + "learning_rate": 8.350515463917526e-06, + "loss": 1.3455, + "step": 81 + }, + { + "epoch": 0.2117495158166559, + "grad_norm": 0.5330729877188181, + "learning_rate": 8.453608247422681e-06, + "loss": 1.3012, + "step": 82 + }, + { + "epoch": 0.2143318269851517, + "grad_norm": 0.5289538334232236, + "learning_rate": 8.556701030927836e-06, + "loss": 1.2987, + "step": 83 + }, + { + "epoch": 0.21691413815364752, + "grad_norm": 0.5271186274617113, + "learning_rate": 8.65979381443299e-06, + "loss": 1.3091, + "step": 84 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.5425463627416961, + "learning_rate": 8.762886597938146e-06, + "loss": 1.3312, + "step": 85 + }, + { + "epoch": 0.22207876049063913, + "grad_norm": 0.5002639279766852, + "learning_rate": 8.865979381443299e-06, + "loss": 1.297, + "step": 86 + }, + { + "epoch": 0.22466107165913493, + "grad_norm": 0.5217277201869615, + "learning_rate": 8.969072164948455e-06, + "loss": 1.289, + "step": 87 + }, + { + "epoch": 0.22724338282763074, + "grad_norm": 0.5187640962084964, + "learning_rate": 9.072164948453609e-06, + "loss": 1.2987, + "step": 88 + }, + { + "epoch": 0.22982569399612654, + "grad_norm": 0.5338044768965542, + "learning_rate": 9.175257731958764e-06, + "loss": 1.3224, + "step": 89 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.5154318950249379, + "learning_rate": 9.278350515463918e-06, + "loss": 1.2765, + "step": 90 + }, + { + "epoch": 0.23499031633311815, + "grad_norm": 0.5528419081350959, + "learning_rate": 9.381443298969073e-06, + "loss": 1.2546, + "step": 91 + }, + { + "epoch": 0.23757262750161395, + "grad_norm": 0.5214303815188377, + "learning_rate": 9.484536082474226e-06, + "loss": 1.3196, + "step": 92 + }, + { + "epoch": 0.24015493867010976, + "grad_norm": 0.5526561516212597, + "learning_rate": 9.587628865979383e-06, + "loss": 1.3079, + "step": 93 + }, + { + "epoch": 0.24273724983860556, + "grad_norm": 0.5222706312908064, + "learning_rate": 9.690721649484536e-06, + "loss": 1.3066, + "step": 94 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.5012834034257991, + "learning_rate": 9.793814432989691e-06, + "loss": 1.2722, + "step": 95 + }, + { + "epoch": 0.24790187217559717, + "grad_norm": 0.546506717931034, + "learning_rate": 9.896907216494846e-06, + "loss": 1.2797, + "step": 96 + }, + { + "epoch": 0.25048418334409295, + "grad_norm": 0.5253068189240363, + "learning_rate": 1e-05, + "loss": 1.3317, + "step": 97 + }, + { + "epoch": 0.2530664945125888, + "grad_norm": 0.5611013083746247, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.3025, + "step": 98 + }, + { + "epoch": 0.25564880568108456, + "grad_norm": 0.5298450611788376, + "learning_rate": 1.0206185567010309e-05, + "loss": 1.3036, + "step": 99 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5099158069328582, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.3022, + "step": 100 + }, + { + "epoch": 0.26081342801807617, + "grad_norm": 0.5293692963261973, + "learning_rate": 1.041237113402062e-05, + "loss": 1.3146, + "step": 101 + }, + { + "epoch": 0.263395739186572, + "grad_norm": 0.5244293197527762, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.2885, + "step": 102 + }, + { + "epoch": 0.2659780503550678, + "grad_norm": 0.5033038451383925, + "learning_rate": 1.0618556701030928e-05, + "loss": 1.2578, + "step": 103 + }, + { + "epoch": 0.2685603615235636, + "grad_norm": 0.5274373232659619, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.299, + "step": 104 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.5444726311980428, + "learning_rate": 1.0824742268041238e-05, + "loss": 1.2929, + "step": 105 + }, + { + "epoch": 0.2737249838605552, + "grad_norm": 0.5326713139943118, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.2753, + "step": 106 + }, + { + "epoch": 0.276307295029051, + "grad_norm": 0.5380363136345329, + "learning_rate": 1.1030927835051548e-05, + "loss": 1.3309, + "step": 107 + }, + { + "epoch": 0.2788896061975468, + "grad_norm": 0.5326668305211691, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.2572, + "step": 108 + }, + { + "epoch": 0.2814719173660426, + "grad_norm": 0.546468614229743, + "learning_rate": 1.1237113402061856e-05, + "loss": 1.3002, + "step": 109 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.5702827216999251, + "learning_rate": 1.134020618556701e-05, + "loss": 1.2785, + "step": 110 + }, + { + "epoch": 0.2866365397030342, + "grad_norm": 0.6279431033493682, + "learning_rate": 1.1443298969072166e-05, + "loss": 1.293, + "step": 111 + }, + { + "epoch": 0.28921885087153004, + "grad_norm": 0.531097886388855, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.2583, + "step": 112 + }, + { + "epoch": 0.2918011620400258, + "grad_norm": 0.5599048942527884, + "learning_rate": 1.1649484536082475e-05, + "loss": 1.2812, + "step": 113 + }, + { + "epoch": 0.29438347320852165, + "grad_norm": 0.5156492032600126, + "learning_rate": 1.175257731958763e-05, + "loss": 1.2593, + "step": 114 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.5204640872347789, + "learning_rate": 1.1855670103092785e-05, + "loss": 1.2222, + "step": 115 + }, + { + "epoch": 0.29954809554551326, + "grad_norm": 0.5242605009448421, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.2452, + "step": 116 + }, + { + "epoch": 0.30213040671400904, + "grad_norm": 0.5194114061316515, + "learning_rate": 1.2061855670103093e-05, + "loss": 1.2844, + "step": 117 + }, + { + "epoch": 0.30471271788250487, + "grad_norm": 0.517182795982599, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.2494, + "step": 118 + }, + { + "epoch": 0.30729502905100065, + "grad_norm": 0.5322718855106631, + "learning_rate": 1.2268041237113405e-05, + "loss": 1.2853, + "step": 119 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.5477767784263254, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.266, + "step": 120 + }, + { + "epoch": 0.31245965138799225, + "grad_norm": 0.5409220884054373, + "learning_rate": 1.2474226804123713e-05, + "loss": 1.2862, + "step": 121 + }, + { + "epoch": 0.31504196255648803, + "grad_norm": 0.5278483491044216, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.2431, + "step": 122 + }, + { + "epoch": 0.31762427372498386, + "grad_norm": 0.5366419552263015, + "learning_rate": 1.268041237113402e-05, + "loss": 1.2665, + "step": 123 + }, + { + "epoch": 0.32020658489347964, + "grad_norm": 0.5484453793169465, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.2466, + "step": 124 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.5707617704740593, + "learning_rate": 1.2886597938144332e-05, + "loss": 1.2801, + "step": 125 + }, + { + "epoch": 0.32537120723047125, + "grad_norm": 0.5545965695444451, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.2627, + "step": 126 + }, + { + "epoch": 0.3279535183989671, + "grad_norm": 0.545566481033505, + "learning_rate": 1.309278350515464e-05, + "loss": 1.2662, + "step": 127 + }, + { + "epoch": 0.33053582956746286, + "grad_norm": 0.614998937339536, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.278, + "step": 128 + }, + { + "epoch": 0.3331181407359587, + "grad_norm": 0.8515042792729816, + "learning_rate": 1.3298969072164948e-05, + "loss": 1.2528, + "step": 129 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.5478821425580377, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.2626, + "step": 130 + }, + { + "epoch": 0.3382827630729503, + "grad_norm": 0.5793920120710241, + "learning_rate": 1.350515463917526e-05, + "loss": 1.229, + "step": 131 + }, + { + "epoch": 0.3408650742414461, + "grad_norm": 0.6386990352009216, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.2371, + "step": 132 + }, + { + "epoch": 0.3434473854099419, + "grad_norm": 0.5565581664986918, + "learning_rate": 1.3711340206185568e-05, + "loss": 1.2155, + "step": 133 + }, + { + "epoch": 0.3460296965784377, + "grad_norm": 0.5946237867227266, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.2841, + "step": 134 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.5521016353163, + "learning_rate": 1.3917525773195878e-05, + "loss": 1.2551, + "step": 135 + }, + { + "epoch": 0.3511943189154293, + "grad_norm": 0.5385185628561531, + "learning_rate": 1.402061855670103e-05, + "loss": 1.2419, + "step": 136 + }, + { + "epoch": 0.3537766300839251, + "grad_norm": 0.5193240629735151, + "learning_rate": 1.4123711340206187e-05, + "loss": 1.2539, + "step": 137 + }, + { + "epoch": 0.3563589412524209, + "grad_norm": 0.556121240094166, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.2303, + "step": 138 + }, + { + "epoch": 0.35894125242091673, + "grad_norm": 0.5218247569723863, + "learning_rate": 1.4329896907216495e-05, + "loss": 1.2644, + "step": 139 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.530437093642517, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2497, + "step": 140 + }, + { + "epoch": 0.36410587475790834, + "grad_norm": 0.5484440453142011, + "learning_rate": 1.4536082474226805e-05, + "loss": 1.2324, + "step": 141 + }, + { + "epoch": 0.3666881859264041, + "grad_norm": 0.5493692192470679, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2717, + "step": 142 + }, + { + "epoch": 0.36927049709489995, + "grad_norm": 0.5413123502333834, + "learning_rate": 1.4742268041237115e-05, + "loss": 1.2103, + "step": 143 + }, + { + "epoch": 0.37185280826339573, + "grad_norm": 0.556751121901872, + "learning_rate": 1.484536082474227e-05, + "loss": 1.2532, + "step": 144 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.5408139067043912, + "learning_rate": 1.4948453608247425e-05, + "loss": 1.2995, + "step": 145 + }, + { + "epoch": 0.37701743060038734, + "grad_norm": 0.567109158025857, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2412, + "step": 146 + }, + { + "epoch": 0.37959974176888317, + "grad_norm": 0.5489463445020752, + "learning_rate": 1.5154639175257733e-05, + "loss": 1.2601, + "step": 147 + }, + { + "epoch": 0.38218205293737895, + "grad_norm": 0.5269782212095577, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.2105, + "step": 148 + }, + { + "epoch": 0.3847643641058748, + "grad_norm": 0.5476377594621987, + "learning_rate": 1.5360824742268042e-05, + "loss": 1.251, + "step": 149 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.5851100678927004, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.2722, + "step": 150 + }, + { + "epoch": 0.3899289864428664, + "grad_norm": 0.5906959266553361, + "learning_rate": 1.5567010309278352e-05, + "loss": 1.2605, + "step": 151 + }, + { + "epoch": 0.39251129761136216, + "grad_norm": 0.5812493666723612, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.278, + "step": 152 + }, + { + "epoch": 0.395093608779858, + "grad_norm": 0.6197633946415279, + "learning_rate": 1.5773195876288662e-05, + "loss": 1.2062, + "step": 153 + }, + { + "epoch": 0.3976759199483538, + "grad_norm": 0.5382604894375081, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.1994, + "step": 154 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.5919569403332751, + "learning_rate": 1.597938144329897e-05, + "loss": 1.2151, + "step": 155 + }, + { + "epoch": 0.4028405422853454, + "grad_norm": 0.5540942763865888, + "learning_rate": 1.6082474226804127e-05, + "loss": 1.2245, + "step": 156 + }, + { + "epoch": 0.4054228534538412, + "grad_norm": 0.6456250154614755, + "learning_rate": 1.618556701030928e-05, + "loss": 1.2407, + "step": 157 + }, + { + "epoch": 0.408005164622337, + "grad_norm": 0.5495270617698018, + "learning_rate": 1.6288659793814433e-05, + "loss": 1.2319, + "step": 158 + }, + { + "epoch": 0.41058747579083277, + "grad_norm": 0.6456750468117451, + "learning_rate": 1.6391752577319588e-05, + "loss": 1.2314, + "step": 159 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.5361216417752698, + "learning_rate": 1.6494845360824743e-05, + "loss": 1.2252, + "step": 160 + }, + { + "epoch": 0.4157520981278244, + "grad_norm": 0.5740356212634748, + "learning_rate": 1.65979381443299e-05, + "loss": 1.2199, + "step": 161 + }, + { + "epoch": 0.4183344092963202, + "grad_norm": 0.5600044764299427, + "learning_rate": 1.6701030927835052e-05, + "loss": 1.2386, + "step": 162 + }, + { + "epoch": 0.420916720464816, + "grad_norm": 0.5752486574243302, + "learning_rate": 1.6804123711340207e-05, + "loss": 1.2745, + "step": 163 + }, + { + "epoch": 0.4234990316333118, + "grad_norm": 0.5653976115621461, + "learning_rate": 1.6907216494845362e-05, + "loss": 1.2294, + "step": 164 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.6075219487510116, + "learning_rate": 1.7010309278350517e-05, + "loss": 1.2719, + "step": 165 + }, + { + "epoch": 0.4286636539703034, + "grad_norm": 0.606371230053945, + "learning_rate": 1.7113402061855672e-05, + "loss": 1.2112, + "step": 166 + }, + { + "epoch": 0.4312459651387992, + "grad_norm": 0.6078879765053119, + "learning_rate": 1.7216494845360827e-05, + "loss": 1.264, + "step": 167 + }, + { + "epoch": 0.43382827630729504, + "grad_norm": 0.5767804822867032, + "learning_rate": 1.731958762886598e-05, + "loss": 1.2515, + "step": 168 + }, + { + "epoch": 0.4364105874757908, + "grad_norm": 0.6130923630273873, + "learning_rate": 1.7422680412371137e-05, + "loss": 1.2038, + "step": 169 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.5235102985682106, + "learning_rate": 1.752577319587629e-05, + "loss": 1.237, + "step": 170 + }, + { + "epoch": 0.4415752098127824, + "grad_norm": 0.6204241509904728, + "learning_rate": 1.7628865979381443e-05, + "loss": 1.2653, + "step": 171 + }, + { + "epoch": 0.44415752098127825, + "grad_norm": 0.5839862114241418, + "learning_rate": 1.7731958762886598e-05, + "loss": 1.2268, + "step": 172 + }, + { + "epoch": 0.44673983214977403, + "grad_norm": 0.5379286158874602, + "learning_rate": 1.7835051546391756e-05, + "loss": 1.2237, + "step": 173 + }, + { + "epoch": 0.44932214331826986, + "grad_norm": 0.5773716297805083, + "learning_rate": 1.793814432989691e-05, + "loss": 1.2563, + "step": 174 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.5310519709213946, + "learning_rate": 1.8041237113402062e-05, + "loss": 1.1999, + "step": 175 + }, + { + "epoch": 0.45448676565526147, + "grad_norm": 0.5849880582279767, + "learning_rate": 1.8144329896907217e-05, + "loss": 1.2286, + "step": 176 + }, + { + "epoch": 0.45706907682375725, + "grad_norm": 0.5412178590822653, + "learning_rate": 1.8247422680412372e-05, + "loss": 1.2008, + "step": 177 + }, + { + "epoch": 0.4596513879922531, + "grad_norm": 0.6133621047960811, + "learning_rate": 1.8350515463917527e-05, + "loss": 1.2378, + "step": 178 + }, + { + "epoch": 0.46223369916074886, + "grad_norm": 0.528699955767748, + "learning_rate": 1.8453608247422682e-05, + "loss": 1.2259, + "step": 179 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5279317130021535, + "learning_rate": 1.8556701030927837e-05, + "loss": 1.2412, + "step": 180 + }, + { + "epoch": 0.46739832149774047, + "grad_norm": 0.5853559551993786, + "learning_rate": 1.865979381443299e-05, + "loss": 1.2214, + "step": 181 + }, + { + "epoch": 0.4699806326662363, + "grad_norm": 0.5424233833428941, + "learning_rate": 1.8762886597938147e-05, + "loss": 1.2458, + "step": 182 + }, + { + "epoch": 0.4725629438347321, + "grad_norm": 0.5106671423999047, + "learning_rate": 1.88659793814433e-05, + "loss": 1.2233, + "step": 183 + }, + { + "epoch": 0.4751452550032279, + "grad_norm": 0.5324533214858636, + "learning_rate": 1.8969072164948453e-05, + "loss": 1.2069, + "step": 184 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5646853613253074, + "learning_rate": 1.907216494845361e-05, + "loss": 1.252, + "step": 185 + }, + { + "epoch": 0.4803098773402195, + "grad_norm": 0.5937004053571758, + "learning_rate": 1.9175257731958766e-05, + "loss": 1.2147, + "step": 186 + }, + { + "epoch": 0.4828921885087153, + "grad_norm": 0.5562505209372693, + "learning_rate": 1.927835051546392e-05, + "loss": 1.2675, + "step": 187 + }, + { + "epoch": 0.4854744996772111, + "grad_norm": 0.5563014952202203, + "learning_rate": 1.9381443298969072e-05, + "loss": 1.2189, + "step": 188 + }, + { + "epoch": 0.4880568108457069, + "grad_norm": 0.5264086037575215, + "learning_rate": 1.9484536082474227e-05, + "loss": 1.2233, + "step": 189 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.5151034326104902, + "learning_rate": 1.9587628865979382e-05, + "loss": 1.1885, + "step": 190 + }, + { + "epoch": 0.4932214331826985, + "grad_norm": 0.5034288685234395, + "learning_rate": 1.969072164948454e-05, + "loss": 1.1903, + "step": 191 + }, + { + "epoch": 0.49580374435119434, + "grad_norm": 0.5307684939002809, + "learning_rate": 1.9793814432989692e-05, + "loss": 1.1928, + "step": 192 + }, + { + "epoch": 0.4983860555196901, + "grad_norm": 0.5230091554872849, + "learning_rate": 1.9896907216494847e-05, + "loss": 1.2519, + "step": 193 + }, + { + "epoch": 0.5009683666881859, + "grad_norm": 0.5226317801919913, + "learning_rate": 2e-05, + "loss": 1.2413, + "step": 194 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.5482333261742753, + "learning_rate": 1.9999983719336895e-05, + "loss": 1.2165, + "step": 195 + }, + { + "epoch": 0.5061329890251776, + "grad_norm": 0.5162294083726316, + "learning_rate": 1.999993487740058e-05, + "loss": 1.241, + "step": 196 + }, + { + "epoch": 0.5087153001936734, + "grad_norm": 0.5914236651547329, + "learning_rate": 1.99998534743501e-05, + "loss": 1.2174, + "step": 197 + }, + { + "epoch": 0.5112976113621691, + "grad_norm": 0.5318708983837044, + "learning_rate": 1.9999739510450505e-05, + "loss": 1.2061, + "step": 198 + }, + { + "epoch": 0.513879922530665, + "grad_norm": 0.5320871675914947, + "learning_rate": 1.9999592986072886e-05, + "loss": 1.2102, + "step": 199 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.520490178789329, + "learning_rate": 1.999941390169434e-05, + "loss": 1.2041, + "step": 200 + }, + { + "epoch": 0.5190445448676565, + "grad_norm": 0.5771804416096503, + "learning_rate": 1.9999202257897994e-05, + "loss": 1.2208, + "step": 201 + }, + { + "epoch": 0.5216268560361523, + "grad_norm": 0.5820688393505574, + "learning_rate": 1.9998958055372984e-05, + "loss": 1.2535, + "step": 202 + }, + { + "epoch": 0.5242091672046482, + "grad_norm": 0.5725413352628053, + "learning_rate": 1.9998681294914463e-05, + "loss": 1.2003, + "step": 203 + }, + { + "epoch": 0.526791478373144, + "grad_norm": 0.5813889118499928, + "learning_rate": 1.999837197742361e-05, + "loss": 1.1826, + "step": 204 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.543697742052886, + "learning_rate": 1.9998030103907594e-05, + "loss": 1.2383, + "step": 205 + }, + { + "epoch": 0.5319561007101355, + "grad_norm": 0.5737878782200372, + "learning_rate": 1.9997655675479604e-05, + "loss": 1.2261, + "step": 206 + }, + { + "epoch": 0.5345384118786314, + "grad_norm": 0.5318306020135024, + "learning_rate": 1.999724869335883e-05, + "loss": 1.2087, + "step": 207 + }, + { + "epoch": 0.5371207230471272, + "grad_norm": 0.5577225248661078, + "learning_rate": 1.999680915887046e-05, + "loss": 1.1793, + "step": 208 + }, + { + "epoch": 0.5397030342156229, + "grad_norm": 0.5246527308813651, + "learning_rate": 1.9996337073445673e-05, + "loss": 1.1913, + "step": 209 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.5214804985936327, + "learning_rate": 1.9995832438621646e-05, + "loss": 1.2264, + "step": 210 + }, + { + "epoch": 0.5448676565526146, + "grad_norm": 0.5608653450795346, + "learning_rate": 1.9995295256041534e-05, + "loss": 1.2269, + "step": 211 + }, + { + "epoch": 0.5474499677211104, + "grad_norm": 0.516907670213089, + "learning_rate": 1.9994725527454476e-05, + "loss": 1.1963, + "step": 212 + }, + { + "epoch": 0.5500322788896062, + "grad_norm": 0.5315857426965214, + "learning_rate": 1.999412325471558e-05, + "loss": 1.2646, + "step": 213 + }, + { + "epoch": 0.552614590058102, + "grad_norm": 0.5281811102245024, + "learning_rate": 1.999348843978593e-05, + "loss": 1.2163, + "step": 214 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.5579573481195031, + "learning_rate": 1.9992821084732572e-05, + "loss": 1.2262, + "step": 215 + }, + { + "epoch": 0.5577792123950936, + "grad_norm": 0.5595800317478022, + "learning_rate": 1.9992121191728495e-05, + "loss": 1.1872, + "step": 216 + }, + { + "epoch": 0.5603615235635894, + "grad_norm": 0.5344069260281085, + "learning_rate": 1.9991388763052643e-05, + "loss": 1.2293, + "step": 217 + }, + { + "epoch": 0.5629438347320852, + "grad_norm": 0.549912373034949, + "learning_rate": 1.9990623801089908e-05, + "loss": 1.1958, + "step": 218 + }, + { + "epoch": 0.565526145900581, + "grad_norm": 0.5482064392691067, + "learning_rate": 1.9989826308331103e-05, + "loss": 1.249, + "step": 219 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.6017796916988709, + "learning_rate": 1.9988996287372967e-05, + "loss": 1.1591, + "step": 220 + }, + { + "epoch": 0.5706907682375726, + "grad_norm": 0.5616669264753023, + "learning_rate": 1.9988133740918167e-05, + "loss": 1.2029, + "step": 221 + }, + { + "epoch": 0.5732730794060684, + "grad_norm": 0.548070565923786, + "learning_rate": 1.998723867177526e-05, + "loss": 1.2113, + "step": 222 + }, + { + "epoch": 0.5758553905745643, + "grad_norm": 0.5579809466395939, + "learning_rate": 1.998631108285871e-05, + "loss": 1.1915, + "step": 223 + }, + { + "epoch": 0.5784377017430601, + "grad_norm": 0.5145023167575715, + "learning_rate": 1.9985350977188877e-05, + "loss": 1.2455, + "step": 224 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.5438281690868874, + "learning_rate": 1.998435835789199e-05, + "loss": 1.1996, + "step": 225 + }, + { + "epoch": 0.5836023240800516, + "grad_norm": 0.5062822089728274, + "learning_rate": 1.9983333228200145e-05, + "loss": 1.2267, + "step": 226 + }, + { + "epoch": 0.5861846352485475, + "grad_norm": 0.49455916062220207, + "learning_rate": 1.9982275591451304e-05, + "loss": 1.2234, + "step": 227 + }, + { + "epoch": 0.5887669464170433, + "grad_norm": 0.5230043234273385, + "learning_rate": 1.998118545108927e-05, + "loss": 1.2048, + "step": 228 + }, + { + "epoch": 0.591349257585539, + "grad_norm": 0.492411804242286, + "learning_rate": 1.998006281066369e-05, + "loss": 1.1567, + "step": 229 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.47969671152864785, + "learning_rate": 1.997890767383002e-05, + "loss": 1.1842, + "step": 230 + }, + { + "epoch": 0.5965138799225307, + "grad_norm": 0.5037463603875141, + "learning_rate": 1.9977720044349546e-05, + "loss": 1.2071, + "step": 231 + }, + { + "epoch": 0.5990961910910265, + "grad_norm": 0.5505833494748907, + "learning_rate": 1.997649992608935e-05, + "loss": 1.2215, + "step": 232 + }, + { + "epoch": 0.6016785022595222, + "grad_norm": 0.48300379957641953, + "learning_rate": 1.9975247323022286e-05, + "loss": 1.1522, + "step": 233 + }, + { + "epoch": 0.6042608134280181, + "grad_norm": 0.535926278201071, + "learning_rate": 1.9973962239227012e-05, + "loss": 1.187, + "step": 234 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.5053573787331682, + "learning_rate": 1.997264467888792e-05, + "loss": 1.2264, + "step": 235 + }, + { + "epoch": 0.6094254357650097, + "grad_norm": 0.5209920849371423, + "learning_rate": 1.9971294646295165e-05, + "loss": 1.1841, + "step": 236 + }, + { + "epoch": 0.6120077469335055, + "grad_norm": 0.540792437365881, + "learning_rate": 1.9969912145844633e-05, + "loss": 1.2543, + "step": 237 + }, + { + "epoch": 0.6145900581020013, + "grad_norm": 0.5212177041723032, + "learning_rate": 1.9968497182037926e-05, + "loss": 1.2561, + "step": 238 + }, + { + "epoch": 0.6171723692704971, + "grad_norm": 0.5304144527565653, + "learning_rate": 1.996704975948236e-05, + "loss": 1.1606, + "step": 239 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5829183549634611, + "learning_rate": 1.9965569882890924e-05, + "loss": 1.196, + "step": 240 + }, + { + "epoch": 0.6223369916074887, + "grad_norm": 0.5345144481383324, + "learning_rate": 1.99640575570823e-05, + "loss": 1.1859, + "step": 241 + }, + { + "epoch": 0.6249193027759845, + "grad_norm": 0.5221286963848386, + "learning_rate": 1.9962512786980825e-05, + "loss": 1.1715, + "step": 242 + }, + { + "epoch": 0.6275016139444803, + "grad_norm": 0.5058627006232616, + "learning_rate": 1.9960935577616466e-05, + "loss": 1.1821, + "step": 243 + }, + { + "epoch": 0.6300839251129761, + "grad_norm": 0.5199433840698656, + "learning_rate": 1.9959325934124833e-05, + "loss": 1.1953, + "step": 244 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5096245762495348, + "learning_rate": 1.9957683861747137e-05, + "loss": 1.1775, + "step": 245 + }, + { + "epoch": 0.6352485474499677, + "grad_norm": 0.491816602747724, + "learning_rate": 1.995600936583018e-05, + "loss": 1.1965, + "step": 246 + }, + { + "epoch": 0.6378308586184636, + "grad_norm": 0.539351334033109, + "learning_rate": 1.9954302451826343e-05, + "loss": 1.1902, + "step": 247 + }, + { + "epoch": 0.6404131697869593, + "grad_norm": 0.5695500043236047, + "learning_rate": 1.9952563125293572e-05, + "loss": 1.1805, + "step": 248 + }, + { + "epoch": 0.6429954809554551, + "grad_norm": 0.5279649835109442, + "learning_rate": 1.9950791391895335e-05, + "loss": 1.1736, + "step": 249 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5082701944757544, + "learning_rate": 1.9948987257400637e-05, + "loss": 1.2334, + "step": 250 + }, + { + "epoch": 0.6481601032924468, + "grad_norm": 0.5482093109063493, + "learning_rate": 1.994715072768398e-05, + "loss": 1.1802, + "step": 251 + }, + { + "epoch": 0.6507424144609425, + "grad_norm": 0.50876456990971, + "learning_rate": 1.9945281808725342e-05, + "loss": 1.2399, + "step": 252 + }, + { + "epoch": 0.6533247256294383, + "grad_norm": 0.5541690057371409, + "learning_rate": 1.9943380506610177e-05, + "loss": 1.1826, + "step": 253 + }, + { + "epoch": 0.6559070367979342, + "grad_norm": 0.503268686466964, + "learning_rate": 1.9941446827529374e-05, + "loss": 1.1959, + "step": 254 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5437511832970794, + "learning_rate": 1.993948077777925e-05, + "loss": 1.1953, + "step": 255 + }, + { + "epoch": 0.6610716591349257, + "grad_norm": 0.4781802895283609, + "learning_rate": 1.9937482363761522e-05, + "loss": 1.1989, + "step": 256 + }, + { + "epoch": 0.6636539703034215, + "grad_norm": 0.5449342716639853, + "learning_rate": 1.9935451591983292e-05, + "loss": 1.2134, + "step": 257 + }, + { + "epoch": 0.6662362814719174, + "grad_norm": 0.5062143579422912, + "learning_rate": 1.9933388469057026e-05, + "loss": 1.2243, + "step": 258 + }, + { + "epoch": 0.6688185926404132, + "grad_norm": 0.47503990693298725, + "learning_rate": 1.9931293001700518e-05, + "loss": 1.1859, + "step": 259 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.5208679076366871, + "learning_rate": 1.9929165196736893e-05, + "loss": 1.1658, + "step": 260 + }, + { + "epoch": 0.6739832149774048, + "grad_norm": 0.5159438722195342, + "learning_rate": 1.9927005061094563e-05, + "loss": 1.1943, + "step": 261 + }, + { + "epoch": 0.6765655261459006, + "grad_norm": 0.51564921169161, + "learning_rate": 1.992481260180722e-05, + "loss": 1.2096, + "step": 262 + }, + { + "epoch": 0.6791478373143964, + "grad_norm": 0.5272888613016128, + "learning_rate": 1.99225878260138e-05, + "loss": 1.2247, + "step": 263 + }, + { + "epoch": 0.6817301484828922, + "grad_norm": 0.4937662552334185, + "learning_rate": 1.992033074095847e-05, + "loss": 1.1959, + "step": 264 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.5156120724948244, + "learning_rate": 1.9918041353990593e-05, + "loss": 1.1896, + "step": 265 + }, + { + "epoch": 0.6868947708198838, + "grad_norm": 0.5029899107771948, + "learning_rate": 1.9915719672564724e-05, + "loss": 1.2029, + "step": 266 + }, + { + "epoch": 0.6894770819883796, + "grad_norm": 0.5021013897512888, + "learning_rate": 1.9913365704240562e-05, + "loss": 1.2001, + "step": 267 + }, + { + "epoch": 0.6920593931568754, + "grad_norm": 0.4874570592953185, + "learning_rate": 1.9910979456682935e-05, + "loss": 1.1909, + "step": 268 + }, + { + "epoch": 0.6946417043253712, + "grad_norm": 0.49498289469426227, + "learning_rate": 1.990856093766179e-05, + "loss": 1.1823, + "step": 269 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.493423453437657, + "learning_rate": 1.9906110155052142e-05, + "loss": 1.226, + "step": 270 + }, + { + "epoch": 0.6998063266623629, + "grad_norm": 0.5122671842264414, + "learning_rate": 1.9903627116834064e-05, + "loss": 1.1651, + "step": 271 + }, + { + "epoch": 0.7023886378308586, + "grad_norm": 0.4937702040231816, + "learning_rate": 1.990111183109266e-05, + "loss": 1.1902, + "step": 272 + }, + { + "epoch": 0.7049709489993544, + "grad_norm": 0.5136007768359044, + "learning_rate": 1.989856430601803e-05, + "loss": 1.1999, + "step": 273 + }, + { + "epoch": 0.7075532601678503, + "grad_norm": 0.49202497646323023, + "learning_rate": 1.9895984549905255e-05, + "loss": 1.1814, + "step": 274 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.5009139995620645, + "learning_rate": 1.9893372571154362e-05, + "loss": 1.19, + "step": 275 + }, + { + "epoch": 0.7127178825048418, + "grad_norm": 0.5193484363230418, + "learning_rate": 1.9890728378270304e-05, + "loss": 1.2066, + "step": 276 + }, + { + "epoch": 0.7153001936733376, + "grad_norm": 0.49927361128296144, + "learning_rate": 1.9888051979862922e-05, + "loss": 1.2064, + "step": 277 + }, + { + "epoch": 0.7178825048418335, + "grad_norm": 0.48996929520751215, + "learning_rate": 1.988534338464692e-05, + "loss": 1.1653, + "step": 278 + }, + { + "epoch": 0.7204648160103292, + "grad_norm": 0.499684088085507, + "learning_rate": 1.988260260144185e-05, + "loss": 1.1654, + "step": 279 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.48422890349549536, + "learning_rate": 1.987982963917206e-05, + "loss": 1.1554, + "step": 280 + }, + { + "epoch": 0.7256294383473209, + "grad_norm": 0.49199667307495154, + "learning_rate": 1.987702450686669e-05, + "loss": 1.1908, + "step": 281 + }, + { + "epoch": 0.7282117495158167, + "grad_norm": 0.4852284035856874, + "learning_rate": 1.9874187213659614e-05, + "loss": 1.1367, + "step": 282 + }, + { + "epoch": 0.7307940606843124, + "grad_norm": 0.49745524250401135, + "learning_rate": 1.987131776878944e-05, + "loss": 1.1801, + "step": 283 + }, + { + "epoch": 0.7333763718528082, + "grad_norm": 0.49039290208544745, + "learning_rate": 1.986841618159946e-05, + "loss": 1.1691, + "step": 284 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.4905834691968128, + "learning_rate": 1.986548246153763e-05, + "loss": 1.1752, + "step": 285 + }, + { + "epoch": 0.7385409941897999, + "grad_norm": 0.490506332640748, + "learning_rate": 1.9862516618156526e-05, + "loss": 1.1883, + "step": 286 + }, + { + "epoch": 0.7411233053582956, + "grad_norm": 0.5248431755020703, + "learning_rate": 1.9859518661113326e-05, + "loss": 1.205, + "step": 287 + }, + { + "epoch": 0.7437056165267915, + "grad_norm": 0.5057560855422101, + "learning_rate": 1.9856488600169785e-05, + "loss": 1.2279, + "step": 288 + }, + { + "epoch": 0.7462879276952873, + "grad_norm": 0.5025298659401831, + "learning_rate": 1.9853426445192175e-05, + "loss": 1.1631, + "step": 289 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.4851276896544048, + "learning_rate": 1.9850332206151285e-05, + "loss": 1.1626, + "step": 290 + }, + { + "epoch": 0.7514525500322788, + "grad_norm": 0.48026264033577865, + "learning_rate": 1.984720589312236e-05, + "loss": 1.2098, + "step": 291 + }, + { + "epoch": 0.7540348612007747, + "grad_norm": 0.5405394737905861, + "learning_rate": 1.9844047516285098e-05, + "loss": 1.2298, + "step": 292 + }, + { + "epoch": 0.7566171723692705, + "grad_norm": 0.48769934414927935, + "learning_rate": 1.9840857085923585e-05, + "loss": 1.196, + "step": 293 + }, + { + "epoch": 0.7591994835377663, + "grad_norm": 0.527078245348908, + "learning_rate": 1.9837634612426292e-05, + "loss": 1.1832, + "step": 294 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.47968694451872484, + "learning_rate": 1.983438010628602e-05, + "loss": 1.176, + "step": 295 + }, + { + "epoch": 0.7643641058747579, + "grad_norm": 0.5316988367330956, + "learning_rate": 1.9831093578099866e-05, + "loss": 1.215, + "step": 296 + }, + { + "epoch": 0.7669464170432537, + "grad_norm": 0.4839737165293382, + "learning_rate": 1.9827775038569203e-05, + "loss": 1.1483, + "step": 297 + }, + { + "epoch": 0.7695287282117496, + "grad_norm": 0.47298182478673473, + "learning_rate": 1.9824424498499644e-05, + "loss": 1.138, + "step": 298 + }, + { + "epoch": 0.7721110393802453, + "grad_norm": 0.5037524314864462, + "learning_rate": 1.9821041968800982e-05, + "loss": 1.1906, + "step": 299 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.4726414066930357, + "learning_rate": 1.981762746048719e-05, + "loss": 1.1872, + "step": 300 + }, + { + "epoch": 0.7772756617172369, + "grad_norm": 0.4775555799653867, + "learning_rate": 1.9814180984676353e-05, + "loss": 1.1741, + "step": 301 + }, + { + "epoch": 0.7798579728857328, + "grad_norm": 0.48936767771418854, + "learning_rate": 1.981070255259066e-05, + "loss": 1.1687, + "step": 302 + }, + { + "epoch": 0.7824402840542285, + "grad_norm": 0.5010983431343674, + "learning_rate": 1.9807192175556344e-05, + "loss": 1.1563, + "step": 303 + }, + { + "epoch": 0.7850225952227243, + "grad_norm": 0.49317307520583575, + "learning_rate": 1.9803649865003658e-05, + "loss": 1.1831, + "step": 304 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.5072717596433491, + "learning_rate": 1.9800075632466832e-05, + "loss": 1.1795, + "step": 305 + }, + { + "epoch": 0.790187217559716, + "grad_norm": 0.5044246633386273, + "learning_rate": 1.979646948958405e-05, + "loss": 1.1985, + "step": 306 + }, + { + "epoch": 0.7927695287282117, + "grad_norm": 0.5075154181174161, + "learning_rate": 1.979283144809738e-05, + "loss": 1.1955, + "step": 307 + }, + { + "epoch": 0.7953518398967075, + "grad_norm": 0.5199713188120779, + "learning_rate": 1.9789161519852777e-05, + "loss": 1.2114, + "step": 308 + }, + { + "epoch": 0.7979341510652034, + "grad_norm": 0.5261480930927327, + "learning_rate": 1.9785459716800005e-05, + "loss": 1.1582, + "step": 309 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5271373406623951, + "learning_rate": 1.978172605099264e-05, + "loss": 1.1761, + "step": 310 + }, + { + "epoch": 0.8030987734021949, + "grad_norm": 0.5065930895833843, + "learning_rate": 1.9777960534587975e-05, + "loss": 1.1915, + "step": 311 + }, + { + "epoch": 0.8056810845706908, + "grad_norm": 0.5171670807714366, + "learning_rate": 1.9774163179847046e-05, + "loss": 1.1776, + "step": 312 + }, + { + "epoch": 0.8082633957391866, + "grad_norm": 0.49429669397671067, + "learning_rate": 1.9770333999134538e-05, + "loss": 1.2005, + "step": 313 + }, + { + "epoch": 0.8108457069076824, + "grad_norm": 0.506264732133423, + "learning_rate": 1.976647300491877e-05, + "loss": 1.1555, + "step": 314 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.4975188714683081, + "learning_rate": 1.9762580209771648e-05, + "loss": 1.1761, + "step": 315 + }, + { + "epoch": 0.816010329244674, + "grad_norm": 0.4917712576784755, + "learning_rate": 1.9758655626368635e-05, + "loss": 1.1769, + "step": 316 + }, + { + "epoch": 0.8185926404131698, + "grad_norm": 0.4954187885530158, + "learning_rate": 1.975469926748869e-05, + "loss": 1.169, + "step": 317 + }, + { + "epoch": 0.8211749515816655, + "grad_norm": 0.5115626137899685, + "learning_rate": 1.9750711146014254e-05, + "loss": 1.1737, + "step": 318 + }, + { + "epoch": 0.8237572627501614, + "grad_norm": 0.4748616428095463, + "learning_rate": 1.9746691274931168e-05, + "loss": 1.19, + "step": 319 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.4940492043780541, + "learning_rate": 1.9742639667328666e-05, + "loss": 1.1761, + "step": 320 + }, + { + "epoch": 0.828921885087153, + "grad_norm": 0.5422433717627166, + "learning_rate": 1.9738556336399322e-05, + "loss": 1.1573, + "step": 321 + }, + { + "epoch": 0.8315041962556488, + "grad_norm": 0.5003074749343184, + "learning_rate": 1.9734441295439004e-05, + "loss": 1.1777, + "step": 322 + }, + { + "epoch": 0.8340865074241446, + "grad_norm": 0.4892259502271507, + "learning_rate": 1.973029455784683e-05, + "loss": 1.1696, + "step": 323 + }, + { + "epoch": 0.8366688185926404, + "grad_norm": 0.5321024468246395, + "learning_rate": 1.9726116137125128e-05, + "loss": 1.1436, + "step": 324 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.5045646201138196, + "learning_rate": 1.9721906046879392e-05, + "loss": 1.1764, + "step": 325 + }, + { + "epoch": 0.841833440929632, + "grad_norm": 0.5281769148500022, + "learning_rate": 1.971766430081823e-05, + "loss": 1.1966, + "step": 326 + }, + { + "epoch": 0.8444157520981278, + "grad_norm": 0.5202036284793085, + "learning_rate": 1.971339091275333e-05, + "loss": 1.1929, + "step": 327 + }, + { + "epoch": 0.8469980632666236, + "grad_norm": 0.48805338005531557, + "learning_rate": 1.9709085896599414e-05, + "loss": 1.1713, + "step": 328 + }, + { + "epoch": 0.8495803744351195, + "grad_norm": 0.48834281779827204, + "learning_rate": 1.970474926637418e-05, + "loss": 1.1766, + "step": 329 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.47045086079257925, + "learning_rate": 1.9700381036198278e-05, + "loss": 1.1733, + "step": 330 + }, + { + "epoch": 0.854744996772111, + "grad_norm": 0.5046592134608324, + "learning_rate": 1.9695981220295242e-05, + "loss": 1.2065, + "step": 331 + }, + { + "epoch": 0.8573273079406069, + "grad_norm": 0.4815780307147996, + "learning_rate": 1.9691549832991455e-05, + "loss": 1.1641, + "step": 332 + }, + { + "epoch": 0.8599096191091027, + "grad_norm": 0.47306399270721433, + "learning_rate": 1.96870868887161e-05, + "loss": 1.2106, + "step": 333 + }, + { + "epoch": 0.8624919302775984, + "grad_norm": 0.48177902227189173, + "learning_rate": 1.968259240200112e-05, + "loss": 1.1867, + "step": 334 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.5109266879563596, + "learning_rate": 1.967806638748116e-05, + "loss": 1.1835, + "step": 335 + }, + { + "epoch": 0.8676565526145901, + "grad_norm": 0.4886245722535642, + "learning_rate": 1.9673508859893515e-05, + "loss": 1.1687, + "step": 336 + }, + { + "epoch": 0.8702388637830859, + "grad_norm": 0.5248768069436597, + "learning_rate": 1.966891983407811e-05, + "loss": 1.1984, + "step": 337 + }, + { + "epoch": 0.8728211749515816, + "grad_norm": 0.4943514825391058, + "learning_rate": 1.9664299324977412e-05, + "loss": 1.1891, + "step": 338 + }, + { + "epoch": 0.8754034861200775, + "grad_norm": 0.47129413485615096, + "learning_rate": 1.9659647347636422e-05, + "loss": 1.1586, + "step": 339 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.49462243772822845, + "learning_rate": 1.9654963917202586e-05, + "loss": 1.1558, + "step": 340 + }, + { + "epoch": 0.8805681084570691, + "grad_norm": 0.4861824324614408, + "learning_rate": 1.965024904892578e-05, + "loss": 1.1683, + "step": 341 + }, + { + "epoch": 0.8831504196255648, + "grad_norm": 0.4951569265192093, + "learning_rate": 1.9645502758158234e-05, + "loss": 1.2037, + "step": 342 + }, + { + "epoch": 0.8857327307940607, + "grad_norm": 0.46700115170322853, + "learning_rate": 1.9640725060354508e-05, + "loss": 1.1142, + "step": 343 + }, + { + "epoch": 0.8883150419625565, + "grad_norm": 0.5117387702694532, + "learning_rate": 1.963591597107142e-05, + "loss": 1.1944, + "step": 344 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.46783399641185575, + "learning_rate": 1.9631075505967993e-05, + "loss": 1.1802, + "step": 345 + }, + { + "epoch": 0.8934796642995481, + "grad_norm": 0.49795421380846105, + "learning_rate": 1.9626203680805432e-05, + "loss": 1.1814, + "step": 346 + }, + { + "epoch": 0.8960619754680439, + "grad_norm": 0.4861596444598828, + "learning_rate": 1.9621300511447043e-05, + "loss": 1.1825, + "step": 347 + }, + { + "epoch": 0.8986442866365397, + "grad_norm": 0.4953435099828471, + "learning_rate": 1.9616366013858195e-05, + "loss": 1.161, + "step": 348 + }, + { + "epoch": 0.9012265978050356, + "grad_norm": 0.5023878227402143, + "learning_rate": 1.961140020410627e-05, + "loss": 1.1885, + "step": 349 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5007001061995038, + "learning_rate": 1.9606403098360597e-05, + "loss": 1.1989, + "step": 350 + }, + { + "epoch": 0.9063912201420271, + "grad_norm": 0.4635439197533225, + "learning_rate": 1.960137471289242e-05, + "loss": 1.1302, + "step": 351 + }, + { + "epoch": 0.9089735313105229, + "grad_norm": 0.49621446281583526, + "learning_rate": 1.9596315064074826e-05, + "loss": 1.1991, + "step": 352 + }, + { + "epoch": 0.9115558424790188, + "grad_norm": 0.48901012014264644, + "learning_rate": 1.9591224168382708e-05, + "loss": 1.1818, + "step": 353 + }, + { + "epoch": 0.9141381536475145, + "grad_norm": 0.4823284694023606, + "learning_rate": 1.958610204239269e-05, + "loss": 1.1464, + "step": 354 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.47062926472993427, + "learning_rate": 1.95809487027831e-05, + "loss": 1.1966, + "step": 355 + }, + { + "epoch": 0.9193027759845062, + "grad_norm": 0.4766355752313664, + "learning_rate": 1.9575764166333887e-05, + "loss": 1.1741, + "step": 356 + }, + { + "epoch": 0.921885087153002, + "grad_norm": 0.470588804103096, + "learning_rate": 1.95705484499266e-05, + "loss": 1.1425, + "step": 357 + }, + { + "epoch": 0.9244673983214977, + "grad_norm": 0.49029399234060356, + "learning_rate": 1.9565301570544297e-05, + "loss": 1.19, + "step": 358 + }, + { + "epoch": 0.9270497094899935, + "grad_norm": 0.4811321660417228, + "learning_rate": 1.9560023545271512e-05, + "loss": 1.1617, + "step": 359 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.47791045452781805, + "learning_rate": 1.9554714391294198e-05, + "loss": 1.1349, + "step": 360 + }, + { + "epoch": 0.9322143318269851, + "grad_norm": 0.45457900915984395, + "learning_rate": 1.9549374125899665e-05, + "loss": 1.1697, + "step": 361 + }, + { + "epoch": 0.9347966429954809, + "grad_norm": 0.4783672298922519, + "learning_rate": 1.9544002766476523e-05, + "loss": 1.1779, + "step": 362 + }, + { + "epoch": 0.9373789541639768, + "grad_norm": 0.5043177366580075, + "learning_rate": 1.953860033051463e-05, + "loss": 1.16, + "step": 363 + }, + { + "epoch": 0.9399612653324726, + "grad_norm": 0.5253476046397387, + "learning_rate": 1.953316683560504e-05, + "loss": 1.2074, + "step": 364 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.4830105807169101, + "learning_rate": 1.9527702299439925e-05, + "loss": 1.1598, + "step": 365 + }, + { + "epoch": 0.9451258876694641, + "grad_norm": 0.4603907599438401, + "learning_rate": 1.9522206739812546e-05, + "loss": 1.1511, + "step": 366 + }, + { + "epoch": 0.94770819883796, + "grad_norm": 0.45849247679852523, + "learning_rate": 1.9516680174617168e-05, + "loss": 1.1873, + "step": 367 + }, + { + "epoch": 0.9502905100064558, + "grad_norm": 0.4826500481926017, + "learning_rate": 1.9511122621849025e-05, + "loss": 1.187, + "step": 368 + }, + { + "epoch": 0.9528728211749515, + "grad_norm": 0.4577162756249727, + "learning_rate": 1.9505534099604245e-05, + "loss": 1.1611, + "step": 369 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.4674250946156371, + "learning_rate": 1.94999146260798e-05, + "loss": 1.164, + "step": 370 + }, + { + "epoch": 0.9580374435119432, + "grad_norm": 0.46337504643140265, + "learning_rate": 1.9494264219573433e-05, + "loss": 1.1898, + "step": 371 + }, + { + "epoch": 0.960619754680439, + "grad_norm": 0.48205345839710323, + "learning_rate": 1.9488582898483625e-05, + "loss": 1.1641, + "step": 372 + }, + { + "epoch": 0.9632020658489348, + "grad_norm": 0.4927156506373219, + "learning_rate": 1.9482870681309502e-05, + "loss": 1.1526, + "step": 373 + }, + { + "epoch": 0.9657843770174306, + "grad_norm": 0.49774685339253233, + "learning_rate": 1.9477127586650812e-05, + "loss": 1.1513, + "step": 374 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5493094035957639, + "learning_rate": 1.9471353633207824e-05, + "loss": 1.2067, + "step": 375 + }, + { + "epoch": 0.9709489993544222, + "grad_norm": 0.4695400511675006, + "learning_rate": 1.94655488397813e-05, + "loss": 1.1575, + "step": 376 + }, + { + "epoch": 0.973531310522918, + "grad_norm": 0.5159667326407505, + "learning_rate": 1.9459713225272422e-05, + "loss": 1.1785, + "step": 377 + }, + { + "epoch": 0.9761136216914138, + "grad_norm": 0.5216341313782504, + "learning_rate": 1.9453846808682713e-05, + "loss": 1.1446, + "step": 378 + }, + { + "epoch": 0.9786959328599096, + "grad_norm": 0.5182964074330236, + "learning_rate": 1.9447949609114018e-05, + "loss": 1.1432, + "step": 379 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.485083144408532, + "learning_rate": 1.9442021645768383e-05, + "loss": 1.1275, + "step": 380 + }, + { + "epoch": 0.9838605551969012, + "grad_norm": 0.4599544335039025, + "learning_rate": 1.9436062937948058e-05, + "loss": 1.1151, + "step": 381 + }, + { + "epoch": 0.986442866365397, + "grad_norm": 0.521054277469576, + "learning_rate": 1.943007350505538e-05, + "loss": 1.2012, + "step": 382 + }, + { + "epoch": 0.9890251775338929, + "grad_norm": 0.4751977247155287, + "learning_rate": 1.942405336659274e-05, + "loss": 1.1797, + "step": 383 + }, + { + "epoch": 0.9916074887023887, + "grad_norm": 0.47658638371046963, + "learning_rate": 1.94180025421625e-05, + "loss": 1.1392, + "step": 384 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.4712331278213858, + "learning_rate": 1.9411921051466952e-05, + "loss": 1.141, + "step": 385 + }, + { + "epoch": 0.9967721110393802, + "grad_norm": 0.49755103398259565, + "learning_rate": 1.9405808914308236e-05, + "loss": 1.1328, + "step": 386 + }, + { + "epoch": 0.9993544222078761, + "grad_norm": 0.4847898684583998, + "learning_rate": 1.9399666150588286e-05, + "loss": 1.1669, + "step": 387 + }, + { + "epoch": 1.0, + "grad_norm": 0.4847898684583998, + "learning_rate": 1.9393492780308745e-05, + "loss": 1.1861, + "step": 388 + }, + { + "epoch": 1.0025823111684957, + "grad_norm": 1.1014826257650279, + "learning_rate": 1.938728882357093e-05, + "loss": 1.0917, + "step": 389 + }, + { + "epoch": 1.0051646223369917, + "grad_norm": 0.5639388688605913, + "learning_rate": 1.938105430057575e-05, + "loss": 1.0903, + "step": 390 + }, + { + "epoch": 1.0077469335054874, + "grad_norm": 0.6443566019067333, + "learning_rate": 1.9374789231623636e-05, + "loss": 1.1009, + "step": 391 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.630435005940545, + "learning_rate": 1.9368493637114483e-05, + "loss": 1.1003, + "step": 392 + }, + { + "epoch": 1.012911555842479, + "grad_norm": 0.5108574939050173, + "learning_rate": 1.936216753754758e-05, + "loss": 1.0711, + "step": 393 + }, + { + "epoch": 1.0154938670109748, + "grad_norm": 0.608458106332031, + "learning_rate": 1.9355810953521556e-05, + "loss": 1.0595, + "step": 394 + }, + { + "epoch": 1.0180761781794707, + "grad_norm": 0.5458550950072169, + "learning_rate": 1.934942390573428e-05, + "loss": 1.0943, + "step": 395 + }, + { + "epoch": 1.0206584893479664, + "grad_norm": 0.5725197447406076, + "learning_rate": 1.9343006414982827e-05, + "loss": 1.0715, + "step": 396 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5707836595715814, + "learning_rate": 1.9336558502163404e-05, + "loss": 1.0845, + "step": 397 + }, + { + "epoch": 1.025823111684958, + "grad_norm": 0.5436343954371967, + "learning_rate": 1.933008018827127e-05, + "loss": 1.1252, + "step": 398 + }, + { + "epoch": 1.0284054228534538, + "grad_norm": 0.5722847310225982, + "learning_rate": 1.932357149440067e-05, + "loss": 1.0698, + "step": 399 + }, + { + "epoch": 1.0309877340219495, + "grad_norm": 0.5620996777465161, + "learning_rate": 1.9317032441744778e-05, + "loss": 1.0999, + "step": 400 + }, + { + "epoch": 1.0335700451904455, + "grad_norm": 0.6088252142123551, + "learning_rate": 1.9310463051595612e-05, + "loss": 1.1462, + "step": 401 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5076462836433528, + "learning_rate": 1.9303863345343985e-05, + "loss": 1.0755, + "step": 402 + }, + { + "epoch": 1.0387346675274371, + "grad_norm": 0.5109910409402665, + "learning_rate": 1.929723334447941e-05, + "loss": 1.0821, + "step": 403 + }, + { + "epoch": 1.0413169786959329, + "grad_norm": 0.5636026180024839, + "learning_rate": 1.9290573070590053e-05, + "loss": 1.079, + "step": 404 + }, + { + "epoch": 1.0438992898644286, + "grad_norm": 0.5112896124943734, + "learning_rate": 1.9283882545362642e-05, + "loss": 1.104, + "step": 405 + }, + { + "epoch": 1.0464816010329245, + "grad_norm": 0.5515513629476969, + "learning_rate": 1.9277161790582425e-05, + "loss": 1.094, + "step": 406 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.514886116372971, + "learning_rate": 1.9270410828133062e-05, + "loss": 1.085, + "step": 407 + }, + { + "epoch": 1.051646223369916, + "grad_norm": 0.5331151636377134, + "learning_rate": 1.9263629679996582e-05, + "loss": 1.1028, + "step": 408 + }, + { + "epoch": 1.054228534538412, + "grad_norm": 0.5165475698281132, + "learning_rate": 1.925681836825331e-05, + "loss": 1.0848, + "step": 409 + }, + { + "epoch": 1.0568108457069076, + "grad_norm": 0.5366595294372662, + "learning_rate": 1.9249976915081773e-05, + "loss": 1.1015, + "step": 410 + }, + { + "epoch": 1.0593931568754036, + "grad_norm": 0.5193038464689262, + "learning_rate": 1.9243105342758657e-05, + "loss": 1.0782, + "step": 411 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5009182548303415, + "learning_rate": 1.923620367365871e-05, + "loss": 1.0516, + "step": 412 + }, + { + "epoch": 1.064557779212395, + "grad_norm": 0.5029748452840229, + "learning_rate": 1.922927193025468e-05, + "loss": 1.0781, + "step": 413 + }, + { + "epoch": 1.067140090380891, + "grad_norm": 0.5196114525480685, + "learning_rate": 1.922231013511724e-05, + "loss": 1.0873, + "step": 414 + }, + { + "epoch": 1.0697224015493867, + "grad_norm": 0.5294324303655081, + "learning_rate": 1.921531831091492e-05, + "loss": 1.0955, + "step": 415 + }, + { + "epoch": 1.0723047127178824, + "grad_norm": 0.5264181862085819, + "learning_rate": 1.9208296480414034e-05, + "loss": 1.0849, + "step": 416 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5072255528456242, + "learning_rate": 1.9201244666478586e-05, + "loss": 1.0865, + "step": 417 + }, + { + "epoch": 1.077469335054874, + "grad_norm": 0.5043454850061722, + "learning_rate": 1.919416289207022e-05, + "loss": 1.1016, + "step": 418 + }, + { + "epoch": 1.08005164622337, + "grad_norm": 0.521923847407557, + "learning_rate": 1.9187051180248134e-05, + "loss": 1.1006, + "step": 419 + }, + { + "epoch": 1.0826339573918657, + "grad_norm": 0.5106595879796177, + "learning_rate": 1.9179909554169002e-05, + "loss": 1.0947, + "step": 420 + }, + { + "epoch": 1.0852162685603615, + "grad_norm": 0.5021486916963932, + "learning_rate": 1.9172738037086905e-05, + "loss": 1.0763, + "step": 421 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5348145469187987, + "learning_rate": 1.9165536652353256e-05, + "loss": 1.1169, + "step": 422 + }, + { + "epoch": 1.0903808908973531, + "grad_norm": 0.49988703400762524, + "learning_rate": 1.915830542341672e-05, + "loss": 1.1116, + "step": 423 + }, + { + "epoch": 1.0929632020658488, + "grad_norm": 0.5234949485004682, + "learning_rate": 1.915104437382313e-05, + "loss": 1.088, + "step": 424 + }, + { + "epoch": 1.0955455132343448, + "grad_norm": 0.5079748564445703, + "learning_rate": 1.9143753527215437e-05, + "loss": 1.0716, + "step": 425 + }, + { + "epoch": 1.0981278244028405, + "grad_norm": 0.5084741143233746, + "learning_rate": 1.91364329073336e-05, + "loss": 1.0913, + "step": 426 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.5110683137296181, + "learning_rate": 1.912908253801453e-05, + "loss": 1.0376, + "step": 427 + }, + { + "epoch": 1.1032924467398322, + "grad_norm": 0.5104932914801271, + "learning_rate": 1.9121702443191994e-05, + "loss": 1.0499, + "step": 428 + }, + { + "epoch": 1.105874757908328, + "grad_norm": 0.5311882952530959, + "learning_rate": 1.9114292646896574e-05, + "loss": 1.0875, + "step": 429 + }, + { + "epoch": 1.1084570690768238, + "grad_norm": 0.4879682050554302, + "learning_rate": 1.910685317325554e-05, + "loss": 1.0851, + "step": 430 + }, + { + "epoch": 1.1110393802453196, + "grad_norm": 0.5107417160978968, + "learning_rate": 1.9099384046492807e-05, + "loss": 1.1234, + "step": 431 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5323164257143238, + "learning_rate": 1.9091885290928846e-05, + "loss": 1.1051, + "step": 432 + }, + { + "epoch": 1.1162040025823112, + "grad_norm": 0.5062938072356562, + "learning_rate": 1.9084356930980593e-05, + "loss": 1.1316, + "step": 433 + }, + { + "epoch": 1.118786313750807, + "grad_norm": 0.5155486221780639, + "learning_rate": 1.9076798991161395e-05, + "loss": 1.1078, + "step": 434 + }, + { + "epoch": 1.121368624919303, + "grad_norm": 0.5225888914215889, + "learning_rate": 1.90692114960809e-05, + "loss": 1.0605, + "step": 435 + }, + { + "epoch": 1.1239509360877986, + "grad_norm": 0.5218586332404057, + "learning_rate": 1.9061594470445e-05, + "loss": 1.0544, + "step": 436 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5063750610874502, + "learning_rate": 1.9053947939055737e-05, + "loss": 1.1166, + "step": 437 + }, + { + "epoch": 1.1291155584247903, + "grad_norm": 0.5243484649411593, + "learning_rate": 1.9046271926811238e-05, + "loss": 1.0726, + "step": 438 + }, + { + "epoch": 1.131697869593286, + "grad_norm": 0.5495963106537264, + "learning_rate": 1.9038566458705615e-05, + "loss": 1.1032, + "step": 439 + }, + { + "epoch": 1.1342801807617817, + "grad_norm": 0.4964414774064479, + "learning_rate": 1.903083155982889e-05, + "loss": 1.1079, + "step": 440 + }, + { + "epoch": 1.1368624919302777, + "grad_norm": 0.5147288498020974, + "learning_rate": 1.902306725536692e-05, + "loss": 1.1177, + "step": 441 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.5327839820642644, + "learning_rate": 1.9015273570601316e-05, + "loss": 1.075, + "step": 442 + }, + { + "epoch": 1.142027114267269, + "grad_norm": 0.528940491782197, + "learning_rate": 1.9007450530909345e-05, + "loss": 1.086, + "step": 443 + }, + { + "epoch": 1.144609425435765, + "grad_norm": 0.5495937382324539, + "learning_rate": 1.899959816176386e-05, + "loss": 1.0836, + "step": 444 + }, + { + "epoch": 1.1471917366042608, + "grad_norm": 0.5259398393873413, + "learning_rate": 1.899171648873322e-05, + "loss": 1.0777, + "step": 445 + }, + { + "epoch": 1.1497740477727567, + "grad_norm": 0.5301129579516534, + "learning_rate": 1.8983805537481196e-05, + "loss": 1.072, + "step": 446 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5472484501403468, + "learning_rate": 1.8975865333766895e-05, + "loss": 1.053, + "step": 447 + }, + { + "epoch": 1.1549386701097482, + "grad_norm": 0.521949307121031, + "learning_rate": 1.8967895903444672e-05, + "loss": 1.0818, + "step": 448 + }, + { + "epoch": 1.157520981278244, + "grad_norm": 0.5182353015878332, + "learning_rate": 1.895989727246405e-05, + "loss": 1.1152, + "step": 449 + }, + { + "epoch": 1.1601032924467398, + "grad_norm": 0.5214919591138573, + "learning_rate": 1.895186946686964e-05, + "loss": 1.1376, + "step": 450 + }, + { + "epoch": 1.1626856036152355, + "grad_norm": 0.5173501815788256, + "learning_rate": 1.8943812512801023e-05, + "loss": 1.1067, + "step": 451 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5017112933282827, + "learning_rate": 1.8935726436492724e-05, + "loss": 1.0956, + "step": 452 + }, + { + "epoch": 1.1678502259522272, + "grad_norm": 0.5054557050103587, + "learning_rate": 1.8927611264274078e-05, + "loss": 1.0743, + "step": 453 + }, + { + "epoch": 1.170432537120723, + "grad_norm": 0.5038866398853414, + "learning_rate": 1.8919467022569163e-05, + "loss": 1.0663, + "step": 454 + }, + { + "epoch": 1.1730148482892189, + "grad_norm": 0.5069505760103037, + "learning_rate": 1.8911293737896706e-05, + "loss": 1.0918, + "step": 455 + }, + { + "epoch": 1.1755971594577146, + "grad_norm": 0.49669266787942645, + "learning_rate": 1.890309143687001e-05, + "loss": 1.0768, + "step": 456 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.5059342818530757, + "learning_rate": 1.8894860146196848e-05, + "loss": 1.0606, + "step": 457 + }, + { + "epoch": 1.1807617817947063, + "grad_norm": 0.49213032174231053, + "learning_rate": 1.88865998926794e-05, + "loss": 1.0538, + "step": 458 + }, + { + "epoch": 1.183344092963202, + "grad_norm": 0.497950829610666, + "learning_rate": 1.8878310703214148e-05, + "loss": 1.0797, + "step": 459 + }, + { + "epoch": 1.185926404131698, + "grad_norm": 0.5085396242287622, + "learning_rate": 1.8869992604791786e-05, + "loss": 1.0652, + "step": 460 + }, + { + "epoch": 1.1885087153001936, + "grad_norm": 0.5050911514318284, + "learning_rate": 1.8861645624497154e-05, + "loss": 1.0744, + "step": 461 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.518765301787044, + "learning_rate": 1.885326978950912e-05, + "loss": 1.1027, + "step": 462 + }, + { + "epoch": 1.1936733376371853, + "grad_norm": 0.49638458680440195, + "learning_rate": 1.8844865127100517e-05, + "loss": 1.0804, + "step": 463 + }, + { + "epoch": 1.196255648805681, + "grad_norm": 0.5183830709936179, + "learning_rate": 1.883643166463804e-05, + "loss": 1.0966, + "step": 464 + }, + { + "epoch": 1.198837959974177, + "grad_norm": 0.49817074390497323, + "learning_rate": 1.882796942958216e-05, + "loss": 1.0786, + "step": 465 + }, + { + "epoch": 1.2014202711426727, + "grad_norm": 0.501779662690412, + "learning_rate": 1.8819478449487034e-05, + "loss": 1.0586, + "step": 466 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.5054852727426871, + "learning_rate": 1.8810958752000426e-05, + "loss": 1.0553, + "step": 467 + }, + { + "epoch": 1.2065848934796644, + "grad_norm": 0.5016427339348701, + "learning_rate": 1.8802410364863598e-05, + "loss": 1.0943, + "step": 468 + }, + { + "epoch": 1.20916720464816, + "grad_norm": 0.49091502171551293, + "learning_rate": 1.879383331591123e-05, + "loss": 1.1174, + "step": 469 + }, + { + "epoch": 1.2117495158166558, + "grad_norm": 0.5378342182934727, + "learning_rate": 1.8785227633071332e-05, + "loss": 1.0729, + "step": 470 + }, + { + "epoch": 1.2143318269851517, + "grad_norm": 0.5106316461830734, + "learning_rate": 1.877659334436515e-05, + "loss": 1.1085, + "step": 471 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.5098246784805628, + "learning_rate": 1.8767930477907074e-05, + "loss": 1.0957, + "step": 472 + }, + { + "epoch": 1.2194964493221434, + "grad_norm": 0.506477376675333, + "learning_rate": 1.875923906190454e-05, + "loss": 1.0918, + "step": 473 + }, + { + "epoch": 1.2220787604906391, + "grad_norm": 0.5004090128286083, + "learning_rate": 1.875051912465796e-05, + "loss": 1.1018, + "step": 474 + }, + { + "epoch": 1.2246610716591348, + "grad_norm": 0.5117533679075642, + "learning_rate": 1.8741770694560598e-05, + "loss": 1.0592, + "step": 475 + }, + { + "epoch": 1.2272433828276308, + "grad_norm": 0.5078739636230806, + "learning_rate": 1.873299380009851e-05, + "loss": 1.0705, + "step": 476 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5143902933122632, + "learning_rate": 1.8724188469850423e-05, + "loss": 1.0774, + "step": 477 + }, + { + "epoch": 1.2324080051646225, + "grad_norm": 0.49232344984603593, + "learning_rate": 1.871535473248766e-05, + "loss": 1.0548, + "step": 478 + }, + { + "epoch": 1.2349903163331182, + "grad_norm": 0.49486310972586456, + "learning_rate": 1.8706492616774043e-05, + "loss": 1.084, + "step": 479 + }, + { + "epoch": 1.237572627501614, + "grad_norm": 0.5178421153087599, + "learning_rate": 1.86976021515658e-05, + "loss": 1.0907, + "step": 480 + }, + { + "epoch": 1.2401549386701098, + "grad_norm": 0.524983289209404, + "learning_rate": 1.8688683365811456e-05, + "loss": 1.0863, + "step": 481 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5045481433976463, + "learning_rate": 1.867973628855177e-05, + "loss": 1.1053, + "step": 482 + }, + { + "epoch": 1.2453195610071013, + "grad_norm": 0.5091501092813715, + "learning_rate": 1.8670760948919608e-05, + "loss": 1.1194, + "step": 483 + }, + { + "epoch": 1.2479018721755972, + "grad_norm": 0.49967798365538585, + "learning_rate": 1.8661757376139858e-05, + "loss": 1.0845, + "step": 484 + }, + { + "epoch": 1.250484183344093, + "grad_norm": 0.4980457942019374, + "learning_rate": 1.865272559952936e-05, + "loss": 1.0601, + "step": 485 + }, + { + "epoch": 1.2530664945125887, + "grad_norm": 0.4972111559723752, + "learning_rate": 1.864366564849677e-05, + "loss": 1.0907, + "step": 486 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5065337348249978, + "learning_rate": 1.8634577552542492e-05, + "loss": 1.0795, + "step": 487 + }, + { + "epoch": 1.2582311168495803, + "grad_norm": 0.513607026130961, + "learning_rate": 1.862546134125857e-05, + "loss": 1.0436, + "step": 488 + }, + { + "epoch": 1.2608134280180763, + "grad_norm": 0.5225707833426417, + "learning_rate": 1.86163170443286e-05, + "loss": 1.0857, + "step": 489 + }, + { + "epoch": 1.263395739186572, + "grad_norm": 0.5022229029307232, + "learning_rate": 1.860714469152762e-05, + "loss": 1.0872, + "step": 490 + }, + { + "epoch": 1.2659780503550677, + "grad_norm": 0.5010606447774606, + "learning_rate": 1.859794431272203e-05, + "loss": 1.1187, + "step": 491 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5014036896762357, + "learning_rate": 1.8588715937869487e-05, + "loss": 1.0601, + "step": 492 + }, + { + "epoch": 1.2711426726920594, + "grad_norm": 0.5062378400840666, + "learning_rate": 1.8579459597018798e-05, + "loss": 1.0733, + "step": 493 + }, + { + "epoch": 1.2737249838605553, + "grad_norm": 0.49322055025801476, + "learning_rate": 1.857017532030984e-05, + "loss": 1.081, + "step": 494 + }, + { + "epoch": 1.276307295029051, + "grad_norm": 0.4985978212218064, + "learning_rate": 1.8560863137973447e-05, + "loss": 1.0824, + "step": 495 + }, + { + "epoch": 1.2788896061975468, + "grad_norm": 0.5035778345625428, + "learning_rate": 1.8551523080331324e-05, + "loss": 1.0875, + "step": 496 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.5035572503139835, + "learning_rate": 1.854215517779593e-05, + "loss": 1.085, + "step": 497 + }, + { + "epoch": 1.2840542285345384, + "grad_norm": 0.5464525852922026, + "learning_rate": 1.8532759460870407e-05, + "loss": 1.0964, + "step": 498 + }, + { + "epoch": 1.2866365397030342, + "grad_norm": 0.5069778816485006, + "learning_rate": 1.8523335960148446e-05, + "loss": 1.1123, + "step": 499 + }, + { + "epoch": 1.28921885087153, + "grad_norm": 0.5106529164099722, + "learning_rate": 1.8513884706314224e-05, + "loss": 1.0752, + "step": 500 + }, + { + "epoch": 1.2918011620400258, + "grad_norm": 0.5009571583608718, + "learning_rate": 1.8504405730142267e-05, + "loss": 1.0549, + "step": 501 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.4969158537205058, + "learning_rate": 1.849489906249739e-05, + "loss": 1.0756, + "step": 502 + }, + { + "epoch": 1.2969657843770175, + "grad_norm": 0.5336056726258084, + "learning_rate": 1.8485364734334555e-05, + "loss": 1.0765, + "step": 503 + }, + { + "epoch": 1.2995480955455132, + "grad_norm": 0.5171474777619819, + "learning_rate": 1.84758027766988e-05, + "loss": 1.0919, + "step": 504 + }, + { + "epoch": 1.3021304067140091, + "grad_norm": 0.5120748195398405, + "learning_rate": 1.8466213220725133e-05, + "loss": 1.092, + "step": 505 + }, + { + "epoch": 1.3047127178825049, + "grad_norm": 0.5018571042434719, + "learning_rate": 1.8456596097638414e-05, + "loss": 1.0857, + "step": 506 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5230975347071961, + "learning_rate": 1.8446951438753272e-05, + "loss": 1.0498, + "step": 507 + }, + { + "epoch": 1.3098773402194963, + "grad_norm": 0.5217912907178898, + "learning_rate": 1.8437279275474e-05, + "loss": 1.0744, + "step": 508 + }, + { + "epoch": 1.3124596513879923, + "grad_norm": 0.5317696621552458, + "learning_rate": 1.8427579639294436e-05, + "loss": 1.0914, + "step": 509 + }, + { + "epoch": 1.315041962556488, + "grad_norm": 0.49394980658380616, + "learning_rate": 1.841785256179789e-05, + "loss": 1.1051, + "step": 510 + }, + { + "epoch": 1.317624273724984, + "grad_norm": 0.5217876781301438, + "learning_rate": 1.840809807465701e-05, + "loss": 1.1077, + "step": 511 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.49490066616909967, + "learning_rate": 1.839831620963371e-05, + "loss": 1.0961, + "step": 512 + }, + { + "epoch": 1.3227888960619754, + "grad_norm": 0.5083891511636687, + "learning_rate": 1.8388506998579025e-05, + "loss": 1.0532, + "step": 513 + }, + { + "epoch": 1.3253712072304713, + "grad_norm": 0.5242811566268283, + "learning_rate": 1.837867047343306e-05, + "loss": 1.1178, + "step": 514 + }, + { + "epoch": 1.327953518398967, + "grad_norm": 0.5281173798352883, + "learning_rate": 1.8368806666224836e-05, + "loss": 1.1301, + "step": 515 + }, + { + "epoch": 1.330535829567463, + "grad_norm": 0.5022194856315405, + "learning_rate": 1.8358915609072223e-05, + "loss": 1.0753, + "step": 516 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.5275458989077836, + "learning_rate": 1.8348997334181815e-05, + "loss": 1.0857, + "step": 517 + }, + { + "epoch": 1.3357004519044544, + "grad_norm": 0.5261446317192925, + "learning_rate": 1.833905187384883e-05, + "loss": 1.0724, + "step": 518 + }, + { + "epoch": 1.3382827630729504, + "grad_norm": 0.4909880164376135, + "learning_rate": 1.8329079260457e-05, + "loss": 1.0867, + "step": 519 + }, + { + "epoch": 1.340865074241446, + "grad_norm": 0.538007883447562, + "learning_rate": 1.8319079526478487e-05, + "loss": 1.1477, + "step": 520 + }, + { + "epoch": 1.343447385409942, + "grad_norm": 0.5259603760732874, + "learning_rate": 1.830905270447374e-05, + "loss": 1.0871, + "step": 521 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5210088026374243, + "learning_rate": 1.829899882709143e-05, + "loss": 1.0548, + "step": 522 + }, + { + "epoch": 1.3486120077469335, + "grad_norm": 0.5274100435358815, + "learning_rate": 1.8288917927068315e-05, + "loss": 1.0874, + "step": 523 + }, + { + "epoch": 1.3511943189154292, + "grad_norm": 0.509881963592761, + "learning_rate": 1.8278810037229134e-05, + "loss": 1.0692, + "step": 524 + }, + { + "epoch": 1.3537766300839251, + "grad_norm": 0.5425562492866538, + "learning_rate": 1.8268675190486524e-05, + "loss": 1.0896, + "step": 525 + }, + { + "epoch": 1.3563589412524208, + "grad_norm": 0.5172299722903951, + "learning_rate": 1.825851341984089e-05, + "loss": 1.0332, + "step": 526 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.5113811516912794, + "learning_rate": 1.82483247583803e-05, + "loss": 1.1021, + "step": 527 + }, + { + "epoch": 1.3615235635894125, + "grad_norm": 0.4972938127748451, + "learning_rate": 1.8238109239280393e-05, + "loss": 1.0932, + "step": 528 + }, + { + "epoch": 1.3641058747579082, + "grad_norm": 0.516789056255083, + "learning_rate": 1.822786689580425e-05, + "loss": 1.0544, + "step": 529 + }, + { + "epoch": 1.3666881859264042, + "grad_norm": 0.55011290621819, + "learning_rate": 1.8217597761302298e-05, + "loss": 1.1075, + "step": 530 + }, + { + "epoch": 1.3692704970949, + "grad_norm": 0.5273282578927786, + "learning_rate": 1.8207301869212207e-05, + "loss": 1.0642, + "step": 531 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5212309533973544, + "learning_rate": 1.8196979253058765e-05, + "loss": 1.1039, + "step": 532 + }, + { + "epoch": 1.3744351194318916, + "grad_norm": 0.5234164457393133, + "learning_rate": 1.8186629946453774e-05, + "loss": 1.0697, + "step": 533 + }, + { + "epoch": 1.3770174306003873, + "grad_norm": 0.5143378010325881, + "learning_rate": 1.8176253983095958e-05, + "loss": 1.059, + "step": 534 + }, + { + "epoch": 1.3795997417688832, + "grad_norm": 0.5202992098003197, + "learning_rate": 1.816585139677082e-05, + "loss": 1.0832, + "step": 535 + }, + { + "epoch": 1.382182052937379, + "grad_norm": 0.5137267251756589, + "learning_rate": 1.8155422221350566e-05, + "loss": 1.077, + "step": 536 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.5206482372388472, + "learning_rate": 1.8144966490793973e-05, + "loss": 1.0808, + "step": 537 + }, + { + "epoch": 1.3873466752743706, + "grad_norm": 0.4977846119013132, + "learning_rate": 1.813448423914629e-05, + "loss": 1.0889, + "step": 538 + }, + { + "epoch": 1.3899289864428663, + "grad_norm": 0.5038953337541946, + "learning_rate": 1.8123975500539114e-05, + "loss": 1.0517, + "step": 539 + }, + { + "epoch": 1.392511297611362, + "grad_norm": 0.48358165460230973, + "learning_rate": 1.811344030919029e-05, + "loss": 1.0637, + "step": 540 + }, + { + "epoch": 1.395093608779858, + "grad_norm": 0.5010075364080836, + "learning_rate": 1.8102878699403804e-05, + "loss": 1.0718, + "step": 541 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.5031248131184225, + "learning_rate": 1.8092290705569655e-05, + "loss": 1.08, + "step": 542 + }, + { + "epoch": 1.4002582311168497, + "grad_norm": 0.4906664062118931, + "learning_rate": 1.8081676362163757e-05, + "loss": 1.0582, + "step": 543 + }, + { + "epoch": 1.4028405422853454, + "grad_norm": 0.5121501864467626, + "learning_rate": 1.8071035703747816e-05, + "loss": 1.0751, + "step": 544 + }, + { + "epoch": 1.405422853453841, + "grad_norm": 0.524910024204937, + "learning_rate": 1.806036876496923e-05, + "loss": 1.0547, + "step": 545 + }, + { + "epoch": 1.408005164622337, + "grad_norm": 0.5209029065997814, + "learning_rate": 1.8049675580560965e-05, + "loss": 1.1205, + "step": 546 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.48220028684533556, + "learning_rate": 1.8038956185341452e-05, + "loss": 1.0426, + "step": 547 + }, + { + "epoch": 1.4131697869593287, + "grad_norm": 0.503658857829128, + "learning_rate": 1.8028210614214458e-05, + "loss": 1.0772, + "step": 548 + }, + { + "epoch": 1.4157520981278244, + "grad_norm": 0.5055334860886664, + "learning_rate": 1.8017438902168987e-05, + "loss": 1.0702, + "step": 549 + }, + { + "epoch": 1.4183344092963202, + "grad_norm": 0.5171324204492811, + "learning_rate": 1.800664108427917e-05, + "loss": 1.0609, + "step": 550 + }, + { + "epoch": 1.4209167204648159, + "grad_norm": 0.5189722895903278, + "learning_rate": 1.799581719570412e-05, + "loss": 1.0755, + "step": 551 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.5243160862800866, + "learning_rate": 1.798496727168787e-05, + "loss": 1.0739, + "step": 552 + }, + { + "epoch": 1.4260813428018075, + "grad_norm": 0.5317337215399274, + "learning_rate": 1.7974091347559197e-05, + "loss": 1.0711, + "step": 553 + }, + { + "epoch": 1.4286636539703035, + "grad_norm": 0.49461538156213314, + "learning_rate": 1.796318945873156e-05, + "loss": 1.1056, + "step": 554 + }, + { + "epoch": 1.4312459651387992, + "grad_norm": 0.5212170098454869, + "learning_rate": 1.795226164070296e-05, + "loss": 1.1166, + "step": 555 + }, + { + "epoch": 1.433828276307295, + "grad_norm": 0.5384906886276548, + "learning_rate": 1.7941307929055813e-05, + "loss": 1.0836, + "step": 556 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.538099607308631, + "learning_rate": 1.7930328359456856e-05, + "loss": 1.0563, + "step": 557 + }, + { + "epoch": 1.4389928986442866, + "grad_norm": 0.508125640639657, + "learning_rate": 1.791932296765703e-05, + "loss": 1.0862, + "step": 558 + }, + { + "epoch": 1.4415752098127825, + "grad_norm": 0.5258022272020693, + "learning_rate": 1.7908291789491348e-05, + "loss": 1.0947, + "step": 559 + }, + { + "epoch": 1.4441575209812783, + "grad_norm": 0.5292716445438939, + "learning_rate": 1.7897234860878783e-05, + "loss": 1.0953, + "step": 560 + }, + { + "epoch": 1.446739832149774, + "grad_norm": 0.4919901732224947, + "learning_rate": 1.7886152217822173e-05, + "loss": 1.0589, + "step": 561 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.540399625807644, + "learning_rate": 1.7875043896408065e-05, + "loss": 1.0868, + "step": 562 + }, + { + "epoch": 1.4519044544867656, + "grad_norm": 0.5157274204814702, + "learning_rate": 1.7863909932806632e-05, + "loss": 1.055, + "step": 563 + }, + { + "epoch": 1.4544867656552616, + "grad_norm": 0.5157064713943694, + "learning_rate": 1.785275036327153e-05, + "loss": 1.0743, + "step": 564 + }, + { + "epoch": 1.4570690768237573, + "grad_norm": 0.5093111094768367, + "learning_rate": 1.7841565224139798e-05, + "loss": 1.0885, + "step": 565 + }, + { + "epoch": 1.459651387992253, + "grad_norm": 0.48350148375931845, + "learning_rate": 1.783035455183174e-05, + "loss": 1.0795, + "step": 566 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.5104024923814483, + "learning_rate": 1.781911838285078e-05, + "loss": 1.0691, + "step": 567 + }, + { + "epoch": 1.4648160103292447, + "grad_norm": 0.5257774613135558, + "learning_rate": 1.7807856753783387e-05, + "loss": 1.0836, + "step": 568 + }, + { + "epoch": 1.4673983214977404, + "grad_norm": 0.5133170260226599, + "learning_rate": 1.7796569701298906e-05, + "loss": 1.111, + "step": 569 + }, + { + "epoch": 1.4699806326662364, + "grad_norm": 0.4960488771792151, + "learning_rate": 1.778525726214949e-05, + "loss": 1.0913, + "step": 570 + }, + { + "epoch": 1.472562943834732, + "grad_norm": 0.4923055735581868, + "learning_rate": 1.7773919473169933e-05, + "loss": 1.0585, + "step": 571 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.5127181992354112, + "learning_rate": 1.7762556371277578e-05, + "loss": 1.0647, + "step": 572 + }, + { + "epoch": 1.4777275661717237, + "grad_norm": 0.5282994414831952, + "learning_rate": 1.7751167993472198e-05, + "loss": 1.1137, + "step": 573 + }, + { + "epoch": 1.4803098773402195, + "grad_norm": 0.5248953393475492, + "learning_rate": 1.7739754376835858e-05, + "loss": 1.0999, + "step": 574 + }, + { + "epoch": 1.4828921885087154, + "grad_norm": 0.5281077411991068, + "learning_rate": 1.7728315558532806e-05, + "loss": 1.0953, + "step": 575 + }, + { + "epoch": 1.4854744996772111, + "grad_norm": 0.4914770889754758, + "learning_rate": 1.7716851575809354e-05, + "loss": 1.1072, + "step": 576 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5218556469624681, + "learning_rate": 1.770536246599375e-05, + "loss": 1.0899, + "step": 577 + }, + { + "epoch": 1.4906391220142028, + "grad_norm": 0.5239346375890538, + "learning_rate": 1.769384826649606e-05, + "loss": 1.0779, + "step": 578 + }, + { + "epoch": 1.4932214331826985, + "grad_norm": 0.5538774540635639, + "learning_rate": 1.7682309014808043e-05, + "loss": 1.0503, + "step": 579 + }, + { + "epoch": 1.4958037443511945, + "grad_norm": 0.5212453237405811, + "learning_rate": 1.7670744748503033e-05, + "loss": 1.1206, + "step": 580 + }, + { + "epoch": 1.4983860555196902, + "grad_norm": 0.5221975743446008, + "learning_rate": 1.7659155505235812e-05, + "loss": 1.0712, + "step": 581 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.49914883784122016, + "learning_rate": 1.76475413227425e-05, + "loss": 1.0649, + "step": 582 + }, + { + "epoch": 1.5035506778566816, + "grad_norm": 0.5000466728400638, + "learning_rate": 1.7635902238840408e-05, + "loss": 1.0621, + "step": 583 + }, + { + "epoch": 1.5061329890251776, + "grad_norm": 0.4994098902333712, + "learning_rate": 1.762423829142794e-05, + "loss": 1.0712, + "step": 584 + }, + { + "epoch": 1.5087153001936735, + "grad_norm": 0.5342342246357215, + "learning_rate": 1.7612549518484458e-05, + "loss": 1.1141, + "step": 585 + }, + { + "epoch": 1.5112976113621692, + "grad_norm": 0.5097744481641414, + "learning_rate": 1.7600835958070156e-05, + "loss": 1.1007, + "step": 586 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5079344339293564, + "learning_rate": 1.7589097648325936e-05, + "loss": 1.0814, + "step": 587 + }, + { + "epoch": 1.5164622336991607, + "grad_norm": 0.4902197192987524, + "learning_rate": 1.7577334627473295e-05, + "loss": 1.0589, + "step": 588 + }, + { + "epoch": 1.5190445448676564, + "grad_norm": 0.4893765216413381, + "learning_rate": 1.756554693381419e-05, + "loss": 1.0913, + "step": 589 + }, + { + "epoch": 1.5216268560361523, + "grad_norm": 0.5225764596558536, + "learning_rate": 1.755373460573091e-05, + "loss": 1.0923, + "step": 590 + }, + { + "epoch": 1.5242091672046483, + "grad_norm": 0.5152845982203591, + "learning_rate": 1.7541897681685967e-05, + "loss": 1.0946, + "step": 591 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5261227535805723, + "learning_rate": 1.7530036200221955e-05, + "loss": 1.1183, + "step": 592 + }, + { + "epoch": 1.5293737895416397, + "grad_norm": 0.49461625515890395, + "learning_rate": 1.7518150199961427e-05, + "loss": 1.0876, + "step": 593 + }, + { + "epoch": 1.5319561007101354, + "grad_norm": 0.5021228041031806, + "learning_rate": 1.7506239719606776e-05, + "loss": 1.0916, + "step": 594 + }, + { + "epoch": 1.5345384118786314, + "grad_norm": 0.503576565099223, + "learning_rate": 1.749430479794011e-05, + "loss": 1.0943, + "step": 595 + }, + { + "epoch": 1.5371207230471273, + "grad_norm": 0.5226270592481841, + "learning_rate": 1.7482345473823116e-05, + "loss": 1.1015, + "step": 596 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.537294451707703, + "learning_rate": 1.7470361786196938e-05, + "loss": 1.0954, + "step": 597 + }, + { + "epoch": 1.5422853453841188, + "grad_norm": 0.504661881274588, + "learning_rate": 1.7458353774082052e-05, + "loss": 1.0821, + "step": 598 + }, + { + "epoch": 1.5448676565526145, + "grad_norm": 0.49601719518902315, + "learning_rate": 1.7446321476578138e-05, + "loss": 1.0721, + "step": 599 + }, + { + "epoch": 1.5474499677211104, + "grad_norm": 0.5187763220017648, + "learning_rate": 1.743426493286395e-05, + "loss": 1.0507, + "step": 600 + }, + { + "epoch": 1.5500322788896062, + "grad_norm": 0.4989292387853037, + "learning_rate": 1.7422184182197197e-05, + "loss": 1.0897, + "step": 601 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.4843467686137247, + "learning_rate": 1.7410079263914406e-05, + "loss": 1.0631, + "step": 602 + }, + { + "epoch": 1.5551969012265978, + "grad_norm": 0.5271533786423233, + "learning_rate": 1.7397950217430794e-05, + "loss": 1.1022, + "step": 603 + }, + { + "epoch": 1.5577792123950935, + "grad_norm": 0.5088338681798537, + "learning_rate": 1.7385797082240147e-05, + "loss": 1.0839, + "step": 604 + }, + { + "epoch": 1.5603615235635893, + "grad_norm": 0.4853868656293585, + "learning_rate": 1.737361989791468e-05, + "loss": 1.0539, + "step": 605 + }, + { + "epoch": 1.5629438347320852, + "grad_norm": 0.49682074497284284, + "learning_rate": 1.7361418704104925e-05, + "loss": 1.089, + "step": 606 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.49391110999180715, + "learning_rate": 1.734919354053959e-05, + "loss": 1.0829, + "step": 607 + }, + { + "epoch": 1.5681084570690769, + "grad_norm": 0.5033926514159104, + "learning_rate": 1.733694444702542e-05, + "loss": 1.0882, + "step": 608 + }, + { + "epoch": 1.5706907682375726, + "grad_norm": 0.5120915845296581, + "learning_rate": 1.7324671463447092e-05, + "loss": 1.1071, + "step": 609 + }, + { + "epoch": 1.5732730794060683, + "grad_norm": 0.5027154616635228, + "learning_rate": 1.731237462976707e-05, + "loss": 1.0706, + "step": 610 + }, + { + "epoch": 1.5758553905745643, + "grad_norm": 0.5017997136995258, + "learning_rate": 1.7300053986025476e-05, + "loss": 1.0935, + "step": 611 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.4876968907410891, + "learning_rate": 1.7287709572339958e-05, + "loss": 1.0414, + "step": 612 + }, + { + "epoch": 1.581020012911556, + "grad_norm": 0.5124364267745315, + "learning_rate": 1.7275341428905564e-05, + "loss": 1.0569, + "step": 613 + }, + { + "epoch": 1.5836023240800516, + "grad_norm": 0.5192211728464061, + "learning_rate": 1.7262949595994606e-05, + "loss": 1.0761, + "step": 614 + }, + { + "epoch": 1.5861846352485474, + "grad_norm": 0.49024680943396437, + "learning_rate": 1.7250534113956543e-05, + "loss": 1.08, + "step": 615 + }, + { + "epoch": 1.5887669464170433, + "grad_norm": 0.4920460482587524, + "learning_rate": 1.7238095023217823e-05, + "loss": 1.0739, + "step": 616 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5124627096290069, + "learning_rate": 1.722563236428178e-05, + "loss": 1.0507, + "step": 617 + }, + { + "epoch": 1.593931568754035, + "grad_norm": 0.51198818397323, + "learning_rate": 1.721314617772849e-05, + "loss": 1.0922, + "step": 618 + }, + { + "epoch": 1.5965138799225307, + "grad_norm": 0.5062132347505444, + "learning_rate": 1.7200636504214618e-05, + "loss": 1.0374, + "step": 619 + }, + { + "epoch": 1.5990961910910264, + "grad_norm": 0.521687159299446, + "learning_rate": 1.7188103384473334e-05, + "loss": 1.064, + "step": 620 + }, + { + "epoch": 1.6016785022595221, + "grad_norm": 0.47867011946021426, + "learning_rate": 1.7175546859314126e-05, + "loss": 1.0988, + "step": 621 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.4894260917344886, + "learning_rate": 1.7162966969622713e-05, + "loss": 1.0709, + "step": 622 + }, + { + "epoch": 1.606843124596514, + "grad_norm": 0.5354553757656224, + "learning_rate": 1.7150363756360886e-05, + "loss": 1.1033, + "step": 623 + }, + { + "epoch": 1.6094254357650097, + "grad_norm": 0.5039991951669948, + "learning_rate": 1.713773726056637e-05, + "loss": 1.1001, + "step": 624 + }, + { + "epoch": 1.6120077469335055, + "grad_norm": 0.4980182271238221, + "learning_rate": 1.7125087523352718e-05, + "loss": 1.0788, + "step": 625 + }, + { + "epoch": 1.6145900581020012, + "grad_norm": 0.5322347706228192, + "learning_rate": 1.7112414585909146e-05, + "loss": 1.0673, + "step": 626 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.49515788566577773, + "learning_rate": 1.7099718489500426e-05, + "loss": 1.0818, + "step": 627 + }, + { + "epoch": 1.6197546804389928, + "grad_norm": 0.49847544874894734, + "learning_rate": 1.7086999275466727e-05, + "loss": 1.071, + "step": 628 + }, + { + "epoch": 1.6223369916074888, + "grad_norm": 0.4852829304995305, + "learning_rate": 1.7074256985223496e-05, + "loss": 1.0631, + "step": 629 + }, + { + "epoch": 1.6249193027759845, + "grad_norm": 0.49795661914754413, + "learning_rate": 1.706149166026132e-05, + "loss": 1.0876, + "step": 630 + }, + { + "epoch": 1.6275016139444802, + "grad_norm": 0.502622123586083, + "learning_rate": 1.7048703342145793e-05, + "loss": 1.0846, + "step": 631 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.49228802240874425, + "learning_rate": 1.7035892072517373e-05, + "loss": 1.1087, + "step": 632 + }, + { + "epoch": 1.632666236281472, + "grad_norm": 0.4971680566338004, + "learning_rate": 1.7023057893091254e-05, + "loss": 1.0768, + "step": 633 + }, + { + "epoch": 1.6352485474499678, + "grad_norm": 0.5131651796956291, + "learning_rate": 1.7010200845657222e-05, + "loss": 1.0899, + "step": 634 + }, + { + "epoch": 1.6378308586184636, + "grad_norm": 0.5074519176524721, + "learning_rate": 1.6997320972079536e-05, + "loss": 1.081, + "step": 635 + }, + { + "epoch": 1.6404131697869593, + "grad_norm": 0.5174545968485477, + "learning_rate": 1.6984418314296768e-05, + "loss": 1.0472, + "step": 636 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.5077381746768771, + "learning_rate": 1.697149291432168e-05, + "loss": 1.0926, + "step": 637 + }, + { + "epoch": 1.645577792123951, + "grad_norm": 0.5354282615337868, + "learning_rate": 1.6958544814241094e-05, + "loss": 1.0414, + "step": 638 + }, + { + "epoch": 1.6481601032924469, + "grad_norm": 0.52898951516764, + "learning_rate": 1.6945574056215742e-05, + "loss": 1.0973, + "step": 639 + }, + { + "epoch": 1.6507424144609426, + "grad_norm": 0.5009163137242975, + "learning_rate": 1.6932580682480124e-05, + "loss": 1.0826, + "step": 640 + }, + { + "epoch": 1.6533247256294383, + "grad_norm": 0.5066860393622376, + "learning_rate": 1.6919564735342398e-05, + "loss": 1.0836, + "step": 641 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.5418314242744041, + "learning_rate": 1.6906526257184206e-05, + "loss": 1.1132, + "step": 642 + }, + { + "epoch": 1.65848934796643, + "grad_norm": 0.4999534074892505, + "learning_rate": 1.689346529046057e-05, + "loss": 1.0818, + "step": 643 + }, + { + "epoch": 1.6610716591349257, + "grad_norm": 0.5017097366254959, + "learning_rate": 1.6880381877699717e-05, + "loss": 1.074, + "step": 644 + }, + { + "epoch": 1.6636539703034217, + "grad_norm": 0.5233206395612633, + "learning_rate": 1.686727606150299e-05, + "loss": 1.0628, + "step": 645 + }, + { + "epoch": 1.6662362814719174, + "grad_norm": 0.512780816400479, + "learning_rate": 1.6854147884544655e-05, + "loss": 1.0843, + "step": 646 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.49275381075866503, + "learning_rate": 1.68409973895718e-05, + "loss": 1.0843, + "step": 647 + }, + { + "epoch": 1.6714009038089088, + "grad_norm": 0.5496131300871087, + "learning_rate": 1.682782461940418e-05, + "loss": 1.0836, + "step": 648 + }, + { + "epoch": 1.6739832149774048, + "grad_norm": 0.512860264741888, + "learning_rate": 1.6814629616934078e-05, + "loss": 1.0743, + "step": 649 + }, + { + "epoch": 1.6765655261459007, + "grad_norm": 0.4975144838656257, + "learning_rate": 1.6801412425126183e-05, + "loss": 1.0864, + "step": 650 + }, + { + "epoch": 1.6791478373143964, + "grad_norm": 0.5014057129631031, + "learning_rate": 1.678817308701741e-05, + "loss": 1.0427, + "step": 651 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5234985943818525, + "learning_rate": 1.677491164571681e-05, + "loss": 1.1048, + "step": 652 + }, + { + "epoch": 1.6843124596513879, + "grad_norm": 0.5201679982476692, + "learning_rate": 1.6761628144405394e-05, + "loss": 1.064, + "step": 653 + }, + { + "epoch": 1.6868947708198838, + "grad_norm": 0.4846519175712527, + "learning_rate": 1.6748322626336e-05, + "loss": 1.0539, + "step": 654 + }, + { + "epoch": 1.6894770819883798, + "grad_norm": 0.5173646723613604, + "learning_rate": 1.6734995134833155e-05, + "loss": 1.1007, + "step": 655 + }, + { + "epoch": 1.6920593931568755, + "grad_norm": 0.5113183351556072, + "learning_rate": 1.6721645713292953e-05, + "loss": 1.0815, + "step": 656 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.5211205639308888, + "learning_rate": 1.670827440518287e-05, + "loss": 1.0837, + "step": 657 + }, + { + "epoch": 1.697224015493867, + "grad_norm": 0.5080702604570161, + "learning_rate": 1.6694881254041657e-05, + "loss": 1.1173, + "step": 658 + }, + { + "epoch": 1.6998063266623629, + "grad_norm": 0.4962653526436615, + "learning_rate": 1.6681466303479196e-05, + "loss": 1.0352, + "step": 659 + }, + { + "epoch": 1.7023886378308586, + "grad_norm": 0.5142414297852521, + "learning_rate": 1.6668029597176344e-05, + "loss": 1.0666, + "step": 660 + }, + { + "epoch": 1.7049709489993545, + "grad_norm": 0.4901838014123924, + "learning_rate": 1.66545711788848e-05, + "loss": 1.0816, + "step": 661 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.5141149184635171, + "learning_rate": 1.664109109242696e-05, + "loss": 1.0771, + "step": 662 + }, + { + "epoch": 1.710135571336346, + "grad_norm": 0.5172010165390014, + "learning_rate": 1.6627589381695763e-05, + "loss": 1.0752, + "step": 663 + }, + { + "epoch": 1.7127178825048417, + "grad_norm": 0.4963415308538906, + "learning_rate": 1.661406609065458e-05, + "loss": 1.1219, + "step": 664 + }, + { + "epoch": 1.7153001936733376, + "grad_norm": 0.4877763872520322, + "learning_rate": 1.6600521263337043e-05, + "loss": 1.058, + "step": 665 + }, + { + "epoch": 1.7178825048418336, + "grad_norm": 0.4969456781904556, + "learning_rate": 1.6586954943846895e-05, + "loss": 1.0834, + "step": 666 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.4815019660085988, + "learning_rate": 1.6573367176357876e-05, + "loss": 1.0618, + "step": 667 + }, + { + "epoch": 1.723047127178825, + "grad_norm": 0.4936696257730036, + "learning_rate": 1.6559758005113564e-05, + "loss": 1.0902, + "step": 668 + }, + { + "epoch": 1.7256294383473207, + "grad_norm": 0.4850729841607312, + "learning_rate": 1.6546127474427217e-05, + "loss": 1.0499, + "step": 669 + }, + { + "epoch": 1.7282117495158167, + "grad_norm": 0.48113300472686776, + "learning_rate": 1.653247562868166e-05, + "loss": 1.0682, + "step": 670 + }, + { + "epoch": 1.7307940606843124, + "grad_norm": 0.4814780954159902, + "learning_rate": 1.6518802512329105e-05, + "loss": 1.083, + "step": 671 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.5247380192600469, + "learning_rate": 1.6505108169891032e-05, + "loss": 1.093, + "step": 672 + }, + { + "epoch": 1.735958683021304, + "grad_norm": 0.4981848964288428, + "learning_rate": 1.6491392645958043e-05, + "loss": 1.0656, + "step": 673 + }, + { + "epoch": 1.7385409941897998, + "grad_norm": 0.5007815563313807, + "learning_rate": 1.6477655985189703e-05, + "loss": 1.0583, + "step": 674 + }, + { + "epoch": 1.7411233053582955, + "grad_norm": 0.4924655390382668, + "learning_rate": 1.6463898232314393e-05, + "loss": 1.0881, + "step": 675 + }, + { + "epoch": 1.7437056165267915, + "grad_norm": 0.4964112767549225, + "learning_rate": 1.6450119432129185e-05, + "loss": 1.0645, + "step": 676 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.48606768423741387, + "learning_rate": 1.6436319629499683e-05, + "loss": 1.0984, + "step": 677 + }, + { + "epoch": 1.7488702388637831, + "grad_norm": 0.5148244978248903, + "learning_rate": 1.642249886935987e-05, + "loss": 1.0668, + "step": 678 + }, + { + "epoch": 1.7514525500322788, + "grad_norm": 0.5005608398817017, + "learning_rate": 1.6408657196711977e-05, + "loss": 1.0253, + "step": 679 + }, + { + "epoch": 1.7540348612007746, + "grad_norm": 0.5231072514633008, + "learning_rate": 1.6394794656626325e-05, + "loss": 1.1069, + "step": 680 + }, + { + "epoch": 1.7566171723692705, + "grad_norm": 0.5158005487144547, + "learning_rate": 1.638091129424118e-05, + "loss": 1.1059, + "step": 681 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.5085221327854161, + "learning_rate": 1.6367007154762616e-05, + "loss": 1.0628, + "step": 682 + }, + { + "epoch": 1.7617817947062622, + "grad_norm": 0.49993960938301113, + "learning_rate": 1.6353082283464355e-05, + "loss": 1.0774, + "step": 683 + }, + { + "epoch": 1.764364105874758, + "grad_norm": 0.5136885732061924, + "learning_rate": 1.633913672568762e-05, + "loss": 1.0571, + "step": 684 + }, + { + "epoch": 1.7669464170432536, + "grad_norm": 0.5064861116191551, + "learning_rate": 1.6325170526841e-05, + "loss": 1.0927, + "step": 685 + }, + { + "epoch": 1.7695287282117496, + "grad_norm": 0.5065336598849988, + "learning_rate": 1.631118373240029e-05, + "loss": 1.0437, + "step": 686 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5071797770884724, + "learning_rate": 1.629717638790835e-05, + "loss": 1.058, + "step": 687 + }, + { + "epoch": 1.7746933505487412, + "grad_norm": 0.5032716603331865, + "learning_rate": 1.6283148538974943e-05, + "loss": 1.108, + "step": 688 + }, + { + "epoch": 1.777275661717237, + "grad_norm": 0.5168971680331349, + "learning_rate": 1.6269100231276617e-05, + "loss": 1.0967, + "step": 689 + }, + { + "epoch": 1.7798579728857327, + "grad_norm": 0.5328504423513274, + "learning_rate": 1.6255031510556513e-05, + "loss": 1.0755, + "step": 690 + }, + { + "epoch": 1.7824402840542284, + "grad_norm": 0.474134415576521, + "learning_rate": 1.6240942422624264e-05, + "loss": 1.0433, + "step": 691 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5062840676066106, + "learning_rate": 1.62268330133558e-05, + "loss": 1.0884, + "step": 692 + }, + { + "epoch": 1.7876049063912203, + "grad_norm": 0.5242892989776939, + "learning_rate": 1.6212703328693232e-05, + "loss": 1.0813, + "step": 693 + }, + { + "epoch": 1.790187217559716, + "grad_norm": 0.49294778164946207, + "learning_rate": 1.6198553414644687e-05, + "loss": 1.0589, + "step": 694 + }, + { + "epoch": 1.7927695287282117, + "grad_norm": 0.5333831319134179, + "learning_rate": 1.6184383317284163e-05, + "loss": 1.0803, + "step": 695 + }, + { + "epoch": 1.7953518398967074, + "grad_norm": 0.4848117805750976, + "learning_rate": 1.6170193082751372e-05, + "loss": 1.0651, + "step": 696 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.4912184014826424, + "learning_rate": 1.6155982757251605e-05, + "loss": 1.0805, + "step": 697 + }, + { + "epoch": 1.8005164622336993, + "grad_norm": 0.5209563606543747, + "learning_rate": 1.614175238705556e-05, + "loss": 1.0676, + "step": 698 + }, + { + "epoch": 1.803098773402195, + "grad_norm": 0.4983116660478031, + "learning_rate": 1.6127502018499216e-05, + "loss": 1.0523, + "step": 699 + }, + { + "epoch": 1.8056810845706908, + "grad_norm": 0.487693383153112, + "learning_rate": 1.6113231697983658e-05, + "loss": 1.0663, + "step": 700 + }, + { + "epoch": 1.8082633957391865, + "grad_norm": 0.5338041396304789, + "learning_rate": 1.6098941471974945e-05, + "loss": 1.1128, + "step": 701 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.5142298199439157, + "learning_rate": 1.608463138700395e-05, + "loss": 1.0712, + "step": 702 + }, + { + "epoch": 1.8134280180761781, + "grad_norm": 0.47630790480629104, + "learning_rate": 1.6070301489666203e-05, + "loss": 1.0988, + "step": 703 + }, + { + "epoch": 1.816010329244674, + "grad_norm": 0.4901104298299483, + "learning_rate": 1.6055951826621753e-05, + "loss": 1.0428, + "step": 704 + }, + { + "epoch": 1.8185926404131698, + "grad_norm": 0.5227604090573119, + "learning_rate": 1.6041582444595004e-05, + "loss": 1.0698, + "step": 705 + }, + { + "epoch": 1.8211749515816655, + "grad_norm": 0.5041405266487794, + "learning_rate": 1.602719339037457e-05, + "loss": 1.0753, + "step": 706 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.5093841266418548, + "learning_rate": 1.6012784710813122e-05, + "loss": 1.1189, + "step": 707 + }, + { + "epoch": 1.8263395739186572, + "grad_norm": 0.5166236437305157, + "learning_rate": 1.599835645282723e-05, + "loss": 1.07, + "step": 708 + }, + { + "epoch": 1.8289218850871531, + "grad_norm": 0.5238202604739227, + "learning_rate": 1.598390866339721e-05, + "loss": 1.0734, + "step": 709 + }, + { + "epoch": 1.8315041962556489, + "grad_norm": 0.5351507809671923, + "learning_rate": 1.5969441389566995e-05, + "loss": 1.0722, + "step": 710 + }, + { + "epoch": 1.8340865074241446, + "grad_norm": 0.48654357580846874, + "learning_rate": 1.5954954678443934e-05, + "loss": 1.0581, + "step": 711 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.48828499263183267, + "learning_rate": 1.5940448577198685e-05, + "loss": 1.0778, + "step": 712 + }, + { + "epoch": 1.8392511297611362, + "grad_norm": 0.49993618606120815, + "learning_rate": 1.5925923133065036e-05, + "loss": 1.0744, + "step": 713 + }, + { + "epoch": 1.841833440929632, + "grad_norm": 0.5060526028050449, + "learning_rate": 1.591137839333976e-05, + "loss": 1.0869, + "step": 714 + }, + { + "epoch": 1.844415752098128, + "grad_norm": 0.4854542346534205, + "learning_rate": 1.5896814405382455e-05, + "loss": 1.0734, + "step": 715 + }, + { + "epoch": 1.8469980632666236, + "grad_norm": 0.5120958440799618, + "learning_rate": 1.5882231216615405e-05, + "loss": 1.056, + "step": 716 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.4857556591454626, + "learning_rate": 1.58676288745234e-05, + "loss": 1.0502, + "step": 717 + }, + { + "epoch": 1.852162685603615, + "grad_norm": 0.49183704296233893, + "learning_rate": 1.5853007426653607e-05, + "loss": 1.116, + "step": 718 + }, + { + "epoch": 1.854744996772111, + "grad_norm": 0.4971020661357399, + "learning_rate": 1.5838366920615395e-05, + "loss": 1.0535, + "step": 719 + }, + { + "epoch": 1.857327307940607, + "grad_norm": 0.485071482170176, + "learning_rate": 1.5823707404080196e-05, + "loss": 1.0465, + "step": 720 + }, + { + "epoch": 1.8599096191091027, + "grad_norm": 0.48718149162761787, + "learning_rate": 1.5809028924781343e-05, + "loss": 1.0787, + "step": 721 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.4775709718268873, + "learning_rate": 1.5794331530513903e-05, + "loss": 1.0354, + "step": 722 + }, + { + "epoch": 1.8650742414460941, + "grad_norm": 0.5008952740743758, + "learning_rate": 1.577961526913455e-05, + "loss": 1.0602, + "step": 723 + }, + { + "epoch": 1.86765655261459, + "grad_norm": 0.5064643531485886, + "learning_rate": 1.5764880188561376e-05, + "loss": 1.1178, + "step": 724 + }, + { + "epoch": 1.870238863783086, + "grad_norm": 0.49848568260978515, + "learning_rate": 1.5750126336773755e-05, + "loss": 1.0422, + "step": 725 + }, + { + "epoch": 1.8728211749515817, + "grad_norm": 0.5220419002346904, + "learning_rate": 1.5735353761812197e-05, + "loss": 1.057, + "step": 726 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.4971228637528045, + "learning_rate": 1.5720562511778156e-05, + "loss": 1.0556, + "step": 727 + }, + { + "epoch": 1.8779857972885732, + "grad_norm": 0.48640138793502713, + "learning_rate": 1.5705752634833908e-05, + "loss": 1.0857, + "step": 728 + }, + { + "epoch": 1.8805681084570691, + "grad_norm": 0.5037467924591017, + "learning_rate": 1.5690924179202375e-05, + "loss": 1.0581, + "step": 729 + }, + { + "epoch": 1.8831504196255648, + "grad_norm": 0.5282496443218059, + "learning_rate": 1.5676077193166973e-05, + "loss": 1.0799, + "step": 730 + }, + { + "epoch": 1.8857327307940608, + "grad_norm": 0.47021596749068756, + "learning_rate": 1.5661211725071457e-05, + "loss": 1.0352, + "step": 731 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.513391496193585, + "learning_rate": 1.5646327823319765e-05, + "loss": 1.1031, + "step": 732 + }, + { + "epoch": 1.8908973531310522, + "grad_norm": 0.5186791214030437, + "learning_rate": 1.5631425536375858e-05, + "loss": 1.0849, + "step": 733 + }, + { + "epoch": 1.893479664299548, + "grad_norm": 0.4945851612291226, + "learning_rate": 1.5616504912763554e-05, + "loss": 1.0513, + "step": 734 + }, + { + "epoch": 1.896061975468044, + "grad_norm": 0.4805032305818217, + "learning_rate": 1.5601566001066384e-05, + "loss": 1.0388, + "step": 735 + }, + { + "epoch": 1.8986442866365398, + "grad_norm": 0.49355307137386584, + "learning_rate": 1.5586608849927424e-05, + "loss": 1.0729, + "step": 736 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.4868522623792675, + "learning_rate": 1.5571633508049148e-05, + "loss": 1.0472, + "step": 737 + }, + { + "epoch": 1.9038089089735313, + "grad_norm": 0.5064645951323721, + "learning_rate": 1.5556640024193245e-05, + "loss": 1.0592, + "step": 738 + }, + { + "epoch": 1.906391220142027, + "grad_norm": 0.4935670987750482, + "learning_rate": 1.5541628447180494e-05, + "loss": 1.0567, + "step": 739 + }, + { + "epoch": 1.908973531310523, + "grad_norm": 0.5087284962422527, + "learning_rate": 1.552659882589058e-05, + "loss": 1.0544, + "step": 740 + }, + { + "epoch": 1.9115558424790189, + "grad_norm": 0.503398066954607, + "learning_rate": 1.551155120926194e-05, + "loss": 1.0416, + "step": 741 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.5248315003621526, + "learning_rate": 1.5496485646291613e-05, + "loss": 1.0821, + "step": 742 + }, + { + "epoch": 1.9167204648160103, + "grad_norm": 0.509374102002012, + "learning_rate": 1.548140218603507e-05, + "loss": 1.1231, + "step": 743 + }, + { + "epoch": 1.919302775984506, + "grad_norm": 0.4763357005641916, + "learning_rate": 1.5466300877606054e-05, + "loss": 1.0557, + "step": 744 + }, + { + "epoch": 1.921885087153002, + "grad_norm": 0.5156095352624543, + "learning_rate": 1.5451181770176434e-05, + "loss": 1.102, + "step": 745 + }, + { + "epoch": 1.9244673983214977, + "grad_norm": 0.5116973644648233, + "learning_rate": 1.543604491297602e-05, + "loss": 1.1098, + "step": 746 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.5093190129624484, + "learning_rate": 1.5420890355292435e-05, + "loss": 1.0528, + "step": 747 + }, + { + "epoch": 1.9296320206584894, + "grad_norm": 0.506225349532536, + "learning_rate": 1.5405718146470926e-05, + "loss": 1.0607, + "step": 748 + }, + { + "epoch": 1.932214331826985, + "grad_norm": 0.5271647344306659, + "learning_rate": 1.5390528335914216e-05, + "loss": 1.1065, + "step": 749 + }, + { + "epoch": 1.9347966429954808, + "grad_norm": 0.5309546928129137, + "learning_rate": 1.5375320973082346e-05, + "loss": 1.0818, + "step": 750 + }, + { + "epoch": 1.9373789541639768, + "grad_norm": 0.5187838315898233, + "learning_rate": 1.53600961074925e-05, + "loss": 1.0614, + "step": 751 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.4999420933843893, + "learning_rate": 1.5344853788718867e-05, + "loss": 1.0385, + "step": 752 + }, + { + "epoch": 1.9425435765009684, + "grad_norm": 0.5449765150372478, + "learning_rate": 1.532959406639245e-05, + "loss": 1.0324, + "step": 753 + }, + { + "epoch": 1.9451258876694641, + "grad_norm": 0.48682953533866824, + "learning_rate": 1.5314316990200933e-05, + "loss": 1.0302, + "step": 754 + }, + { + "epoch": 1.9477081988379599, + "grad_norm": 0.49226909739556324, + "learning_rate": 1.5299022609888507e-05, + "loss": 1.1016, + "step": 755 + }, + { + "epoch": 1.9502905100064558, + "grad_norm": 0.5347878649288165, + "learning_rate": 1.5283710975255695e-05, + "loss": 1.0843, + "step": 756 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5028835492330221, + "learning_rate": 1.5268382136159213e-05, + "loss": 1.0832, + "step": 757 + }, + { + "epoch": 1.9554551323434475, + "grad_norm": 0.5020788778708613, + "learning_rate": 1.5253036142511794e-05, + "loss": 1.0554, + "step": 758 + }, + { + "epoch": 1.9580374435119432, + "grad_norm": 0.5117826424662124, + "learning_rate": 1.5237673044282028e-05, + "loss": 1.0407, + "step": 759 + }, + { + "epoch": 1.960619754680439, + "grad_norm": 0.486751220436105, + "learning_rate": 1.5222292891494204e-05, + "loss": 1.1028, + "step": 760 + }, + { + "epoch": 1.9632020658489346, + "grad_norm": 0.5042620665613498, + "learning_rate": 1.5206895734228133e-05, + "loss": 1.1089, + "step": 761 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.4985230564456094, + "learning_rate": 1.5191481622619006e-05, + "loss": 1.0892, + "step": 762 + }, + { + "epoch": 1.9683666881859265, + "grad_norm": 0.46564788925434075, + "learning_rate": 1.5176050606857211e-05, + "loss": 1.0687, + "step": 763 + }, + { + "epoch": 1.9709489993544222, + "grad_norm": 0.4799444405216457, + "learning_rate": 1.5160602737188184e-05, + "loss": 1.0627, + "step": 764 + }, + { + "epoch": 1.973531310522918, + "grad_norm": 0.517680238719764, + "learning_rate": 1.514513806391224e-05, + "loss": 1.1087, + "step": 765 + }, + { + "epoch": 1.9761136216914137, + "grad_norm": 0.4945356852520951, + "learning_rate": 1.5129656637384398e-05, + "loss": 1.0333, + "step": 766 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.47246161399068515, + "learning_rate": 1.5114158508014244e-05, + "loss": 1.0622, + "step": 767 + }, + { + "epoch": 1.9812782440284056, + "grad_norm": 0.4792556964609251, + "learning_rate": 1.509864372626574e-05, + "loss": 1.0807, + "step": 768 + }, + { + "epoch": 1.9838605551969013, + "grad_norm": 0.4942144710838991, + "learning_rate": 1.5083112342657071e-05, + "loss": 1.088, + "step": 769 + }, + { + "epoch": 1.986442866365397, + "grad_norm": 0.5201249256236419, + "learning_rate": 1.5067564407760485e-05, + "loss": 1.0938, + "step": 770 + }, + { + "epoch": 1.9890251775338927, + "grad_norm": 0.4829123624901927, + "learning_rate": 1.5051999972202118e-05, + "loss": 1.0353, + "step": 771 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.5024498746492575, + "learning_rate": 1.5036419086661837e-05, + "loss": 1.0802, + "step": 772 + }, + { + "epoch": 1.9941897998708844, + "grad_norm": 0.4963413917672638, + "learning_rate": 1.5020821801873072e-05, + "loss": 1.0801, + "step": 773 + }, + { + "epoch": 1.9967721110393803, + "grad_norm": 0.4872864553127849, + "learning_rate": 1.5005208168622649e-05, + "loss": 1.0509, + "step": 774 + }, + { + "epoch": 1.999354422207876, + "grad_norm": 0.47707633602130245, + "learning_rate": 1.4989578237750628e-05, + "loss": 1.0485, + "step": 775 + }, + { + "epoch": 2.0, + "grad_norm": 0.47707633602130245, + "learning_rate": 1.4973932060150142e-05, + "loss": 1.0293, + "step": 776 + }, + { + "epoch": 2.0025823111684957, + "grad_norm": 1.1941485826343992, + "learning_rate": 1.4958269686767214e-05, + "loss": 0.9552, + "step": 777 + }, + { + "epoch": 2.0051646223369914, + "grad_norm": 0.7512477337089716, + "learning_rate": 1.4942591168600616e-05, + "loss": 0.9653, + "step": 778 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6413401233120336, + "learning_rate": 1.4926896556701676e-05, + "loss": 0.9713, + "step": 779 + }, + { + "epoch": 2.0103292446739833, + "grad_norm": 0.8265225982878504, + "learning_rate": 1.4911185902174134e-05, + "loss": 0.9674, + "step": 780 + }, + { + "epoch": 2.012911555842479, + "grad_norm": 0.7690019968988396, + "learning_rate": 1.4895459256173966e-05, + "loss": 0.9701, + "step": 781 + }, + { + "epoch": 2.0154938670109748, + "grad_norm": 0.6695923734742643, + "learning_rate": 1.4879716669909215e-05, + "loss": 0.9262, + "step": 782 + }, + { + "epoch": 2.0180761781794705, + "grad_norm": 0.6968627838761038, + "learning_rate": 1.4863958194639828e-05, + "loss": 0.9738, + "step": 783 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.7154538295447892, + "learning_rate": 1.4848183881677497e-05, + "loss": 0.9537, + "step": 784 + }, + { + "epoch": 2.0232408005164624, + "grad_norm": 0.6599822904047927, + "learning_rate": 1.4832393782385475e-05, + "loss": 0.9428, + "step": 785 + }, + { + "epoch": 2.025823111684958, + "grad_norm": 0.6785737464207784, + "learning_rate": 1.4816587948178411e-05, + "loss": 0.9377, + "step": 786 + }, + { + "epoch": 2.028405422853454, + "grad_norm": 0.6659927550215519, + "learning_rate": 1.4800766430522208e-05, + "loss": 0.9477, + "step": 787 + }, + { + "epoch": 2.0309877340219495, + "grad_norm": 0.6453008064623791, + "learning_rate": 1.4784929280933819e-05, + "loss": 0.9734, + "step": 788 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.6138501650256379, + "learning_rate": 1.4769076550981107e-05, + "loss": 0.9485, + "step": 789 + }, + { + "epoch": 2.0361523563589414, + "grad_norm": 0.628292728528221, + "learning_rate": 1.4753208292282666e-05, + "loss": 0.9373, + "step": 790 + }, + { + "epoch": 2.038734667527437, + "grad_norm": 0.6518720740962953, + "learning_rate": 1.4737324556507639e-05, + "loss": 0.9854, + "step": 791 + }, + { + "epoch": 2.041316978695933, + "grad_norm": 0.6417366086713475, + "learning_rate": 1.472142539537559e-05, + "loss": 0.9668, + "step": 792 + }, + { + "epoch": 2.0438992898644286, + "grad_norm": 0.6525413690852432, + "learning_rate": 1.4705510860656289e-05, + "loss": 0.9429, + "step": 793 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.6126084056396112, + "learning_rate": 1.4689581004169573e-05, + "loss": 0.9828, + "step": 794 + }, + { + "epoch": 2.0490639122014205, + "grad_norm": 0.6169432282421197, + "learning_rate": 1.4673635877785168e-05, + "loss": 0.9522, + "step": 795 + }, + { + "epoch": 2.051646223369916, + "grad_norm": 0.5922213822722046, + "learning_rate": 1.4657675533422517e-05, + "loss": 0.9478, + "step": 796 + }, + { + "epoch": 2.054228534538412, + "grad_norm": 0.6211700530426607, + "learning_rate": 1.4641700023050625e-05, + "loss": 0.9325, + "step": 797 + }, + { + "epoch": 2.0568108457069076, + "grad_norm": 0.6173048324016761, + "learning_rate": 1.4625709398687862e-05, + "loss": 0.9477, + "step": 798 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5866193216584925, + "learning_rate": 1.4609703712401832e-05, + "loss": 0.9378, + "step": 799 + }, + { + "epoch": 2.061975468043899, + "grad_norm": 0.6128450747778748, + "learning_rate": 1.4593683016309168e-05, + "loss": 0.9785, + "step": 800 + }, + { + "epoch": 2.0645577792123952, + "grad_norm": 0.582652343191567, + "learning_rate": 1.4577647362575378e-05, + "loss": 0.9318, + "step": 801 + }, + { + "epoch": 2.067140090380891, + "grad_norm": 0.5589523161558311, + "learning_rate": 1.4561596803414681e-05, + "loss": 0.9295, + "step": 802 + }, + { + "epoch": 2.0697224015493867, + "grad_norm": 0.5777811559605781, + "learning_rate": 1.4545531391089826e-05, + "loss": 0.9606, + "step": 803 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.5932336435576502, + "learning_rate": 1.4529451177911926e-05, + "loss": 0.973, + "step": 804 + }, + { + "epoch": 2.074887023886378, + "grad_norm": 0.5467403526396694, + "learning_rate": 1.4513356216240287e-05, + "loss": 0.8862, + "step": 805 + }, + { + "epoch": 2.0774693350548743, + "grad_norm": 0.5667654235524354, + "learning_rate": 1.449724655848224e-05, + "loss": 0.9484, + "step": 806 + }, + { + "epoch": 2.08005164622337, + "grad_norm": 0.557027598681585, + "learning_rate": 1.4481122257092966e-05, + "loss": 0.9537, + "step": 807 + }, + { + "epoch": 2.0826339573918657, + "grad_norm": 0.5736006444800077, + "learning_rate": 1.4464983364575327e-05, + "loss": 0.9644, + "step": 808 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.5911533935183022, + "learning_rate": 1.44488299334797e-05, + "loss": 0.9547, + "step": 809 + }, + { + "epoch": 2.087798579728857, + "grad_norm": 0.5854788104570025, + "learning_rate": 1.44326620164038e-05, + "loss": 0.9316, + "step": 810 + }, + { + "epoch": 2.090380890897353, + "grad_norm": 0.5885109768704322, + "learning_rate": 1.4416479665992507e-05, + "loss": 0.9468, + "step": 811 + }, + { + "epoch": 2.092963202065849, + "grad_norm": 0.5860083582206628, + "learning_rate": 1.4400282934937702e-05, + "loss": 0.9597, + "step": 812 + }, + { + "epoch": 2.095545513234345, + "grad_norm": 0.5612799121488241, + "learning_rate": 1.4384071875978085e-05, + "loss": 0.9291, + "step": 813 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.5760413758972827, + "learning_rate": 1.4367846541899017e-05, + "loss": 0.9434, + "step": 814 + }, + { + "epoch": 2.1007101355713362, + "grad_norm": 0.5872031423934213, + "learning_rate": 1.4351606985532338e-05, + "loss": 0.9546, + "step": 815 + }, + { + "epoch": 2.103292446739832, + "grad_norm": 0.5875266718802965, + "learning_rate": 1.4335353259756199e-05, + "loss": 0.9739, + "step": 816 + }, + { + "epoch": 2.105874757908328, + "grad_norm": 0.5834229896061526, + "learning_rate": 1.4319085417494885e-05, + "loss": 0.936, + "step": 817 + }, + { + "epoch": 2.108457069076824, + "grad_norm": 0.5740341555688057, + "learning_rate": 1.430280351171864e-05, + "loss": 0.9295, + "step": 818 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.6028061663862296, + "learning_rate": 1.4286507595443527e-05, + "loss": 0.9475, + "step": 819 + }, + { + "epoch": 2.1136216914138153, + "grad_norm": 0.6066376132775557, + "learning_rate": 1.4270197721731192e-05, + "loss": 0.9748, + "step": 820 + }, + { + "epoch": 2.116204002582311, + "grad_norm": 0.6009913417618149, + "learning_rate": 1.4253873943688751e-05, + "loss": 0.9599, + "step": 821 + }, + { + "epoch": 2.118786313750807, + "grad_norm": 0.5983886456577467, + "learning_rate": 1.4237536314468602e-05, + "loss": 0.9594, + "step": 822 + }, + { + "epoch": 2.121368624919303, + "grad_norm": 0.617011626576933, + "learning_rate": 1.4221184887268218e-05, + "loss": 0.9498, + "step": 823 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6005132003701584, + "learning_rate": 1.4204819715330026e-05, + "loss": 0.9503, + "step": 824 + }, + { + "epoch": 2.1265332472562943, + "grad_norm": 0.5741558367115511, + "learning_rate": 1.4188440851941185e-05, + "loss": 0.9587, + "step": 825 + }, + { + "epoch": 2.12911555842479, + "grad_norm": 0.6062156734819026, + "learning_rate": 1.4172048350433457e-05, + "loss": 0.969, + "step": 826 + }, + { + "epoch": 2.131697869593286, + "grad_norm": 0.6128646943053142, + "learning_rate": 1.4155642264182992e-05, + "loss": 0.9534, + "step": 827 + }, + { + "epoch": 2.134280180761782, + "grad_norm": 0.5828534204572827, + "learning_rate": 1.4139222646610185e-05, + "loss": 0.9388, + "step": 828 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.6060884386262935, + "learning_rate": 1.4122789551179495e-05, + "loss": 0.9884, + "step": 829 + }, + { + "epoch": 2.1394448030987734, + "grad_norm": 0.6061173547442686, + "learning_rate": 1.4106343031399252e-05, + "loss": 0.924, + "step": 830 + }, + { + "epoch": 2.142027114267269, + "grad_norm": 0.5851413898430766, + "learning_rate": 1.408988314082151e-05, + "loss": 0.9455, + "step": 831 + }, + { + "epoch": 2.144609425435765, + "grad_norm": 0.588388475305726, + "learning_rate": 1.4073409933041853e-05, + "loss": 0.9337, + "step": 832 + }, + { + "epoch": 2.147191736604261, + "grad_norm": 0.5838096533852828, + "learning_rate": 1.4056923461699232e-05, + "loss": 0.9392, + "step": 833 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.5997141349811622, + "learning_rate": 1.4040423780475787e-05, + "loss": 0.9593, + "step": 834 + }, + { + "epoch": 2.1523563589412524, + "grad_norm": 0.6020566174282612, + "learning_rate": 1.4023910943096662e-05, + "loss": 0.9616, + "step": 835 + }, + { + "epoch": 2.154938670109748, + "grad_norm": 0.6073751111196977, + "learning_rate": 1.4007385003329847e-05, + "loss": 0.9804, + "step": 836 + }, + { + "epoch": 2.157520981278244, + "grad_norm": 0.640691713500995, + "learning_rate": 1.3990846014985997e-05, + "loss": 0.9525, + "step": 837 + }, + { + "epoch": 2.16010329244674, + "grad_norm": 0.5749963474745784, + "learning_rate": 1.397429403191825e-05, + "loss": 0.9753, + "step": 838 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6017911845722985, + "learning_rate": 1.3957729108022057e-05, + "loss": 0.9698, + "step": 839 + }, + { + "epoch": 2.1652679147837315, + "grad_norm": 0.598963393328458, + "learning_rate": 1.3941151297235007e-05, + "loss": 0.9828, + "step": 840 + }, + { + "epoch": 2.167850225952227, + "grad_norm": 0.5829192765375827, + "learning_rate": 1.3924560653536652e-05, + "loss": 0.9399, + "step": 841 + }, + { + "epoch": 2.170432537120723, + "grad_norm": 0.5736202743026629, + "learning_rate": 1.3907957230948328e-05, + "loss": 0.9414, + "step": 842 + }, + { + "epoch": 2.1730148482892186, + "grad_norm": 0.6048383091141705, + "learning_rate": 1.3891341083532979e-05, + "loss": 0.93, + "step": 843 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6058814179639644, + "learning_rate": 1.3874712265394984e-05, + "loss": 0.9625, + "step": 844 + }, + { + "epoch": 2.1781794706262105, + "grad_norm": 0.6048568085747608, + "learning_rate": 1.3858070830679987e-05, + "loss": 0.9325, + "step": 845 + }, + { + "epoch": 2.1807617817947063, + "grad_norm": 0.6166529166864086, + "learning_rate": 1.3841416833574696e-05, + "loss": 0.9991, + "step": 846 + }, + { + "epoch": 2.183344092963202, + "grad_norm": 0.5836884801008753, + "learning_rate": 1.3824750328306747e-05, + "loss": 0.9567, + "step": 847 + }, + { + "epoch": 2.1859264041316977, + "grad_norm": 0.5952429990454414, + "learning_rate": 1.3808071369144476e-05, + "loss": 0.9244, + "step": 848 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.5857084084921026, + "learning_rate": 1.37913800103968e-05, + "loss": 0.9655, + "step": 849 + }, + { + "epoch": 2.1910910264686896, + "grad_norm": 0.6279175507836195, + "learning_rate": 1.3774676306412986e-05, + "loss": 0.9323, + "step": 850 + }, + { + "epoch": 2.1936733376371853, + "grad_norm": 0.5863735033805826, + "learning_rate": 1.3757960311582518e-05, + "loss": 0.961, + "step": 851 + }, + { + "epoch": 2.196255648805681, + "grad_norm": 0.5793193685107874, + "learning_rate": 1.3741232080334889e-05, + "loss": 0.9417, + "step": 852 + }, + { + "epoch": 2.1988379599741767, + "grad_norm": 0.5779370435007501, + "learning_rate": 1.3724491667139437e-05, + "loss": 0.9543, + "step": 853 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.5898150549054328, + "learning_rate": 1.3707739126505168e-05, + "loss": 0.9751, + "step": 854 + }, + { + "epoch": 2.2040025823111686, + "grad_norm": 0.5932733176039338, + "learning_rate": 1.3690974512980577e-05, + "loss": 0.9453, + "step": 855 + }, + { + "epoch": 2.2065848934796644, + "grad_norm": 0.5905918422617804, + "learning_rate": 1.3674197881153468e-05, + "loss": 0.9361, + "step": 856 + }, + { + "epoch": 2.20916720464816, + "grad_norm": 0.594528411021171, + "learning_rate": 1.365740928565078e-05, + "loss": 0.9781, + "step": 857 + }, + { + "epoch": 2.211749515816656, + "grad_norm": 0.5872952131266409, + "learning_rate": 1.3640608781138407e-05, + "loss": 0.9479, + "step": 858 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5929574963797165, + "learning_rate": 1.3623796422321018e-05, + "loss": 0.9488, + "step": 859 + }, + { + "epoch": 2.2169141381536477, + "grad_norm": 0.6100088602969217, + "learning_rate": 1.3606972263941884e-05, + "loss": 0.93, + "step": 860 + }, + { + "epoch": 2.2194964493221434, + "grad_norm": 0.5757485969334069, + "learning_rate": 1.3590136360782697e-05, + "loss": 0.9167, + "step": 861 + }, + { + "epoch": 2.222078760490639, + "grad_norm": 0.5880881759424176, + "learning_rate": 1.3573288767663388e-05, + "loss": 0.9831, + "step": 862 + }, + { + "epoch": 2.224661071659135, + "grad_norm": 0.6101438672240849, + "learning_rate": 1.3556429539441957e-05, + "loss": 0.9425, + "step": 863 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.6032144416691072, + "learning_rate": 1.3539558731014285e-05, + "loss": 0.956, + "step": 864 + }, + { + "epoch": 2.2298256939961267, + "grad_norm": 0.5877358574038184, + "learning_rate": 1.3522676397313963e-05, + "loss": 0.9769, + "step": 865 + }, + { + "epoch": 2.2324080051646225, + "grad_norm": 0.6037905375839121, + "learning_rate": 1.3505782593312108e-05, + "loss": 0.9577, + "step": 866 + }, + { + "epoch": 2.234990316333118, + "grad_norm": 0.5826777668673346, + "learning_rate": 1.3488877374017189e-05, + "loss": 0.9514, + "step": 867 + }, + { + "epoch": 2.237572627501614, + "grad_norm": 0.591593499195398, + "learning_rate": 1.3471960794474837e-05, + "loss": 0.9563, + "step": 868 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.5972872893141782, + "learning_rate": 1.345503290976768e-05, + "loss": 0.9646, + "step": 869 + }, + { + "epoch": 2.242737249838606, + "grad_norm": 0.5695814980462333, + "learning_rate": 1.3438093775015157e-05, + "loss": 0.9295, + "step": 870 + }, + { + "epoch": 2.2453195610071015, + "grad_norm": 0.5950572680113415, + "learning_rate": 1.342114344537334e-05, + "loss": 0.9378, + "step": 871 + }, + { + "epoch": 2.2479018721755972, + "grad_norm": 0.645911801845914, + "learning_rate": 1.3404181976034743e-05, + "loss": 0.9889, + "step": 872 + }, + { + "epoch": 2.250484183344093, + "grad_norm": 0.5891952037473503, + "learning_rate": 1.3387209422228164e-05, + "loss": 0.9257, + "step": 873 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.6101696680348054, + "learning_rate": 1.3370225839218494e-05, + "loss": 0.9387, + "step": 874 + }, + { + "epoch": 2.2556488056810844, + "grad_norm": 0.6235755995527572, + "learning_rate": 1.3353231282306521e-05, + "loss": 0.9699, + "step": 875 + }, + { + "epoch": 2.2582311168495806, + "grad_norm": 0.6032240561162692, + "learning_rate": 1.3336225806828782e-05, + "loss": 0.9256, + "step": 876 + }, + { + "epoch": 2.2608134280180763, + "grad_norm": 0.6200539436633388, + "learning_rate": 1.3319209468157362e-05, + "loss": 0.977, + "step": 877 + }, + { + "epoch": 2.263395739186572, + "grad_norm": 0.6357789919117319, + "learning_rate": 1.3302182321699712e-05, + "loss": 0.9589, + "step": 878 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6102482086269118, + "learning_rate": 1.3285144422898486e-05, + "loss": 0.9595, + "step": 879 + }, + { + "epoch": 2.2685603615235634, + "grad_norm": 0.6310634925304537, + "learning_rate": 1.3268095827231333e-05, + "loss": 0.9406, + "step": 880 + }, + { + "epoch": 2.2711426726920596, + "grad_norm": 0.6196741175987706, + "learning_rate": 1.3251036590210751e-05, + "loss": 0.9623, + "step": 881 + }, + { + "epoch": 2.2737249838605553, + "grad_norm": 0.5865418451174635, + "learning_rate": 1.323396676738387e-05, + "loss": 0.9618, + "step": 882 + }, + { + "epoch": 2.276307295029051, + "grad_norm": 0.5886489491664807, + "learning_rate": 1.3216886414332304e-05, + "loss": 0.9654, + "step": 883 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6269313692986308, + "learning_rate": 1.319979558667194e-05, + "loss": 0.9648, + "step": 884 + }, + { + "epoch": 2.2814719173660425, + "grad_norm": 0.5950331112803471, + "learning_rate": 1.3182694340052785e-05, + "loss": 1.0065, + "step": 885 + }, + { + "epoch": 2.284054228534538, + "grad_norm": 0.5868804806129319, + "learning_rate": 1.3165582730158764e-05, + "loss": 0.9425, + "step": 886 + }, + { + "epoch": 2.2866365397030344, + "grad_norm": 0.585709126958065, + "learning_rate": 1.3148460812707549e-05, + "loss": 0.9866, + "step": 887 + }, + { + "epoch": 2.28921885087153, + "grad_norm": 0.5943971591153827, + "learning_rate": 1.3131328643450373e-05, + "loss": 0.928, + "step": 888 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.6011485207920195, + "learning_rate": 1.3114186278171855e-05, + "loss": 0.9471, + "step": 889 + }, + { + "epoch": 2.2943834732085215, + "grad_norm": 0.6202130154424499, + "learning_rate": 1.3097033772689804e-05, + "loss": 0.9555, + "step": 890 + }, + { + "epoch": 2.2969657843770173, + "grad_norm": 0.601191279942045, + "learning_rate": 1.3079871182855056e-05, + "loss": 0.9763, + "step": 891 + }, + { + "epoch": 2.2995480955455134, + "grad_norm": 0.6091424415493963, + "learning_rate": 1.3062698564551277e-05, + "loss": 0.9564, + "step": 892 + }, + { + "epoch": 2.302130406714009, + "grad_norm": 0.6322044545300952, + "learning_rate": 1.3045515973694793e-05, + "loss": 0.9621, + "step": 893 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.593976781762648, + "learning_rate": 1.3028323466234398e-05, + "loss": 0.9352, + "step": 894 + }, + { + "epoch": 2.3072950290510006, + "grad_norm": 0.6093135390414695, + "learning_rate": 1.3011121098151177e-05, + "loss": 0.9444, + "step": 895 + }, + { + "epoch": 2.3098773402194963, + "grad_norm": 0.6081280945984243, + "learning_rate": 1.2993908925458318e-05, + "loss": 0.9019, + "step": 896 + }, + { + "epoch": 2.312459651387992, + "grad_norm": 0.5965625320422764, + "learning_rate": 1.2976687004200941e-05, + "loss": 0.9504, + "step": 897 + }, + { + "epoch": 2.315041962556488, + "grad_norm": 0.6136358258415586, + "learning_rate": 1.2959455390455906e-05, + "loss": 0.9598, + "step": 898 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.614066787514822, + "learning_rate": 1.294221414033163e-05, + "loss": 0.9151, + "step": 899 + }, + { + "epoch": 2.3202065848934796, + "grad_norm": 0.595393393778215, + "learning_rate": 1.2924963309967914e-05, + "loss": 0.9383, + "step": 900 + }, + { + "epoch": 2.3227888960619754, + "grad_norm": 0.6123276452590078, + "learning_rate": 1.2907702955535744e-05, + "loss": 0.9449, + "step": 901 + }, + { + "epoch": 2.325371207230471, + "grad_norm": 0.6002189347008143, + "learning_rate": 1.2890433133237129e-05, + "loss": 0.9648, + "step": 902 + }, + { + "epoch": 2.3279535183989672, + "grad_norm": 0.5948640736384636, + "learning_rate": 1.2873153899304898e-05, + "loss": 0.9654, + "step": 903 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.6253192331451701, + "learning_rate": 1.2855865310002526e-05, + "loss": 0.9459, + "step": 904 + }, + { + "epoch": 2.3331181407359587, + "grad_norm": 0.6060085962717341, + "learning_rate": 1.2838567421623957e-05, + "loss": 0.9648, + "step": 905 + }, + { + "epoch": 2.3357004519044544, + "grad_norm": 0.5909129536256885, + "learning_rate": 1.2821260290493411e-05, + "loss": 0.9615, + "step": 906 + }, + { + "epoch": 2.33828276307295, + "grad_norm": 0.6033489652168267, + "learning_rate": 1.2803943972965193e-05, + "loss": 0.9822, + "step": 907 + }, + { + "epoch": 2.340865074241446, + "grad_norm": 0.6471948077451358, + "learning_rate": 1.278661852542354e-05, + "loss": 0.9372, + "step": 908 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.5875321400886871, + "learning_rate": 1.2769284004282398e-05, + "loss": 0.9283, + "step": 909 + }, + { + "epoch": 2.3460296965784377, + "grad_norm": 0.57397296055963, + "learning_rate": 1.2751940465985273e-05, + "loss": 0.9443, + "step": 910 + }, + { + "epoch": 2.3486120077469335, + "grad_norm": 0.6083870147758043, + "learning_rate": 1.2734587967005025e-05, + "loss": 0.9911, + "step": 911 + }, + { + "epoch": 2.351194318915429, + "grad_norm": 0.5893684173951856, + "learning_rate": 1.2717226563843687e-05, + "loss": 0.9775, + "step": 912 + }, + { + "epoch": 2.3537766300839253, + "grad_norm": 0.6098963204737635, + "learning_rate": 1.26998563130323e-05, + "loss": 0.9352, + "step": 913 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.6028323564667681, + "learning_rate": 1.268247727113069e-05, + "loss": 0.9535, + "step": 914 + }, + { + "epoch": 2.358941252420917, + "grad_norm": 0.6139836763290958, + "learning_rate": 1.2665089494727338e-05, + "loss": 0.9543, + "step": 915 + }, + { + "epoch": 2.3615235635894125, + "grad_norm": 0.5979010266216653, + "learning_rate": 1.2647693040439142e-05, + "loss": 0.9584, + "step": 916 + }, + { + "epoch": 2.3641058747579082, + "grad_norm": 0.6035572479241811, + "learning_rate": 1.2630287964911261e-05, + "loss": 0.958, + "step": 917 + }, + { + "epoch": 2.366688185926404, + "grad_norm": 0.5830490108904467, + "learning_rate": 1.2612874324816935e-05, + "loss": 0.9492, + "step": 918 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6049407013095448, + "learning_rate": 1.2595452176857283e-05, + "loss": 0.9215, + "step": 919 + }, + { + "epoch": 2.371852808263396, + "grad_norm": 0.5853837977544576, + "learning_rate": 1.2578021577761132e-05, + "loss": 0.9397, + "step": 920 + }, + { + "epoch": 2.3744351194318916, + "grad_norm": 0.6270536757002744, + "learning_rate": 1.2560582584284822e-05, + "loss": 0.9817, + "step": 921 + }, + { + "epoch": 2.3770174306003873, + "grad_norm": 0.6353840335416789, + "learning_rate": 1.2543135253212027e-05, + "loss": 0.9559, + "step": 922 + }, + { + "epoch": 2.379599741768883, + "grad_norm": 0.585963379760864, + "learning_rate": 1.2525679641353571e-05, + "loss": 0.9453, + "step": 923 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.5894211852291655, + "learning_rate": 1.2508215805547246e-05, + "loss": 0.9251, + "step": 924 + }, + { + "epoch": 2.384764364105875, + "grad_norm": 0.572359971184135, + "learning_rate": 1.2490743802657614e-05, + "loss": 0.9564, + "step": 925 + }, + { + "epoch": 2.3873466752743706, + "grad_norm": 0.5786820742067271, + "learning_rate": 1.2473263689575835e-05, + "loss": 0.9291, + "step": 926 + }, + { + "epoch": 2.3899289864428663, + "grad_norm": 0.590281816939995, + "learning_rate": 1.2455775523219472e-05, + "loss": 0.9248, + "step": 927 + }, + { + "epoch": 2.392511297611362, + "grad_norm": 0.5851632591490395, + "learning_rate": 1.2438279360532317e-05, + "loss": 0.9558, + "step": 928 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.6073264082842632, + "learning_rate": 1.2420775258484194e-05, + "loss": 0.9152, + "step": 929 + }, + { + "epoch": 2.397675919948354, + "grad_norm": 0.5961742171533062, + "learning_rate": 1.2403263274070786e-05, + "loss": 0.9614, + "step": 930 + }, + { + "epoch": 2.4002582311168497, + "grad_norm": 0.6094671056323115, + "learning_rate": 1.238574346431343e-05, + "loss": 0.9478, + "step": 931 + }, + { + "epoch": 2.4028405422853454, + "grad_norm": 0.6141679641874132, + "learning_rate": 1.2368215886258952e-05, + "loss": 0.9588, + "step": 932 + }, + { + "epoch": 2.405422853453841, + "grad_norm": 0.5891888079618772, + "learning_rate": 1.2350680596979474e-05, + "loss": 0.9748, + "step": 933 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.6220233686120056, + "learning_rate": 1.233313765357222e-05, + "loss": 0.9547, + "step": 934 + }, + { + "epoch": 2.410587475790833, + "grad_norm": 0.6111194107579635, + "learning_rate": 1.2315587113159342e-05, + "loss": 0.9374, + "step": 935 + }, + { + "epoch": 2.4131697869593287, + "grad_norm": 0.5816987304224244, + "learning_rate": 1.2298029032887725e-05, + "loss": 0.9611, + "step": 936 + }, + { + "epoch": 2.4157520981278244, + "grad_norm": 0.6103256535275182, + "learning_rate": 1.228046346992881e-05, + "loss": 0.9388, + "step": 937 + }, + { + "epoch": 2.41833440929632, + "grad_norm": 0.5861255113568193, + "learning_rate": 1.22628904814784e-05, + "loss": 0.9582, + "step": 938 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.5823515712678948, + "learning_rate": 1.224531012475647e-05, + "loss": 0.9898, + "step": 939 + }, + { + "epoch": 2.4234990316333116, + "grad_norm": 0.609671663934881, + "learning_rate": 1.2227722457007e-05, + "loss": 0.9596, + "step": 940 + }, + { + "epoch": 2.4260813428018078, + "grad_norm": 0.5972948161325082, + "learning_rate": 1.221012753549776e-05, + "loss": 0.9955, + "step": 941 + }, + { + "epoch": 2.4286636539703035, + "grad_norm": 0.5879925977172995, + "learning_rate": 1.2192525417520159e-05, + "loss": 0.9615, + "step": 942 + }, + { + "epoch": 2.431245965138799, + "grad_norm": 0.6075551488590047, + "learning_rate": 1.2174916160389024e-05, + "loss": 0.9572, + "step": 943 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6113872256428539, + "learning_rate": 1.2157299821442424e-05, + "loss": 0.9671, + "step": 944 + }, + { + "epoch": 2.4364105874757906, + "grad_norm": 0.5838911691926075, + "learning_rate": 1.2139676458041505e-05, + "loss": 0.9352, + "step": 945 + }, + { + "epoch": 2.438992898644287, + "grad_norm": 0.604879771295695, + "learning_rate": 1.2122046127570268e-05, + "loss": 0.9541, + "step": 946 + }, + { + "epoch": 2.4415752098127825, + "grad_norm": 0.6008885632399309, + "learning_rate": 1.2104408887435413e-05, + "loss": 0.9633, + "step": 947 + }, + { + "epoch": 2.4441575209812783, + "grad_norm": 0.5834385132140035, + "learning_rate": 1.2086764795066128e-05, + "loss": 0.9455, + "step": 948 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.6092567677261247, + "learning_rate": 1.2069113907913921e-05, + "loss": 0.9564, + "step": 949 + }, + { + "epoch": 2.4493221433182697, + "grad_norm": 0.5650318694461209, + "learning_rate": 1.2051456283452423e-05, + "loss": 0.97, + "step": 950 + }, + { + "epoch": 2.4519044544867654, + "grad_norm": 0.608288549791379, + "learning_rate": 1.2033791979177196e-05, + "loss": 0.9628, + "step": 951 + }, + { + "epoch": 2.4544867656552616, + "grad_norm": 0.6033407862962766, + "learning_rate": 1.2016121052605558e-05, + "loss": 0.9565, + "step": 952 + }, + { + "epoch": 2.4570690768237573, + "grad_norm": 0.6028336342782669, + "learning_rate": 1.1998443561276395e-05, + "loss": 0.9829, + "step": 953 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.584653200324165, + "learning_rate": 1.1980759562749957e-05, + "loss": 0.9566, + "step": 954 + }, + { + "epoch": 2.4622336991607487, + "grad_norm": 0.6030118438156815, + "learning_rate": 1.1963069114607692e-05, + "loss": 0.9306, + "step": 955 + }, + { + "epoch": 2.464816010329245, + "grad_norm": 0.598687643898121, + "learning_rate": 1.1945372274452045e-05, + "loss": 0.9717, + "step": 956 + }, + { + "epoch": 2.4673983214977406, + "grad_norm": 0.6007026870754814, + "learning_rate": 1.1927669099906274e-05, + "loss": 0.9483, + "step": 957 + }, + { + "epoch": 2.4699806326662364, + "grad_norm": 0.5841035235550123, + "learning_rate": 1.1909959648614262e-05, + "loss": 0.9888, + "step": 958 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.5950731809881308, + "learning_rate": 1.1892243978240332e-05, + "loss": 0.9442, + "step": 959 + }, + { + "epoch": 2.475145255003228, + "grad_norm": 0.6073950825590259, + "learning_rate": 1.1874522146469056e-05, + "loss": 0.9607, + "step": 960 + }, + { + "epoch": 2.4777275661717235, + "grad_norm": 0.5917705341695404, + "learning_rate": 1.1856794211005069e-05, + "loss": 0.9288, + "step": 961 + }, + { + "epoch": 2.4803098773402197, + "grad_norm": 0.5839083243509722, + "learning_rate": 1.183906022957288e-05, + "loss": 0.9676, + "step": 962 + }, + { + "epoch": 2.4828921885087154, + "grad_norm": 0.597493851436026, + "learning_rate": 1.182132025991669e-05, + "loss": 0.9598, + "step": 963 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.5765315620556862, + "learning_rate": 1.1803574359800179e-05, + "loss": 0.9744, + "step": 964 + }, + { + "epoch": 2.488056810845707, + "grad_norm": 0.5912961838573095, + "learning_rate": 1.1785822587006362e-05, + "loss": 0.9847, + "step": 965 + }, + { + "epoch": 2.4906391220142026, + "grad_norm": 0.5816475691671312, + "learning_rate": 1.1768064999337364e-05, + "loss": 0.9411, + "step": 966 + }, + { + "epoch": 2.4932214331826987, + "grad_norm": 0.5846058314378276, + "learning_rate": 1.1750301654614242e-05, + "loss": 0.9693, + "step": 967 + }, + { + "epoch": 2.4958037443511945, + "grad_norm": 0.5830471307870174, + "learning_rate": 1.1732532610676808e-05, + "loss": 0.9354, + "step": 968 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.5836564756949956, + "learning_rate": 1.1714757925383418e-05, + "loss": 0.9617, + "step": 969 + }, + { + "epoch": 2.500968366688186, + "grad_norm": 0.604259375943389, + "learning_rate": 1.1696977656610813e-05, + "loss": 0.9519, + "step": 970 + }, + { + "epoch": 2.5035506778566816, + "grad_norm": 0.5742797021433684, + "learning_rate": 1.1679191862253898e-05, + "loss": 0.9547, + "step": 971 + }, + { + "epoch": 2.5061329890251773, + "grad_norm": 0.5746553555926329, + "learning_rate": 1.1661400600225588e-05, + "loss": 0.9564, + "step": 972 + }, + { + "epoch": 2.5087153001936735, + "grad_norm": 0.5956191491364381, + "learning_rate": 1.1643603928456581e-05, + "loss": 0.9315, + "step": 973 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.5972863649697912, + "learning_rate": 1.1625801904895207e-05, + "loss": 0.9828, + "step": 974 + }, + { + "epoch": 2.513879922530665, + "grad_norm": 0.628215407427667, + "learning_rate": 1.1607994587507216e-05, + "loss": 0.9791, + "step": 975 + }, + { + "epoch": 2.5164622336991607, + "grad_norm": 0.5794209025299315, + "learning_rate": 1.1590182034275588e-05, + "loss": 0.9765, + "step": 976 + }, + { + "epoch": 2.5190445448676564, + "grad_norm": 0.5995540341976862, + "learning_rate": 1.157236430320037e-05, + "loss": 0.9425, + "step": 977 + }, + { + "epoch": 2.5216268560361526, + "grad_norm": 0.589687726091925, + "learning_rate": 1.155454145229845e-05, + "loss": 0.9269, + "step": 978 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.5903300924561746, + "learning_rate": 1.1536713539603392e-05, + "loss": 0.9515, + "step": 979 + }, + { + "epoch": 2.526791478373144, + "grad_norm": 0.5926698140037857, + "learning_rate": 1.1518880623165249e-05, + "loss": 0.9613, + "step": 980 + }, + { + "epoch": 2.5293737895416397, + "grad_norm": 0.5882141138215461, + "learning_rate": 1.1501042761050359e-05, + "loss": 0.9646, + "step": 981 + }, + { + "epoch": 2.5319561007101354, + "grad_norm": 0.5907114287701524, + "learning_rate": 1.1483200011341172e-05, + "loss": 0.9502, + "step": 982 + }, + { + "epoch": 2.534538411878631, + "grad_norm": 0.5796881924318279, + "learning_rate": 1.1465352432136041e-05, + "loss": 0.9337, + "step": 983 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.5738929122712656, + "learning_rate": 1.1447500081549054e-05, + "loss": 0.9405, + "step": 984 + }, + { + "epoch": 2.539703034215623, + "grad_norm": 0.580328116392153, + "learning_rate": 1.1429643017709833e-05, + "loss": 0.9539, + "step": 985 + }, + { + "epoch": 2.5422853453841188, + "grad_norm": 0.5881438247765939, + "learning_rate": 1.1411781298763343e-05, + "loss": 0.9313, + "step": 986 + }, + { + "epoch": 2.5448676565526145, + "grad_norm": 0.5885562040781032, + "learning_rate": 1.1393914982869711e-05, + "loss": 0.9425, + "step": 987 + }, + { + "epoch": 2.5474499677211107, + "grad_norm": 0.580594597098575, + "learning_rate": 1.1376044128204033e-05, + "loss": 0.9391, + "step": 988 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.5952429990647207, + "learning_rate": 1.1358168792956178e-05, + "loss": 0.9504, + "step": 989 + }, + { + "epoch": 2.552614590058102, + "grad_norm": 0.5970710014238076, + "learning_rate": 1.1340289035330614e-05, + "loss": 0.9878, + "step": 990 + }, + { + "epoch": 2.555196901226598, + "grad_norm": 0.6152662315238809, + "learning_rate": 1.1322404913546197e-05, + "loss": 0.9465, + "step": 991 + }, + { + "epoch": 2.5577792123950935, + "grad_norm": 0.6027222855714028, + "learning_rate": 1.1304516485836002e-05, + "loss": 0.971, + "step": 992 + }, + { + "epoch": 2.5603615235635893, + "grad_norm": 0.5918016876334783, + "learning_rate": 1.1286623810447122e-05, + "loss": 0.9652, + "step": 993 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.5896636574102831, + "learning_rate": 1.1268726945640483e-05, + "loss": 0.9372, + "step": 994 + }, + { + "epoch": 2.565526145900581, + "grad_norm": 0.5824587400275631, + "learning_rate": 1.125082594969065e-05, + "loss": 0.9529, + "step": 995 + }, + { + "epoch": 2.568108457069077, + "grad_norm": 0.5697833980293927, + "learning_rate": 1.1232920880885632e-05, + "loss": 0.9554, + "step": 996 + }, + { + "epoch": 2.5706907682375726, + "grad_norm": 0.5801306805314953, + "learning_rate": 1.1215011797526716e-05, + "loss": 0.9268, + "step": 997 + }, + { + "epoch": 2.5732730794060683, + "grad_norm": 0.5965814716018379, + "learning_rate": 1.119709875792825e-05, + "loss": 0.962, + "step": 998 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6086711337973163, + "learning_rate": 1.1179181820417469e-05, + "loss": 0.97, + "step": 999 + }, + { + "epoch": 2.57843770174306, + "grad_norm": 0.5785369755423095, + "learning_rate": 1.1161261043334296e-05, + "loss": 0.9495, + "step": 1000 + }, + { + "epoch": 2.581020012911556, + "grad_norm": 0.608023719014441, + "learning_rate": 1.1143336485031156e-05, + "loss": 0.9165, + "step": 1001 + }, + { + "epoch": 2.5836023240800516, + "grad_norm": 0.6320332520260791, + "learning_rate": 1.1125408203872793e-05, + "loss": 1.0028, + "step": 1002 + }, + { + "epoch": 2.5861846352485474, + "grad_norm": 0.5833673102474324, + "learning_rate": 1.1107476258236059e-05, + "loss": 0.942, + "step": 1003 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.5888791372130312, + "learning_rate": 1.1089540706509757e-05, + "loss": 0.9548, + "step": 1004 + }, + { + "epoch": 2.591349257585539, + "grad_norm": 0.5862228558392754, + "learning_rate": 1.1071601607094416e-05, + "loss": 0.9096, + "step": 1005 + }, + { + "epoch": 2.593931568754035, + "grad_norm": 0.6178780038575998, + "learning_rate": 1.1053659018402123e-05, + "loss": 0.9539, + "step": 1006 + }, + { + "epoch": 2.5965138799225307, + "grad_norm": 0.6227035958216502, + "learning_rate": 1.1035712998856332e-05, + "loss": 0.9845, + "step": 1007 + }, + { + "epoch": 2.5990961910910264, + "grad_norm": 0.585793574816453, + "learning_rate": 1.1017763606891653e-05, + "loss": 0.9564, + "step": 1008 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6031850388726575, + "learning_rate": 1.0999810900953701e-05, + "loss": 0.966, + "step": 1009 + }, + { + "epoch": 2.6042608134280183, + "grad_norm": 0.6325995476999388, + "learning_rate": 1.0981854939498853e-05, + "loss": 0.934, + "step": 1010 + }, + { + "epoch": 2.606843124596514, + "grad_norm": 0.604370954178913, + "learning_rate": 1.0963895780994106e-05, + "loss": 0.962, + "step": 1011 + }, + { + "epoch": 2.6094254357650097, + "grad_norm": 0.6046507204858135, + "learning_rate": 1.0945933483916867e-05, + "loss": 0.9628, + "step": 1012 + }, + { + "epoch": 2.6120077469335055, + "grad_norm": 0.6055958607582257, + "learning_rate": 1.0927968106754747e-05, + "loss": 0.9724, + "step": 1013 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6142519834748665, + "learning_rate": 1.0909999708005407e-05, + "loss": 0.9859, + "step": 1014 + }, + { + "epoch": 2.617172369270497, + "grad_norm": 0.587585188897923, + "learning_rate": 1.0892028346176333e-05, + "loss": 0.9337, + "step": 1015 + }, + { + "epoch": 2.6197546804389926, + "grad_norm": 0.5775553775383109, + "learning_rate": 1.087405407978466e-05, + "loss": 0.9247, + "step": 1016 + }, + { + "epoch": 2.622336991607489, + "grad_norm": 0.5914086152693361, + "learning_rate": 1.0856076967356983e-05, + "loss": 0.9646, + "step": 1017 + }, + { + "epoch": 2.6249193027759845, + "grad_norm": 0.6252845963452488, + "learning_rate": 1.0838097067429168e-05, + "loss": 0.9783, + "step": 1018 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.5861511527646114, + "learning_rate": 1.0820114438546152e-05, + "loss": 0.9621, + "step": 1019 + }, + { + "epoch": 2.630083925112976, + "grad_norm": 0.5836312295046293, + "learning_rate": 1.080212913926176e-05, + "loss": 0.9554, + "step": 1020 + }, + { + "epoch": 2.632666236281472, + "grad_norm": 0.6040474003003209, + "learning_rate": 1.0784141228138507e-05, + "loss": 0.9516, + "step": 1021 + }, + { + "epoch": 2.635248547449968, + "grad_norm": 0.6355202880988752, + "learning_rate": 1.0766150763747423e-05, + "loss": 0.9789, + "step": 1022 + }, + { + "epoch": 2.6378308586184636, + "grad_norm": 0.5897510462672635, + "learning_rate": 1.0748157804667844e-05, + "loss": 0.9374, + "step": 1023 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.5899429810230572, + "learning_rate": 1.0730162409487233e-05, + "loss": 0.9329, + "step": 1024 + }, + { + "epoch": 2.642995480955455, + "grad_norm": 0.6030569126093994, + "learning_rate": 1.071216463680098e-05, + "loss": 0.9662, + "step": 1025 + }, + { + "epoch": 2.6455777921239507, + "grad_norm": 0.5981188227832869, + "learning_rate": 1.069416454521222e-05, + "loss": 0.9753, + "step": 1026 + }, + { + "epoch": 2.648160103292447, + "grad_norm": 0.618406401340536, + "learning_rate": 1.0676162193331642e-05, + "loss": 0.9729, + "step": 1027 + }, + { + "epoch": 2.6507424144609426, + "grad_norm": 0.5945181324122579, + "learning_rate": 1.0658157639777285e-05, + "loss": 0.9296, + "step": 1028 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.621876814177428, + "learning_rate": 1.0640150943174368e-05, + "loss": 0.9628, + "step": 1029 + }, + { + "epoch": 2.655907036797934, + "grad_norm": 0.5872555607480314, + "learning_rate": 1.0622142162155084e-05, + "loss": 0.9647, + "step": 1030 + }, + { + "epoch": 2.65848934796643, + "grad_norm": 0.6016180713767454, + "learning_rate": 1.060413135535841e-05, + "loss": 0.9489, + "step": 1031 + }, + { + "epoch": 2.661071659134926, + "grad_norm": 0.5963657410420156, + "learning_rate": 1.0586118581429923e-05, + "loss": 0.9476, + "step": 1032 + }, + { + "epoch": 2.6636539703034217, + "grad_norm": 0.5814763983307615, + "learning_rate": 1.05681038990216e-05, + "loss": 0.9463, + "step": 1033 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5725192948619975, + "learning_rate": 1.0550087366791641e-05, + "loss": 0.9804, + "step": 1034 + }, + { + "epoch": 2.668818592640413, + "grad_norm": 0.5916916107783017, + "learning_rate": 1.053206904340426e-05, + "loss": 0.9629, + "step": 1035 + }, + { + "epoch": 2.671400903808909, + "grad_norm": 0.5904165915891584, + "learning_rate": 1.0514048987529515e-05, + "loss": 0.9579, + "step": 1036 + }, + { + "epoch": 2.6739832149774045, + "grad_norm": 0.5914405056148352, + "learning_rate": 1.0496027257843088e-05, + "loss": 0.9807, + "step": 1037 + }, + { + "epoch": 2.6765655261459007, + "grad_norm": 0.5846745644240308, + "learning_rate": 1.0478003913026125e-05, + "loss": 0.9679, + "step": 1038 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6002766375251781, + "learning_rate": 1.045997901176503e-05, + "loss": 0.971, + "step": 1039 + }, + { + "epoch": 2.681730148482892, + "grad_norm": 0.5847650891279706, + "learning_rate": 1.0441952612751267e-05, + "loss": 0.9627, + "step": 1040 + }, + { + "epoch": 2.684312459651388, + "grad_norm": 0.6040931012169604, + "learning_rate": 1.0423924774681186e-05, + "loss": 0.9503, + "step": 1041 + }, + { + "epoch": 2.686894770819884, + "grad_norm": 0.5785542819032363, + "learning_rate": 1.0405895556255818e-05, + "loss": 0.9559, + "step": 1042 + }, + { + "epoch": 2.6894770819883798, + "grad_norm": 0.6052229883487668, + "learning_rate": 1.0387865016180688e-05, + "loss": 0.9622, + "step": 1043 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.5848263105245827, + "learning_rate": 1.0369833213165625e-05, + "loss": 0.9598, + "step": 1044 + }, + { + "epoch": 2.694641704325371, + "grad_norm": 0.5926309991366325, + "learning_rate": 1.035180020592457e-05, + "loss": 0.9372, + "step": 1045 + }, + { + "epoch": 2.697224015493867, + "grad_norm": 0.5844049554145337, + "learning_rate": 1.0333766053175391e-05, + "loss": 0.9439, + "step": 1046 + }, + { + "epoch": 2.6998063266623626, + "grad_norm": 0.6001743480120659, + "learning_rate": 1.031573081363968e-05, + "loss": 0.9346, + "step": 1047 + }, + { + "epoch": 2.7023886378308584, + "grad_norm": 0.5897380533051093, + "learning_rate": 1.0297694546042563e-05, + "loss": 0.9604, + "step": 1048 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.584956431101729, + "learning_rate": 1.0279657309112526e-05, + "loss": 0.9045, + "step": 1049 + }, + { + "epoch": 2.7075532601678503, + "grad_norm": 0.5712935010828868, + "learning_rate": 1.02616191615812e-05, + "loss": 0.9466, + "step": 1050 + }, + { + "epoch": 2.710135571336346, + "grad_norm": 0.583381386123002, + "learning_rate": 1.0243580162183189e-05, + "loss": 0.9838, + "step": 1051 + }, + { + "epoch": 2.7127178825048417, + "grad_norm": 0.5846652612272821, + "learning_rate": 1.0225540369655866e-05, + "loss": 0.9751, + "step": 1052 + }, + { + "epoch": 2.715300193673338, + "grad_norm": 0.5978067742385131, + "learning_rate": 1.0207499842739185e-05, + "loss": 0.9625, + "step": 1053 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.5853977002645502, + "learning_rate": 1.01894586401755e-05, + "loss": 0.9614, + "step": 1054 + }, + { + "epoch": 2.7204648160103293, + "grad_norm": 0.5983002966741684, + "learning_rate": 1.0171416820709356e-05, + "loss": 0.9373, + "step": 1055 + }, + { + "epoch": 2.723047127178825, + "grad_norm": 0.5856993759606652, + "learning_rate": 1.015337444308731e-05, + "loss": 0.9489, + "step": 1056 + }, + { + "epoch": 2.7256294383473207, + "grad_norm": 0.5901281403453162, + "learning_rate": 1.0135331566057735e-05, + "loss": 0.9332, + "step": 1057 + }, + { + "epoch": 2.7282117495158165, + "grad_norm": 0.5906660579573058, + "learning_rate": 1.0117288248370636e-05, + "loss": 0.9609, + "step": 1058 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.6062946865104221, + "learning_rate": 1.0099244548777444e-05, + "loss": 0.9372, + "step": 1059 + }, + { + "epoch": 2.7333763718528084, + "grad_norm": 0.6025103390237757, + "learning_rate": 1.008120052603084e-05, + "loss": 0.9325, + "step": 1060 + }, + { + "epoch": 2.735958683021304, + "grad_norm": 0.6037740140636985, + "learning_rate": 1.006315623888455e-05, + "loss": 0.9407, + "step": 1061 + }, + { + "epoch": 2.7385409941898, + "grad_norm": 0.5818379563267816, + "learning_rate": 1.0045111746093174e-05, + "loss": 0.9565, + "step": 1062 + }, + { + "epoch": 2.7411233053582955, + "grad_norm": 0.5972098469584126, + "learning_rate": 1.0027067106411969e-05, + "loss": 0.9559, + "step": 1063 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.5921309288084705, + "learning_rate": 1.000902237859668e-05, + "loss": 0.9267, + "step": 1064 + }, + { + "epoch": 2.7462879276952874, + "grad_norm": 0.5858852838442818, + "learning_rate": 9.990977621403326e-06, + "loss": 0.9778, + "step": 1065 + }, + { + "epoch": 2.748870238863783, + "grad_norm": 0.5887566802759674, + "learning_rate": 9.972932893588033e-06, + "loss": 0.9054, + "step": 1066 + }, + { + "epoch": 2.751452550032279, + "grad_norm": 0.5706187383084692, + "learning_rate": 9.954888253906827e-06, + "loss": 0.9482, + "step": 1067 + }, + { + "epoch": 2.7540348612007746, + "grad_norm": 0.5737416712225011, + "learning_rate": 9.936843761115448e-06, + "loss": 0.9313, + "step": 1068 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.5618668457848085, + "learning_rate": 9.918799473969162e-06, + "loss": 0.9268, + "step": 1069 + }, + { + "epoch": 2.7591994835377665, + "grad_norm": 0.5945215622528138, + "learning_rate": 9.90075545122256e-06, + "loss": 0.9708, + "step": 1070 + }, + { + "epoch": 2.761781794706262, + "grad_norm": 0.5965929940159351, + "learning_rate": 9.882711751629368e-06, + "loss": 0.9618, + "step": 1071 + }, + { + "epoch": 2.764364105874758, + "grad_norm": 0.6238969650308814, + "learning_rate": 9.864668433942266e-06, + "loss": 0.9206, + "step": 1072 + }, + { + "epoch": 2.7669464170432536, + "grad_norm": 0.561902457075373, + "learning_rate": 9.84662555691269e-06, + "loss": 0.9762, + "step": 1073 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.6148930289646558, + "learning_rate": 9.828583179290645e-06, + "loss": 0.9293, + "step": 1074 + }, + { + "epoch": 2.7721110393802455, + "grad_norm": 0.5816613771287756, + "learning_rate": 9.810541359824501e-06, + "loss": 0.9591, + "step": 1075 + }, + { + "epoch": 2.774693350548741, + "grad_norm": 0.6121639894598173, + "learning_rate": 9.792500157260816e-06, + "loss": 0.9727, + "step": 1076 + }, + { + "epoch": 2.777275661717237, + "grad_norm": 0.5738661064344951, + "learning_rate": 9.774459630344137e-06, + "loss": 0.9067, + "step": 1077 + }, + { + "epoch": 2.7798579728857327, + "grad_norm": 0.5745834880727902, + "learning_rate": 9.756419837816811e-06, + "loss": 0.9283, + "step": 1078 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6019753697435574, + "learning_rate": 9.738380838418804e-06, + "loss": 0.9414, + "step": 1079 + }, + { + "epoch": 2.785022595222724, + "grad_norm": 0.5919495172527766, + "learning_rate": 9.720342690887477e-06, + "loss": 0.9464, + "step": 1080 + }, + { + "epoch": 2.7876049063912203, + "grad_norm": 0.5868873130752621, + "learning_rate": 9.702305453957439e-06, + "loss": 0.9589, + "step": 1081 + }, + { + "epoch": 2.790187217559716, + "grad_norm": 0.5951626550396919, + "learning_rate": 9.684269186360325e-06, + "loss": 0.9559, + "step": 1082 + }, + { + "epoch": 2.7927695287282117, + "grad_norm": 0.6004222716865213, + "learning_rate": 9.666233946824612e-06, + "loss": 0.9812, + "step": 1083 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.5696400324744211, + "learning_rate": 9.648199794075433e-06, + "loss": 0.9503, + "step": 1084 + }, + { + "epoch": 2.7979341510652036, + "grad_norm": 0.5743980783747284, + "learning_rate": 9.630166786834378e-06, + "loss": 0.935, + "step": 1085 + }, + { + "epoch": 2.8005164622336993, + "grad_norm": 0.5751575788089159, + "learning_rate": 9.612134983819316e-06, + "loss": 0.9294, + "step": 1086 + }, + { + "epoch": 2.803098773402195, + "grad_norm": 0.5735290035862011, + "learning_rate": 9.594104443744184e-06, + "loss": 0.9326, + "step": 1087 + }, + { + "epoch": 2.8056810845706908, + "grad_norm": 0.5850179347042352, + "learning_rate": 9.576075225318817e-06, + "loss": 0.9489, + "step": 1088 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.5751681929532767, + "learning_rate": 9.558047387248736e-06, + "loss": 0.933, + "step": 1089 + }, + { + "epoch": 2.810845706907682, + "grad_norm": 0.5767517016594284, + "learning_rate": 9.540020988234972e-06, + "loss": 0.9688, + "step": 1090 + }, + { + "epoch": 2.813428018076178, + "grad_norm": 0.5834104537340995, + "learning_rate": 9.521996086973877e-06, + "loss": 1.0005, + "step": 1091 + }, + { + "epoch": 2.816010329244674, + "grad_norm": 0.6121890401561288, + "learning_rate": 9.503972742156917e-06, + "loss": 0.9683, + "step": 1092 + }, + { + "epoch": 2.81859264041317, + "grad_norm": 0.5780502941320363, + "learning_rate": 9.485951012470491e-06, + "loss": 0.9651, + "step": 1093 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5872895683685423, + "learning_rate": 9.467930956595742e-06, + "loss": 0.9497, + "step": 1094 + }, + { + "epoch": 2.8237572627501613, + "grad_norm": 0.6012252860645063, + "learning_rate": 9.449912633208362e-06, + "loss": 0.962, + "step": 1095 + }, + { + "epoch": 2.8263395739186574, + "grad_norm": 0.5812288173466004, + "learning_rate": 9.431896100978402e-06, + "loss": 0.9516, + "step": 1096 + }, + { + "epoch": 2.828921885087153, + "grad_norm": 0.5903667057899601, + "learning_rate": 9.413881418570082e-06, + "loss": 0.933, + "step": 1097 + }, + { + "epoch": 2.831504196255649, + "grad_norm": 0.574567344834327, + "learning_rate": 9.395868644641594e-06, + "loss": 0.9311, + "step": 1098 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.6029860483322287, + "learning_rate": 9.37785783784492e-06, + "loss": 0.9365, + "step": 1099 + }, + { + "epoch": 2.8366688185926403, + "grad_norm": 0.5885522147718864, + "learning_rate": 9.359849056825632e-06, + "loss": 0.9375, + "step": 1100 + }, + { + "epoch": 2.839251129761136, + "grad_norm": 0.5897382549514845, + "learning_rate": 9.341842360222717e-06, + "loss": 0.9568, + "step": 1101 + }, + { + "epoch": 2.8418334409296317, + "grad_norm": 0.5970771697415443, + "learning_rate": 9.323837806668363e-06, + "loss": 0.9544, + "step": 1102 + }, + { + "epoch": 2.844415752098128, + "grad_norm": 0.5998731951808198, + "learning_rate": 9.305835454787784e-06, + "loss": 0.9668, + "step": 1103 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.5820942005741839, + "learning_rate": 9.287835363199026e-06, + "loss": 0.9552, + "step": 1104 + }, + { + "epoch": 2.8495803744351194, + "grad_norm": 0.6084126869227644, + "learning_rate": 9.269837590512768e-06, + "loss": 0.9628, + "step": 1105 + }, + { + "epoch": 2.852162685603615, + "grad_norm": 0.66554698305709, + "learning_rate": 9.25184219533216e-06, + "loss": 0.9367, + "step": 1106 + }, + { + "epoch": 2.8547449967721112, + "grad_norm": 0.5807306091688449, + "learning_rate": 9.23384923625258e-06, + "loss": 0.9692, + "step": 1107 + }, + { + "epoch": 2.857327307940607, + "grad_norm": 0.5837765762229058, + "learning_rate": 9.215858771861495e-06, + "loss": 0.9355, + "step": 1108 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.6108951996108233, + "learning_rate": 9.197870860738245e-06, + "loss": 0.9618, + "step": 1109 + }, + { + "epoch": 2.8624919302775984, + "grad_norm": 0.6085207165359778, + "learning_rate": 9.17988556145385e-06, + "loss": 0.9749, + "step": 1110 + }, + { + "epoch": 2.865074241446094, + "grad_norm": 0.6014777439249565, + "learning_rate": 9.161902932570837e-06, + "loss": 0.9419, + "step": 1111 + }, + { + "epoch": 2.86765655261459, + "grad_norm": 0.5740295354518736, + "learning_rate": 9.143923032643019e-06, + "loss": 0.9325, + "step": 1112 + }, + { + "epoch": 2.870238863783086, + "grad_norm": 0.5824503091707712, + "learning_rate": 9.125945920215344e-06, + "loss": 0.9624, + "step": 1113 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.5881119183147646, + "learning_rate": 9.10797165382367e-06, + "loss": 0.9604, + "step": 1114 + }, + { + "epoch": 2.8754034861200775, + "grad_norm": 0.591602227679226, + "learning_rate": 9.090000291994596e-06, + "loss": 0.9522, + "step": 1115 + }, + { + "epoch": 2.877985797288573, + "grad_norm": 0.5894398262140761, + "learning_rate": 9.072031893245256e-06, + "loss": 0.9447, + "step": 1116 + }, + { + "epoch": 2.8805681084570693, + "grad_norm": 0.5843901076209989, + "learning_rate": 9.054066516083138e-06, + "loss": 0.9651, + "step": 1117 + }, + { + "epoch": 2.883150419625565, + "grad_norm": 0.5830155470269734, + "learning_rate": 9.036104219005895e-06, + "loss": 0.9391, + "step": 1118 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.5795525849711025, + "learning_rate": 9.018145060501152e-06, + "loss": 0.9522, + "step": 1119 + }, + { + "epoch": 2.8883150419625565, + "grad_norm": 0.5722538427227781, + "learning_rate": 9.000189099046306e-06, + "loss": 0.9652, + "step": 1120 + }, + { + "epoch": 2.8908973531310522, + "grad_norm": 0.5834430509021916, + "learning_rate": 8.982236393108349e-06, + "loss": 0.9573, + "step": 1121 + }, + { + "epoch": 2.893479664299548, + "grad_norm": 0.5834678810709014, + "learning_rate": 8.964287001143672e-06, + "loss": 0.9901, + "step": 1122 + }, + { + "epoch": 2.8960619754680437, + "grad_norm": 0.6148204310068593, + "learning_rate": 8.946340981597879e-06, + "loss": 0.9392, + "step": 1123 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.5823218513706327, + "learning_rate": 8.92839839290559e-06, + "loss": 0.9595, + "step": 1124 + }, + { + "epoch": 2.9012265978050356, + "grad_norm": 0.5970215090631561, + "learning_rate": 8.910459293490248e-06, + "loss": 0.9334, + "step": 1125 + }, + { + "epoch": 2.9038089089735313, + "grad_norm": 0.5998590562400262, + "learning_rate": 8.892523741763945e-06, + "loss": 0.9442, + "step": 1126 + }, + { + "epoch": 2.906391220142027, + "grad_norm": 0.6101328337460503, + "learning_rate": 8.874591796127213e-06, + "loss": 0.9584, + "step": 1127 + }, + { + "epoch": 2.908973531310523, + "grad_norm": 0.6129595613398248, + "learning_rate": 8.856663514968845e-06, + "loss": 0.9524, + "step": 1128 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.603793328877133, + "learning_rate": 8.838738956665709e-06, + "loss": 0.9197, + "step": 1129 + }, + { + "epoch": 2.9141381536475146, + "grad_norm": 0.5909017292382529, + "learning_rate": 8.820818179582533e-06, + "loss": 0.9405, + "step": 1130 + }, + { + "epoch": 2.9167204648160103, + "grad_norm": 0.5831175832584677, + "learning_rate": 8.802901242071751e-06, + "loss": 0.9346, + "step": 1131 + }, + { + "epoch": 2.919302775984506, + "grad_norm": 0.5837762849349555, + "learning_rate": 8.784988202473284e-06, + "loss": 0.9333, + "step": 1132 + }, + { + "epoch": 2.9218850871530018, + "grad_norm": 0.5848385359972617, + "learning_rate": 8.76707911911437e-06, + "loss": 0.9551, + "step": 1133 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.6253939331925262, + "learning_rate": 8.749174050309357e-06, + "loss": 0.9813, + "step": 1134 + }, + { + "epoch": 2.9270497094899937, + "grad_norm": 0.5845874358674058, + "learning_rate": 8.73127305435952e-06, + "loss": 0.9567, + "step": 1135 + }, + { + "epoch": 2.9296320206584894, + "grad_norm": 0.5953343497217751, + "learning_rate": 8.71337618955288e-06, + "loss": 0.9953, + "step": 1136 + }, + { + "epoch": 2.932214331826985, + "grad_norm": 0.6030060364823723, + "learning_rate": 8.695483514163998e-06, + "loss": 0.9455, + "step": 1137 + }, + { + "epoch": 2.934796642995481, + "grad_norm": 0.5854700296562423, + "learning_rate": 8.677595086453808e-06, + "loss": 0.9408, + "step": 1138 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.5891938385206548, + "learning_rate": 8.65971096466939e-06, + "loss": 0.9547, + "step": 1139 + }, + { + "epoch": 2.9399612653324727, + "grad_norm": 0.6001369316354844, + "learning_rate": 8.641831207043823e-06, + "loss": 0.9686, + "step": 1140 + }, + { + "epoch": 2.9425435765009684, + "grad_norm": 0.5815314396468559, + "learning_rate": 8.62395587179597e-06, + "loss": 0.9582, + "step": 1141 + }, + { + "epoch": 2.945125887669464, + "grad_norm": 0.5918140434178532, + "learning_rate": 8.606085017130289e-06, + "loss": 0.9825, + "step": 1142 + }, + { + "epoch": 2.94770819883796, + "grad_norm": 0.5892397810416701, + "learning_rate": 8.588218701236662e-06, + "loss": 0.944, + "step": 1143 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.5832582968436837, + "learning_rate": 8.570356982290172e-06, + "loss": 0.9375, + "step": 1144 + }, + { + "epoch": 2.9528728211749513, + "grad_norm": 0.6173280182644898, + "learning_rate": 8.552499918450949e-06, + "loss": 0.9782, + "step": 1145 + }, + { + "epoch": 2.9554551323434475, + "grad_norm": 0.584992585761828, + "learning_rate": 8.534647567863962e-06, + "loss": 0.9657, + "step": 1146 + }, + { + "epoch": 2.958037443511943, + "grad_norm": 0.6102553737687162, + "learning_rate": 8.516799988658833e-06, + "loss": 0.9371, + "step": 1147 + }, + { + "epoch": 2.960619754680439, + "grad_norm": 0.5813839505299179, + "learning_rate": 8.498957238949645e-06, + "loss": 0.9702, + "step": 1148 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.5873347150678367, + "learning_rate": 8.481119376834753e-06, + "loss": 0.9843, + "step": 1149 + }, + { + "epoch": 2.965784377017431, + "grad_norm": 0.6111402642438966, + "learning_rate": 8.46328646039661e-06, + "loss": 0.9697, + "step": 1150 + }, + { + "epoch": 2.9683666881859265, + "grad_norm": 0.5922931684597259, + "learning_rate": 8.445458547701555e-06, + "loss": 0.9627, + "step": 1151 + }, + { + "epoch": 2.9709489993544222, + "grad_norm": 0.5851245117575304, + "learning_rate": 8.427635696799636e-06, + "loss": 0.9215, + "step": 1152 + }, + { + "epoch": 2.973531310522918, + "grad_norm": 0.5804327721924878, + "learning_rate": 8.409817965724413e-06, + "loss": 0.9716, + "step": 1153 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6003712250873723, + "learning_rate": 8.392005412492788e-06, + "loss": 0.9648, + "step": 1154 + }, + { + "epoch": 2.9786959328599094, + "grad_norm": 0.6082518706572542, + "learning_rate": 8.374198095104795e-06, + "loss": 0.95, + "step": 1155 + }, + { + "epoch": 2.9812782440284056, + "grad_norm": 0.5866011566920423, + "learning_rate": 8.356396071543422e-06, + "loss": 0.9444, + "step": 1156 + }, + { + "epoch": 2.9838605551969013, + "grad_norm": 0.6114880019803942, + "learning_rate": 8.338599399774417e-06, + "loss": 0.9693, + "step": 1157 + }, + { + "epoch": 2.986442866365397, + "grad_norm": 0.5927950336965607, + "learning_rate": 8.320808137746104e-06, + "loss": 0.9667, + "step": 1158 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.6169016547178486, + "learning_rate": 8.303022343389188e-06, + "loss": 0.9406, + "step": 1159 + }, + { + "epoch": 2.991607488702389, + "grad_norm": 0.582024586435705, + "learning_rate": 8.285242074616582e-06, + "loss": 0.9729, + "step": 1160 + }, + { + "epoch": 2.9941897998708846, + "grad_norm": 0.5944691597316901, + "learning_rate": 8.267467389323197e-06, + "loss": 0.9649, + "step": 1161 + }, + { + "epoch": 2.9967721110393803, + "grad_norm": 0.5916217281333404, + "learning_rate": 8.249698345385761e-06, + "loss": 0.9567, + "step": 1162 + }, + { + "epoch": 2.999354422207876, + "grad_norm": 0.5736806203482997, + "learning_rate": 8.231935000662641e-06, + "loss": 0.9526, + "step": 1163 + }, + { + "epoch": 3.0, + "grad_norm": 0.5736806203482997, + "learning_rate": 8.21417741299364e-06, + "loss": 0.8611, + "step": 1164 + }, + { + "epoch": 3.0025823111684957, + "grad_norm": 1.3479570480783307, + "learning_rate": 8.196425640199823e-06, + "loss": 0.8352, + "step": 1165 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 1.1968748953729984, + "learning_rate": 8.178679740083317e-06, + "loss": 0.8032, + "step": 1166 + }, + { + "epoch": 3.0077469335054876, + "grad_norm": 0.9744412992683603, + "learning_rate": 8.160939770427122e-06, + "loss": 0.811, + "step": 1167 + }, + { + "epoch": 3.0103292446739833, + "grad_norm": 0.8092124569788149, + "learning_rate": 8.143205788994933e-06, + "loss": 0.8442, + "step": 1168 + }, + { + "epoch": 3.012911555842479, + "grad_norm": 1.1536699236828805, + "learning_rate": 8.125477853530944e-06, + "loss": 0.8623, + "step": 1169 + }, + { + "epoch": 3.0154938670109748, + "grad_norm": 1.4343186656583924, + "learning_rate": 8.107756021759673e-06, + "loss": 0.7984, + "step": 1170 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 1.1928449593644268, + "learning_rate": 8.090040351385741e-06, + "loss": 0.8323, + "step": 1171 + }, + { + "epoch": 3.020658489347966, + "grad_norm": 0.9892158368981517, + "learning_rate": 8.072330900093728e-06, + "loss": 0.8219, + "step": 1172 + }, + { + "epoch": 3.0232408005164624, + "grad_norm": 0.9513434336373858, + "learning_rate": 8.054627725547958e-06, + "loss": 0.7942, + "step": 1173 + }, + { + "epoch": 3.025823111684958, + "grad_norm": 0.9160615847863206, + "learning_rate": 8.036930885392308e-06, + "loss": 0.824, + "step": 1174 + }, + { + "epoch": 3.028405422853454, + "grad_norm": 0.9712632670523301, + "learning_rate": 8.019240437250046e-06, + "loss": 0.8105, + "step": 1175 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.888585643517764, + "learning_rate": 8.001556438723608e-06, + "loss": 0.8133, + "step": 1176 + }, + { + "epoch": 3.0335700451904453, + "grad_norm": 0.8336632805823568, + "learning_rate": 7.983878947394444e-06, + "loss": 0.8087, + "step": 1177 + }, + { + "epoch": 3.0361523563589414, + "grad_norm": 0.9159983137263322, + "learning_rate": 7.966208020822808e-06, + "loss": 0.8458, + "step": 1178 + }, + { + "epoch": 3.038734667527437, + "grad_norm": 0.9601758597787429, + "learning_rate": 7.948543716547584e-06, + "loss": 0.8261, + "step": 1179 + }, + { + "epoch": 3.041316978695933, + "grad_norm": 0.876187258797956, + "learning_rate": 7.930886092086084e-06, + "loss": 0.8018, + "step": 1180 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8319336547056765, + "learning_rate": 7.913235204933873e-06, + "loss": 0.8301, + "step": 1181 + }, + { + "epoch": 3.0464816010329243, + "grad_norm": 0.870478623487457, + "learning_rate": 7.895591112564588e-06, + "loss": 0.793, + "step": 1182 + }, + { + "epoch": 3.0490639122014205, + "grad_norm": 0.8628894314897713, + "learning_rate": 7.877953872429734e-06, + "loss": 0.8174, + "step": 1183 + }, + { + "epoch": 3.051646223369916, + "grad_norm": 0.7872403489116827, + "learning_rate": 7.8603235419585e-06, + "loss": 0.8163, + "step": 1184 + }, + { + "epoch": 3.054228534538412, + "grad_norm": 0.7851405920047361, + "learning_rate": 7.84270017855758e-06, + "loss": 0.8178, + "step": 1185 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.8157181746918352, + "learning_rate": 7.825083839610981e-06, + "loss": 0.7963, + "step": 1186 + }, + { + "epoch": 3.0593931568754034, + "grad_norm": 0.8180215928459832, + "learning_rate": 7.807474582479841e-06, + "loss": 0.8148, + "step": 1187 + }, + { + "epoch": 3.061975468043899, + "grad_norm": 0.8088389505024169, + "learning_rate": 7.789872464502241e-06, + "loss": 0.827, + "step": 1188 + }, + { + "epoch": 3.0645577792123952, + "grad_norm": 0.7907227679234932, + "learning_rate": 7.772277542993006e-06, + "loss": 0.8407, + "step": 1189 + }, + { + "epoch": 3.067140090380891, + "grad_norm": 0.8168926580368819, + "learning_rate": 7.754689875243536e-06, + "loss": 0.8252, + "step": 1190 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.8128108171059767, + "learning_rate": 7.737109518521604e-06, + "loss": 0.811, + "step": 1191 + }, + { + "epoch": 3.0723047127178824, + "grad_norm": 0.7723477729268966, + "learning_rate": 7.71953653007119e-06, + "loss": 0.8018, + "step": 1192 + }, + { + "epoch": 3.074887023886378, + "grad_norm": 0.7771679955089591, + "learning_rate": 7.701970967112278e-06, + "loss": 0.8206, + "step": 1193 + }, + { + "epoch": 3.0774693350548743, + "grad_norm": 0.7947531953649853, + "learning_rate": 7.684412886840662e-06, + "loss": 0.8374, + "step": 1194 + }, + { + "epoch": 3.08005164622337, + "grad_norm": 0.7797780708525804, + "learning_rate": 7.666862346427784e-06, + "loss": 0.809, + "step": 1195 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.7951018870568382, + "learning_rate": 7.649319403020528e-06, + "loss": 0.8148, + "step": 1196 + }, + { + "epoch": 3.0852162685603615, + "grad_norm": 0.7768045025376982, + "learning_rate": 7.631784113741048e-06, + "loss": 0.7905, + "step": 1197 + }, + { + "epoch": 3.087798579728857, + "grad_norm": 0.7380091530118719, + "learning_rate": 7.614256535686574e-06, + "loss": 0.8277, + "step": 1198 + }, + { + "epoch": 3.090380890897353, + "grad_norm": 0.8090369362037133, + "learning_rate": 7.596736725929218e-06, + "loss": 0.7897, + "step": 1199 + }, + { + "epoch": 3.092963202065849, + "grad_norm": 0.7816172334191853, + "learning_rate": 7.579224741515808e-06, + "loss": 0.801, + "step": 1200 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7716968909350221, + "learning_rate": 7.561720639467684e-06, + "loss": 0.8253, + "step": 1201 + }, + { + "epoch": 3.0981278244028405, + "grad_norm": 0.7767223781947307, + "learning_rate": 7.544224476780534e-06, + "loss": 0.8171, + "step": 1202 + }, + { + "epoch": 3.1007101355713362, + "grad_norm": 0.7780640346641391, + "learning_rate": 7.52673631042417e-06, + "loss": 0.8142, + "step": 1203 + }, + { + "epoch": 3.103292446739832, + "grad_norm": 0.7678875942864142, + "learning_rate": 7.509256197342389e-06, + "loss": 0.8437, + "step": 1204 + }, + { + "epoch": 3.105874757908328, + "grad_norm": 0.7741428737890553, + "learning_rate": 7.491784194452756e-06, + "loss": 0.7948, + "step": 1205 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7445434454135789, + "learning_rate": 7.4743203586464286e-06, + "loss": 0.8186, + "step": 1206 + }, + { + "epoch": 3.1110393802453196, + "grad_norm": 0.7308461534374082, + "learning_rate": 7.45686474678798e-06, + "loss": 0.8117, + "step": 1207 + }, + { + "epoch": 3.1136216914138153, + "grad_norm": 0.7624570651090968, + "learning_rate": 7.4394174157151826e-06, + "loss": 0.8184, + "step": 1208 + }, + { + "epoch": 3.116204002582311, + "grad_norm": 0.7787385810762857, + "learning_rate": 7.421978422238871e-06, + "loss": 0.8051, + "step": 1209 + }, + { + "epoch": 3.118786313750807, + "grad_norm": 0.7487622485166701, + "learning_rate": 7.404547823142718e-06, + "loss": 0.8065, + "step": 1210 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.7700484213439688, + "learning_rate": 7.387125675183069e-06, + "loss": 0.7893, + "step": 1211 + }, + { + "epoch": 3.1239509360877986, + "grad_norm": 0.7498057989796449, + "learning_rate": 7.369712035088743e-06, + "loss": 0.8271, + "step": 1212 + }, + { + "epoch": 3.1265332472562943, + "grad_norm": 0.782447832053478, + "learning_rate": 7.352306959560862e-06, + "loss": 0.8177, + "step": 1213 + }, + { + "epoch": 3.12911555842479, + "grad_norm": 0.760945467135789, + "learning_rate": 7.3349105052726635e-06, + "loss": 0.8016, + "step": 1214 + }, + { + "epoch": 3.131697869593286, + "grad_norm": 0.7451691837423764, + "learning_rate": 7.317522728869308e-06, + "loss": 0.8292, + "step": 1215 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.7594539784955314, + "learning_rate": 7.3001436869677056e-06, + "loss": 0.8363, + "step": 1216 + }, + { + "epoch": 3.1368624919302777, + "grad_norm": 0.7551959784047992, + "learning_rate": 7.2827734361563154e-06, + "loss": 0.8193, + "step": 1217 + }, + { + "epoch": 3.1394448030987734, + "grad_norm": 0.7523949203336101, + "learning_rate": 7.265412032994977e-06, + "loss": 0.8365, + "step": 1218 + }, + { + "epoch": 3.142027114267269, + "grad_norm": 0.7736463491191788, + "learning_rate": 7.248059534014728e-06, + "loss": 0.7735, + "step": 1219 + }, + { + "epoch": 3.144609425435765, + "grad_norm": 0.7260637259865401, + "learning_rate": 7.230715995717605e-06, + "loss": 0.816, + "step": 1220 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.7971984476822972, + "learning_rate": 7.213381474576465e-06, + "loss": 0.844, + "step": 1221 + }, + { + "epoch": 3.1497740477727567, + "grad_norm": 0.7598850752183374, + "learning_rate": 7.19605602703481e-06, + "loss": 0.7923, + "step": 1222 + }, + { + "epoch": 3.1523563589412524, + "grad_norm": 0.7608019681518811, + "learning_rate": 7.178739709506592e-06, + "loss": 0.818, + "step": 1223 + }, + { + "epoch": 3.154938670109748, + "grad_norm": 0.7773577191907428, + "learning_rate": 7.161432578376042e-06, + "loss": 0.8353, + "step": 1224 + }, + { + "epoch": 3.157520981278244, + "grad_norm": 0.7551689941223817, + "learning_rate": 7.144134689997475e-06, + "loss": 0.8366, + "step": 1225 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.7696289215692551, + "learning_rate": 7.126846100695105e-06, + "loss": 0.831, + "step": 1226 + }, + { + "epoch": 3.1626856036152358, + "grad_norm": 0.7600151124859899, + "learning_rate": 7.109566866762874e-06, + "loss": 0.8073, + "step": 1227 + }, + { + "epoch": 3.1652679147837315, + "grad_norm": 0.7520515666346982, + "learning_rate": 7.092297044464256e-06, + "loss": 0.8344, + "step": 1228 + }, + { + "epoch": 3.167850225952227, + "grad_norm": 0.7818985132603024, + "learning_rate": 7.075036690032088e-06, + "loss": 0.8273, + "step": 1229 + }, + { + "epoch": 3.170432537120723, + "grad_norm": 0.7438737448683109, + "learning_rate": 7.057785859668373e-06, + "loss": 0.8292, + "step": 1230 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.7604238311874598, + "learning_rate": 7.040544609544098e-06, + "loss": 0.806, + "step": 1231 + }, + { + "epoch": 3.175597159457715, + "grad_norm": 0.7739278944618028, + "learning_rate": 7.023312995799062e-06, + "loss": 0.8321, + "step": 1232 + }, + { + "epoch": 3.1781794706262105, + "grad_norm": 0.7829719049826178, + "learning_rate": 7.006091074541684e-06, + "loss": 0.8207, + "step": 1233 + }, + { + "epoch": 3.1807617817947063, + "grad_norm": 0.8051283397017396, + "learning_rate": 6.988878901848829e-06, + "loss": 0.7937, + "step": 1234 + }, + { + "epoch": 3.183344092963202, + "grad_norm": 0.7723245876655893, + "learning_rate": 6.9716765337656034e-06, + "loss": 0.7945, + "step": 1235 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 0.7838025063241568, + "learning_rate": 6.954484026305208e-06, + "loss": 0.7946, + "step": 1236 + }, + { + "epoch": 3.188508715300194, + "grad_norm": 0.7307107229399178, + "learning_rate": 6.937301435448725e-06, + "loss": 0.7995, + "step": 1237 + }, + { + "epoch": 3.1910910264686896, + "grad_norm": 0.8052398372954221, + "learning_rate": 6.920128817144946e-06, + "loss": 0.8201, + "step": 1238 + }, + { + "epoch": 3.1936733376371853, + "grad_norm": 0.760805158015623, + "learning_rate": 6.9029662273102015e-06, + "loss": 0.7999, + "step": 1239 + }, + { + "epoch": 3.196255648805681, + "grad_norm": 0.7483396811127839, + "learning_rate": 6.885813721828149e-06, + "loss": 0.7988, + "step": 1240 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.7404586071162459, + "learning_rate": 6.868671356549628e-06, + "loss": 0.8092, + "step": 1241 + }, + { + "epoch": 3.2014202711426725, + "grad_norm": 0.7813265679668377, + "learning_rate": 6.851539187292453e-06, + "loss": 0.8358, + "step": 1242 + }, + { + "epoch": 3.2040025823111686, + "grad_norm": 0.7759146212310302, + "learning_rate": 6.83441726984124e-06, + "loss": 0.8228, + "step": 1243 + }, + { + "epoch": 3.2065848934796644, + "grad_norm": 0.7818503076848575, + "learning_rate": 6.81730565994722e-06, + "loss": 0.8149, + "step": 1244 + }, + { + "epoch": 3.20916720464816, + "grad_norm": 0.7402110582844729, + "learning_rate": 6.800204413328062e-06, + "loss": 0.8388, + "step": 1245 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.738425464653366, + "learning_rate": 6.7831135856677e-06, + "loss": 0.8089, + "step": 1246 + }, + { + "epoch": 3.2143318269851515, + "grad_norm": 0.777277503906415, + "learning_rate": 6.766033232616131e-06, + "loss": 0.8233, + "step": 1247 + }, + { + "epoch": 3.2169141381536477, + "grad_norm": 0.7530080273180854, + "learning_rate": 6.748963409789253e-06, + "loss": 0.82, + "step": 1248 + }, + { + "epoch": 3.2194964493221434, + "grad_norm": 0.7527460534251285, + "learning_rate": 6.731904172768668e-06, + "loss": 0.7935, + "step": 1249 + }, + { + "epoch": 3.222078760490639, + "grad_norm": 0.7417948760299368, + "learning_rate": 6.714855577101515e-06, + "loss": 0.81, + "step": 1250 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.76497441596248, + "learning_rate": 6.697817678300287e-06, + "loss": 0.8134, + "step": 1251 + }, + { + "epoch": 3.2272433828276306, + "grad_norm": 0.7743095167259862, + "learning_rate": 6.680790531842641e-06, + "loss": 0.8158, + "step": 1252 + }, + { + "epoch": 3.2298256939961267, + "grad_norm": 0.7564105405805621, + "learning_rate": 6.6637741931712204e-06, + "loss": 0.8139, + "step": 1253 + }, + { + "epoch": 3.2324080051646225, + "grad_norm": 0.7569395840964698, + "learning_rate": 6.646768717693484e-06, + "loss": 0.8178, + "step": 1254 + }, + { + "epoch": 3.234990316333118, + "grad_norm": 0.7669325942851178, + "learning_rate": 6.629774160781511e-06, + "loss": 0.824, + "step": 1255 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 0.7858446496283839, + "learning_rate": 6.612790577771835e-06, + "loss": 0.8176, + "step": 1256 + }, + { + "epoch": 3.2401549386701096, + "grad_norm": 0.756498502430699, + "learning_rate": 6.59581802396526e-06, + "loss": 0.8322, + "step": 1257 + }, + { + "epoch": 3.242737249838606, + "grad_norm": 0.7523880712468195, + "learning_rate": 6.578856554626665e-06, + "loss": 0.8179, + "step": 1258 + }, + { + "epoch": 3.2453195610071015, + "grad_norm": 0.7418381111698618, + "learning_rate": 6.561906224984844e-06, + "loss": 0.8214, + "step": 1259 + }, + { + "epoch": 3.2479018721755972, + "grad_norm": 0.748062534762086, + "learning_rate": 6.544967090232321e-06, + "loss": 0.8325, + "step": 1260 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7830260472124719, + "learning_rate": 6.5280392055251696e-06, + "loss": 0.8245, + "step": 1261 + }, + { + "epoch": 3.2530664945125887, + "grad_norm": 0.7580297797282579, + "learning_rate": 6.511122625982815e-06, + "loss": 0.8269, + "step": 1262 + }, + { + "epoch": 3.2556488056810844, + "grad_norm": 0.7545843411413197, + "learning_rate": 6.494217406687893e-06, + "loss": 0.8242, + "step": 1263 + }, + { + "epoch": 3.2582311168495806, + "grad_norm": 0.7439461629106354, + "learning_rate": 6.477323602686039e-06, + "loss": 0.8087, + "step": 1264 + }, + { + "epoch": 3.2608134280180763, + "grad_norm": 0.7693068257824085, + "learning_rate": 6.460441268985715e-06, + "loss": 0.8333, + "step": 1265 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7572186415123207, + "learning_rate": 6.443570460558048e-06, + "loss": 0.8085, + "step": 1266 + }, + { + "epoch": 3.2659780503550677, + "grad_norm": 0.7558961737811011, + "learning_rate": 6.426711232336613e-06, + "loss": 0.8068, + "step": 1267 + }, + { + "epoch": 3.2685603615235634, + "grad_norm": 0.7855400126793302, + "learning_rate": 6.409863639217306e-06, + "loss": 0.8147, + "step": 1268 + }, + { + "epoch": 3.2711426726920596, + "grad_norm": 0.7790255090638041, + "learning_rate": 6.393027736058117e-06, + "loss": 0.8256, + "step": 1269 + }, + { + "epoch": 3.2737249838605553, + "grad_norm": 0.7607044109115157, + "learning_rate": 6.376203577678981e-06, + "loss": 0.7971, + "step": 1270 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7553997691720208, + "learning_rate": 6.3593912188615966e-06, + "loss": 0.842, + "step": 1271 + }, + { + "epoch": 3.2788896061975468, + "grad_norm": 0.7889787626268039, + "learning_rate": 6.3425907143492216e-06, + "loss": 0.8183, + "step": 1272 + }, + { + "epoch": 3.2814719173660425, + "grad_norm": 0.7485790330649242, + "learning_rate": 6.325802118846533e-06, + "loss": 0.8185, + "step": 1273 + }, + { + "epoch": 3.284054228534538, + "grad_norm": 0.7536890088538672, + "learning_rate": 6.309025487019425e-06, + "loss": 0.8266, + "step": 1274 + }, + { + "epoch": 3.2866365397030344, + "grad_norm": 0.7550810533925633, + "learning_rate": 6.2922608734948355e-06, + "loss": 0.8079, + "step": 1275 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.7471626158383303, + "learning_rate": 6.275508332860567e-06, + "loss": 0.8205, + "step": 1276 + }, + { + "epoch": 3.291801162040026, + "grad_norm": 0.7300314640072086, + "learning_rate": 6.258767919665113e-06, + "loss": 0.8021, + "step": 1277 + }, + { + "epoch": 3.2943834732085215, + "grad_norm": 0.7640658148164554, + "learning_rate": 6.242039688417483e-06, + "loss": 0.8132, + "step": 1278 + }, + { + "epoch": 3.2969657843770173, + "grad_norm": 0.7547528610145464, + "learning_rate": 6.225323693587014e-06, + "loss": 0.8287, + "step": 1279 + }, + { + "epoch": 3.2995480955455134, + "grad_norm": 0.7972400105837699, + "learning_rate": 6.208619989603205e-06, + "loss": 0.8315, + "step": 1280 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.758612295379575, + "learning_rate": 6.191928630855527e-06, + "loss": 0.802, + "step": 1281 + }, + { + "epoch": 3.304712717882505, + "grad_norm": 0.7601220070386198, + "learning_rate": 6.1752496716932576e-06, + "loss": 0.834, + "step": 1282 + }, + { + "epoch": 3.3072950290510006, + "grad_norm": 0.7684262734009513, + "learning_rate": 6.158583166425304e-06, + "loss": 0.8481, + "step": 1283 + }, + { + "epoch": 3.3098773402194963, + "grad_norm": 0.7880034761965038, + "learning_rate": 6.141929169320018e-06, + "loss": 0.815, + "step": 1284 + }, + { + "epoch": 3.312459651387992, + "grad_norm": 0.7560178543170282, + "learning_rate": 6.125287734605018e-06, + "loss": 0.8129, + "step": 1285 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7706041266881096, + "learning_rate": 6.108658916467025e-06, + "loss": 0.8016, + "step": 1286 + }, + { + "epoch": 3.317624273724984, + "grad_norm": 0.7569765139274263, + "learning_rate": 6.092042769051674e-06, + "loss": 0.8273, + "step": 1287 + }, + { + "epoch": 3.3202065848934796, + "grad_norm": 0.7623292555288878, + "learning_rate": 6.075439346463349e-06, + "loss": 0.7931, + "step": 1288 + }, + { + "epoch": 3.3227888960619754, + "grad_norm": 0.7427549761512925, + "learning_rate": 6.0588487027649954e-06, + "loss": 0.7812, + "step": 1289 + }, + { + "epoch": 3.325371207230471, + "grad_norm": 0.7772449050888204, + "learning_rate": 6.042270891977946e-06, + "loss": 0.8305, + "step": 1290 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7871222544756025, + "learning_rate": 6.025705968081753e-06, + "loss": 0.8387, + "step": 1291 + }, + { + "epoch": 3.330535829567463, + "grad_norm": 0.7583353530346796, + "learning_rate": 6.009153985014003e-06, + "loss": 0.8466, + "step": 1292 + }, + { + "epoch": 3.3331181407359587, + "grad_norm": 0.772127846582864, + "learning_rate": 5.992614996670156e-06, + "loss": 0.8155, + "step": 1293 + }, + { + "epoch": 3.3357004519044544, + "grad_norm": 0.7487201668100457, + "learning_rate": 5.976089056903342e-06, + "loss": 0.7953, + "step": 1294 + }, + { + "epoch": 3.33828276307295, + "grad_norm": 0.7614204718665639, + "learning_rate": 5.959576219524217e-06, + "loss": 0.8131, + "step": 1295 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.7763835258668194, + "learning_rate": 5.94307653830077e-06, + "loss": 0.8198, + "step": 1296 + }, + { + "epoch": 3.343447385409942, + "grad_norm": 0.7753456222642561, + "learning_rate": 5.926590066958149e-06, + "loss": 0.8356, + "step": 1297 + }, + { + "epoch": 3.3460296965784377, + "grad_norm": 0.7516557804123375, + "learning_rate": 5.910116859178494e-06, + "loss": 0.7854, + "step": 1298 + }, + { + "epoch": 3.3486120077469335, + "grad_norm": 0.7503527754212284, + "learning_rate": 5.89365696860075e-06, + "loss": 0.8383, + "step": 1299 + }, + { + "epoch": 3.351194318915429, + "grad_norm": 0.8230846268240456, + "learning_rate": 5.877210448820508e-06, + "loss": 0.8282, + "step": 1300 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.7864548933883284, + "learning_rate": 5.860777353389816e-06, + "loss": 0.8201, + "step": 1301 + }, + { + "epoch": 3.356358941252421, + "grad_norm": 0.796951326601112, + "learning_rate": 5.844357735817012e-06, + "loss": 0.8124, + "step": 1302 + }, + { + "epoch": 3.358941252420917, + "grad_norm": 0.7449791284424515, + "learning_rate": 5.82795164956655e-06, + "loss": 0.8449, + "step": 1303 + }, + { + "epoch": 3.3615235635894125, + "grad_norm": 0.7629551074846378, + "learning_rate": 5.811559148058817e-06, + "loss": 0.787, + "step": 1304 + }, + { + "epoch": 3.3641058747579082, + "grad_norm": 0.7440812424379075, + "learning_rate": 5.795180284669981e-06, + "loss": 0.8282, + "step": 1305 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7744183120279426, + "learning_rate": 5.7788151127317825e-06, + "loss": 0.8258, + "step": 1306 + }, + { + "epoch": 3.3692704970949, + "grad_norm": 0.7418245554432372, + "learning_rate": 5.762463685531403e-06, + "loss": 0.8284, + "step": 1307 + }, + { + "epoch": 3.371852808263396, + "grad_norm": 0.7830933256063822, + "learning_rate": 5.746126056311248e-06, + "loss": 0.8452, + "step": 1308 + }, + { + "epoch": 3.3744351194318916, + "grad_norm": 0.7909760674112923, + "learning_rate": 5.729802278268813e-06, + "loss": 0.8168, + "step": 1309 + }, + { + "epoch": 3.3770174306003873, + "grad_norm": 0.8007258278890194, + "learning_rate": 5.713492404556477e-06, + "loss": 0.8027, + "step": 1310 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.7758822514790055, + "learning_rate": 5.697196488281357e-06, + "loss": 0.8266, + "step": 1311 + }, + { + "epoch": 3.382182052937379, + "grad_norm": 0.765799650737426, + "learning_rate": 5.680914582505123e-06, + "loss": 0.8057, + "step": 1312 + }, + { + "epoch": 3.384764364105875, + "grad_norm": 0.7655979019681851, + "learning_rate": 5.6646467402438045e-06, + "loss": 0.8157, + "step": 1313 + }, + { + "epoch": 3.3873466752743706, + "grad_norm": 0.7766423523842311, + "learning_rate": 5.6483930144676616e-06, + "loss": 0.8162, + "step": 1314 + }, + { + "epoch": 3.3899289864428663, + "grad_norm": 0.7589364799160417, + "learning_rate": 5.632153458100985e-06, + "loss": 0.8321, + "step": 1315 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 0.7788060942298414, + "learning_rate": 5.615928124021921e-06, + "loss": 0.837, + "step": 1316 + }, + { + "epoch": 3.3950936087798578, + "grad_norm": 0.7694554084656864, + "learning_rate": 5.599717065062302e-06, + "loss": 0.8438, + "step": 1317 + }, + { + "epoch": 3.397675919948354, + "grad_norm": 0.7631810781785031, + "learning_rate": 5.583520334007494e-06, + "loss": 0.8205, + "step": 1318 + }, + { + "epoch": 3.4002582311168497, + "grad_norm": 0.7854745994817811, + "learning_rate": 5.567337983596201e-06, + "loss": 0.8208, + "step": 1319 + }, + { + "epoch": 3.4028405422853454, + "grad_norm": 0.7690571440773396, + "learning_rate": 5.551170066520299e-06, + "loss": 0.814, + "step": 1320 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 0.7681900860146816, + "learning_rate": 5.535016635424675e-06, + "loss": 0.822, + "step": 1321 + }, + { + "epoch": 3.408005164622337, + "grad_norm": 0.7811781678205161, + "learning_rate": 5.51887774290704e-06, + "loss": 0.818, + "step": 1322 + }, + { + "epoch": 3.410587475790833, + "grad_norm": 0.7664479268038544, + "learning_rate": 5.502753441517763e-06, + "loss": 0.8331, + "step": 1323 + }, + { + "epoch": 3.4131697869593287, + "grad_norm": 0.7775122726368401, + "learning_rate": 5.486643783759713e-06, + "loss": 0.8163, + "step": 1324 + }, + { + "epoch": 3.4157520981278244, + "grad_norm": 0.7851782250823803, + "learning_rate": 5.470548822088075e-06, + "loss": 0.833, + "step": 1325 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.7722198216385613, + "learning_rate": 5.454468608910177e-06, + "loss": 0.8216, + "step": 1326 + }, + { + "epoch": 3.420916720464816, + "grad_norm": 0.7650125939985358, + "learning_rate": 5.43840319658532e-06, + "loss": 0.8195, + "step": 1327 + }, + { + "epoch": 3.4234990316333116, + "grad_norm": 0.7889877069137401, + "learning_rate": 5.422352637424623e-06, + "loss": 0.8356, + "step": 1328 + }, + { + "epoch": 3.4260813428018078, + "grad_norm": 0.759046090525073, + "learning_rate": 5.4063169836908355e-06, + "loss": 0.8281, + "step": 1329 + }, + { + "epoch": 3.4286636539703035, + "grad_norm": 0.7614819002487212, + "learning_rate": 5.390296287598173e-06, + "loss": 0.8176, + "step": 1330 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.7733681640312509, + "learning_rate": 5.374290601312139e-06, + "loss": 0.8347, + "step": 1331 + }, + { + "epoch": 3.433828276307295, + "grad_norm": 0.7679752600393633, + "learning_rate": 5.3582999769493816e-06, + "loss": 0.8129, + "step": 1332 + }, + { + "epoch": 3.4364105874757906, + "grad_norm": 0.7666398230614995, + "learning_rate": 5.342324466577484e-06, + "loss": 0.8041, + "step": 1333 + }, + { + "epoch": 3.438992898644287, + "grad_norm": 0.757735562684295, + "learning_rate": 5.326364122214833e-06, + "loss": 0.832, + "step": 1334 + }, + { + "epoch": 3.4415752098127825, + "grad_norm": 0.7658776895218172, + "learning_rate": 5.310418995830429e-06, + "loss": 0.8127, + "step": 1335 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.7501769033527278, + "learning_rate": 5.2944891393437145e-06, + "loss": 0.8069, + "step": 1336 + }, + { + "epoch": 3.446739832149774, + "grad_norm": 0.7720443515660191, + "learning_rate": 5.278574604624411e-06, + "loss": 0.8031, + "step": 1337 + }, + { + "epoch": 3.4493221433182697, + "grad_norm": 0.7746952692548283, + "learning_rate": 5.262675443492359e-06, + "loss": 0.8212, + "step": 1338 + }, + { + "epoch": 3.4519044544867654, + "grad_norm": 0.7721583267540482, + "learning_rate": 5.246791707717343e-06, + "loss": 0.806, + "step": 1339 + }, + { + "epoch": 3.4544867656552616, + "grad_norm": 0.7656034684267539, + "learning_rate": 5.230923449018896e-06, + "loss": 0.815, + "step": 1340 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.7847317646384887, + "learning_rate": 5.215070719066182e-06, + "loss": 0.8406, + "step": 1341 + }, + { + "epoch": 3.459651387992253, + "grad_norm": 0.76108699723527, + "learning_rate": 5.199233569477796e-06, + "loss": 0.8535, + "step": 1342 + }, + { + "epoch": 3.4622336991607487, + "grad_norm": 0.7642688851829174, + "learning_rate": 5.183412051821591e-06, + "loss": 0.8082, + "step": 1343 + }, + { + "epoch": 3.464816010329245, + "grad_norm": 0.7767506589914347, + "learning_rate": 5.167606217614531e-06, + "loss": 0.8175, + "step": 1344 + }, + { + "epoch": 3.4673983214977406, + "grad_norm": 0.7584520457174025, + "learning_rate": 5.151816118322503e-06, + "loss": 0.8027, + "step": 1345 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.768873105060371, + "learning_rate": 5.136041805360172e-06, + "loss": 0.8109, + "step": 1346 + }, + { + "epoch": 3.472562943834732, + "grad_norm": 0.7839594886363217, + "learning_rate": 5.120283330090787e-06, + "loss": 0.8148, + "step": 1347 + }, + { + "epoch": 3.475145255003228, + "grad_norm": 0.773747886290827, + "learning_rate": 5.104540743826038e-06, + "loss": 0.8112, + "step": 1348 + }, + { + "epoch": 3.4777275661717235, + "grad_norm": 0.7739729084947021, + "learning_rate": 5.088814097825871e-06, + "loss": 0.809, + "step": 1349 + }, + { + "epoch": 3.4803098773402197, + "grad_norm": 0.7918183544657171, + "learning_rate": 5.073103443298326e-06, + "loss": 0.8455, + "step": 1350 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.7878706269294227, + "learning_rate": 5.057408831399385e-06, + "loss": 0.8308, + "step": 1351 + }, + { + "epoch": 3.485474499677211, + "grad_norm": 0.7823661607708897, + "learning_rate": 5.041730313232786e-06, + "loss": 0.8393, + "step": 1352 + }, + { + "epoch": 3.488056810845707, + "grad_norm": 0.7543319732416526, + "learning_rate": 5.026067939849864e-06, + "loss": 0.8318, + "step": 1353 + }, + { + "epoch": 3.4906391220142026, + "grad_norm": 0.769779193831718, + "learning_rate": 5.0104217622493736e-06, + "loss": 0.833, + "step": 1354 + }, + { + "epoch": 3.4932214331826987, + "grad_norm": 0.7716520294743638, + "learning_rate": 4.994791831377354e-06, + "loss": 0.8222, + "step": 1355 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.7736579686573494, + "learning_rate": 4.9791781981269326e-06, + "loss": 0.7974, + "step": 1356 + }, + { + "epoch": 3.49838605551969, + "grad_norm": 0.7302768684285259, + "learning_rate": 4.9635809133381685e-06, + "loss": 0.8207, + "step": 1357 + }, + { + "epoch": 3.500968366688186, + "grad_norm": 0.7621702814787035, + "learning_rate": 4.948000027797885e-06, + "loss": 0.8077, + "step": 1358 + }, + { + "epoch": 3.5035506778566816, + "grad_norm": 0.7638463102404097, + "learning_rate": 4.93243559223952e-06, + "loss": 0.7849, + "step": 1359 + }, + { + "epoch": 3.5061329890251773, + "grad_norm": 0.7525066168052732, + "learning_rate": 4.916887657342931e-06, + "loss": 0.8103, + "step": 1360 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 0.7465812252567701, + "learning_rate": 4.901356273734261e-06, + "loss": 0.8251, + "step": 1361 + }, + { + "epoch": 3.5112976113621692, + "grad_norm": 0.789989621898556, + "learning_rate": 4.885841491985758e-06, + "loss": 0.8156, + "step": 1362 + }, + { + "epoch": 3.513879922530665, + "grad_norm": 0.7567373410192682, + "learning_rate": 4.870343362615605e-06, + "loss": 0.8241, + "step": 1363 + }, + { + "epoch": 3.5164622336991607, + "grad_norm": 0.7748589228781302, + "learning_rate": 4.8548619360877635e-06, + "loss": 0.8061, + "step": 1364 + }, + { + "epoch": 3.5190445448676564, + "grad_norm": 0.7738666119944785, + "learning_rate": 4.839397262811814e-06, + "loss": 0.8101, + "step": 1365 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7614113255993917, + "learning_rate": 4.823949393142791e-06, + "loss": 0.8237, + "step": 1366 + }, + { + "epoch": 3.5242091672046483, + "grad_norm": 0.776611038166, + "learning_rate": 4.808518377380999e-06, + "loss": 0.8334, + "step": 1367 + }, + { + "epoch": 3.526791478373144, + "grad_norm": 0.7600268059134173, + "learning_rate": 4.7931042657718685e-06, + "loss": 0.8221, + "step": 1368 + }, + { + "epoch": 3.5293737895416397, + "grad_norm": 0.7659298097233458, + "learning_rate": 4.777707108505801e-06, + "loss": 0.8374, + "step": 1369 + }, + { + "epoch": 3.5319561007101354, + "grad_norm": 0.740099038297969, + "learning_rate": 4.762326955717972e-06, + "loss": 0.8138, + "step": 1370 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.7714772416888985, + "learning_rate": 4.746963857488208e-06, + "loss": 0.8288, + "step": 1371 + }, + { + "epoch": 3.5371207230471273, + "grad_norm": 0.7725845357201855, + "learning_rate": 4.7316178638407885e-06, + "loss": 0.822, + "step": 1372 + }, + { + "epoch": 3.539703034215623, + "grad_norm": 0.7621909389829162, + "learning_rate": 4.716289024744308e-06, + "loss": 0.8231, + "step": 1373 + }, + { + "epoch": 3.5422853453841188, + "grad_norm": 0.7607917831814722, + "learning_rate": 4.700977390111495e-06, + "loss": 0.8446, + "step": 1374 + }, + { + "epoch": 3.5448676565526145, + "grad_norm": 0.7656461760658241, + "learning_rate": 4.685683009799065e-06, + "loss": 0.8214, + "step": 1375 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.7752118246819907, + "learning_rate": 4.670405933607554e-06, + "loss": 0.8249, + "step": 1376 + }, + { + "epoch": 3.5500322788896064, + "grad_norm": 0.7402699684294802, + "learning_rate": 4.6551462112811384e-06, + "loss": 0.8409, + "step": 1377 + }, + { + "epoch": 3.552614590058102, + "grad_norm": 0.7778533292706469, + "learning_rate": 4.639903892507501e-06, + "loss": 0.7924, + "step": 1378 + }, + { + "epoch": 3.555196901226598, + "grad_norm": 0.7705963566454496, + "learning_rate": 4.624679026917658e-06, + "loss": 0.8203, + "step": 1379 + }, + { + "epoch": 3.5577792123950935, + "grad_norm": 0.7365633344886319, + "learning_rate": 4.609471664085787e-06, + "loss": 0.8123, + "step": 1380 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.7586569356393417, + "learning_rate": 4.594281853529076e-06, + "loss": 0.8299, + "step": 1381 + }, + { + "epoch": 3.562943834732085, + "grad_norm": 0.7652449035633111, + "learning_rate": 4.5791096447075645e-06, + "loss": 0.8141, + "step": 1382 + }, + { + "epoch": 3.565526145900581, + "grad_norm": 0.7582437627266295, + "learning_rate": 4.563955087023981e-06, + "loss": 0.805, + "step": 1383 + }, + { + "epoch": 3.568108457069077, + "grad_norm": 0.7826618805912787, + "learning_rate": 4.548818229823568e-06, + "loss": 0.8293, + "step": 1384 + }, + { + "epoch": 3.5706907682375726, + "grad_norm": 0.762954307030826, + "learning_rate": 4.5336991223939486e-06, + "loss": 0.8456, + "step": 1385 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.7623373985847602, + "learning_rate": 4.5185978139649355e-06, + "loss": 0.8192, + "step": 1386 + }, + { + "epoch": 3.5758553905745645, + "grad_norm": 0.7765489599431679, + "learning_rate": 4.503514353708389e-06, + "loss": 0.815, + "step": 1387 + }, + { + "epoch": 3.57843770174306, + "grad_norm": 0.7584186284661693, + "learning_rate": 4.488448790738059e-06, + "loss": 0.8301, + "step": 1388 + }, + { + "epoch": 3.581020012911556, + "grad_norm": 0.7763483912193899, + "learning_rate": 4.473401174109423e-06, + "loss": 0.8518, + "step": 1389 + }, + { + "epoch": 3.5836023240800516, + "grad_norm": 0.796977617965849, + "learning_rate": 4.45837155281951e-06, + "loss": 0.8258, + "step": 1390 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.7770662519335874, + "learning_rate": 4.443359975806757e-06, + "loss": 0.8068, + "step": 1391 + }, + { + "epoch": 3.588766946417043, + "grad_norm": 0.7790952514127689, + "learning_rate": 4.428366491950854e-06, + "loss": 0.8296, + "step": 1392 + }, + { + "epoch": 3.591349257585539, + "grad_norm": 0.7764896929937788, + "learning_rate": 4.413391150072577e-06, + "loss": 0.8007, + "step": 1393 + }, + { + "epoch": 3.593931568754035, + "grad_norm": 0.7607538258474698, + "learning_rate": 4.39843399893362e-06, + "loss": 0.8025, + "step": 1394 + }, + { + "epoch": 3.5965138799225307, + "grad_norm": 0.7492819656752748, + "learning_rate": 4.383495087236448e-06, + "loss": 0.8157, + "step": 1395 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.7596354310000931, + "learning_rate": 4.368574463624146e-06, + "loss": 0.8272, + "step": 1396 + }, + { + "epoch": 3.601678502259522, + "grad_norm": 0.7407159353225491, + "learning_rate": 4.353672176680236e-06, + "loss": 0.8123, + "step": 1397 + }, + { + "epoch": 3.6042608134280183, + "grad_norm": 0.7672926201622885, + "learning_rate": 4.338788274928544e-06, + "loss": 0.8086, + "step": 1398 + }, + { + "epoch": 3.606843124596514, + "grad_norm": 0.7636171894829668, + "learning_rate": 4.323922806833031e-06, + "loss": 0.8067, + "step": 1399 + }, + { + "epoch": 3.6094254357650097, + "grad_norm": 0.7677055145204084, + "learning_rate": 4.3090758207976305e-06, + "loss": 0.7908, + "step": 1400 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.748818246629621, + "learning_rate": 4.294247365166093e-06, + "loss": 0.8312, + "step": 1401 + }, + { + "epoch": 3.614590058102001, + "grad_norm": 0.7660560945307108, + "learning_rate": 4.279437488221843e-06, + "loss": 0.8022, + "step": 1402 + }, + { + "epoch": 3.617172369270497, + "grad_norm": 0.7838437600172277, + "learning_rate": 4.2646462381878076e-06, + "loss": 0.8377, + "step": 1403 + }, + { + "epoch": 3.6197546804389926, + "grad_norm": 0.8010185687327693, + "learning_rate": 4.249873663226245e-06, + "loss": 0.7993, + "step": 1404 + }, + { + "epoch": 3.622336991607489, + "grad_norm": 0.7553739164896088, + "learning_rate": 4.235119811438627e-06, + "loss": 0.8261, + "step": 1405 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.7716962358330001, + "learning_rate": 4.220384730865456e-06, + "loss": 0.8405, + "step": 1406 + }, + { + "epoch": 3.6275016139444802, + "grad_norm": 0.7536919382865905, + "learning_rate": 4.205668469486098e-06, + "loss": 0.8108, + "step": 1407 + }, + { + "epoch": 3.630083925112976, + "grad_norm": 0.7420774971418518, + "learning_rate": 4.190971075218662e-06, + "loss": 0.8065, + "step": 1408 + }, + { + "epoch": 3.632666236281472, + "grad_norm": 0.7632144438839245, + "learning_rate": 4.176292595919803e-06, + "loss": 0.7927, + "step": 1409 + }, + { + "epoch": 3.635248547449968, + "grad_norm": 0.7571064594393233, + "learning_rate": 4.1616330793846075e-06, + "loss": 0.8362, + "step": 1410 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7622517676642483, + "learning_rate": 4.146992573346394e-06, + "loss": 0.8257, + "step": 1411 + }, + { + "epoch": 3.6404131697869593, + "grad_norm": 0.7835717386734106, + "learning_rate": 4.1323711254766015e-06, + "loss": 0.8223, + "step": 1412 + }, + { + "epoch": 3.642995480955455, + "grad_norm": 0.7478789535763465, + "learning_rate": 4.117768783384599e-06, + "loss": 0.7949, + "step": 1413 + }, + { + "epoch": 3.6455777921239507, + "grad_norm": 0.7767414531451252, + "learning_rate": 4.1031855946175455e-06, + "loss": 0.7961, + "step": 1414 + }, + { + "epoch": 3.648160103292447, + "grad_norm": 0.7840863973404958, + "learning_rate": 4.088621606660243e-06, + "loss": 0.7999, + "step": 1415 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.7547796898810605, + "learning_rate": 4.074076866934967e-06, + "loss": 0.818, + "step": 1416 + }, + { + "epoch": 3.6533247256294383, + "grad_norm": 0.7714902567445823, + "learning_rate": 4.05955142280132e-06, + "loss": 0.8105, + "step": 1417 + }, + { + "epoch": 3.655907036797934, + "grad_norm": 0.7532163616590967, + "learning_rate": 4.0450453215560684e-06, + "loss": 0.8049, + "step": 1418 + }, + { + "epoch": 3.65848934796643, + "grad_norm": 0.7773578119690996, + "learning_rate": 4.030558610433005e-06, + "loss": 0.7914, + "step": 1419 + }, + { + "epoch": 3.661071659134926, + "grad_norm": 0.763304292538588, + "learning_rate": 4.016091336602789e-06, + "loss": 0.8275, + "step": 1420 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.7804812337616664, + "learning_rate": 4.001643547172776e-06, + "loss": 0.8377, + "step": 1421 + }, + { + "epoch": 3.6662362814719174, + "grad_norm": 0.767412403491653, + "learning_rate": 3.987215289186881e-06, + "loss": 0.8256, + "step": 1422 + }, + { + "epoch": 3.668818592640413, + "grad_norm": 0.7617867902372989, + "learning_rate": 3.972806609625434e-06, + "loss": 0.8106, + "step": 1423 + }, + { + "epoch": 3.671400903808909, + "grad_norm": 0.7540382466723832, + "learning_rate": 3.958417555404999e-06, + "loss": 0.8074, + "step": 1424 + }, + { + "epoch": 3.6739832149774045, + "grad_norm": 0.7748311551652659, + "learning_rate": 3.9440481733782485e-06, + "loss": 0.8125, + "step": 1425 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.7623341018369493, + "learning_rate": 3.929698510333799e-06, + "loss": 0.8337, + "step": 1426 + }, + { + "epoch": 3.6791478373143964, + "grad_norm": 0.767758139184047, + "learning_rate": 3.915368612996055e-06, + "loss": 0.8341, + "step": 1427 + }, + { + "epoch": 3.681730148482892, + "grad_norm": 0.7519042843627542, + "learning_rate": 3.901058528025055e-06, + "loss": 0.8061, + "step": 1428 + }, + { + "epoch": 3.684312459651388, + "grad_norm": 0.7625093078444409, + "learning_rate": 3.8867683020163446e-06, + "loss": 0.822, + "step": 1429 + }, + { + "epoch": 3.686894770819884, + "grad_norm": 0.80028510095772, + "learning_rate": 3.872497981500787e-06, + "loss": 0.8502, + "step": 1430 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.7652238383245407, + "learning_rate": 3.8582476129444435e-06, + "loss": 0.8163, + "step": 1431 + }, + { + "epoch": 3.6920593931568755, + "grad_norm": 0.7842966117293941, + "learning_rate": 3.844017242748398e-06, + "loss": 0.7996, + "step": 1432 + }, + { + "epoch": 3.694641704325371, + "grad_norm": 0.7495726108816106, + "learning_rate": 3.829806917248631e-06, + "loss": 0.8061, + "step": 1433 + }, + { + "epoch": 3.697224015493867, + "grad_norm": 0.7579352515486196, + "learning_rate": 3.815616682715839e-06, + "loss": 0.7876, + "step": 1434 + }, + { + "epoch": 3.6998063266623626, + "grad_norm": 0.7788849623252266, + "learning_rate": 3.801446585355315e-06, + "loss": 0.8334, + "step": 1435 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.7445580252143607, + "learning_rate": 3.7872966713067683e-06, + "loss": 0.8182, + "step": 1436 + }, + { + "epoch": 3.7049709489993545, + "grad_norm": 0.7772631069572105, + "learning_rate": 3.773166986644202e-06, + "loss": 0.8149, + "step": 1437 + }, + { + "epoch": 3.7075532601678503, + "grad_norm": 0.7542443690987021, + "learning_rate": 3.7590575773757378e-06, + "loss": 0.8085, + "step": 1438 + }, + { + "epoch": 3.710135571336346, + "grad_norm": 0.7490006884397156, + "learning_rate": 3.744968489443488e-06, + "loss": 0.8364, + "step": 1439 + }, + { + "epoch": 3.7127178825048417, + "grad_norm": 0.7588964697785906, + "learning_rate": 3.7308997687233896e-06, + "loss": 0.8109, + "step": 1440 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.7401504987559857, + "learning_rate": 3.7168514610250594e-06, + "loss": 0.8026, + "step": 1441 + }, + { + "epoch": 3.7178825048418336, + "grad_norm": 0.7554850061896863, + "learning_rate": 3.7028236120916537e-06, + "loss": 0.8315, + "step": 1442 + }, + { + "epoch": 3.7204648160103293, + "grad_norm": 0.7804604627439944, + "learning_rate": 3.688816267599713e-06, + "loss": 0.8317, + "step": 1443 + }, + { + "epoch": 3.723047127178825, + "grad_norm": 0.7748081363692426, + "learning_rate": 3.6748294731590038e-06, + "loss": 0.811, + "step": 1444 + }, + { + "epoch": 3.7256294383473207, + "grad_norm": 0.7657357348049666, + "learning_rate": 3.6608632743123827e-06, + "loss": 0.8244, + "step": 1445 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.775640736208981, + "learning_rate": 3.6469177165356493e-06, + "loss": 0.835, + "step": 1446 + }, + { + "epoch": 3.730794060684312, + "grad_norm": 0.7864122051723232, + "learning_rate": 3.6329928452373843e-06, + "loss": 0.8354, + "step": 1447 + }, + { + "epoch": 3.7333763718528084, + "grad_norm": 0.7862243619147634, + "learning_rate": 3.6190887057588185e-06, + "loss": 0.8311, + "step": 1448 + }, + { + "epoch": 3.735958683021304, + "grad_norm": 0.7454712765612813, + "learning_rate": 3.6052053433736777e-06, + "loss": 0.8061, + "step": 1449 + }, + { + "epoch": 3.7385409941898, + "grad_norm": 0.7477377098403756, + "learning_rate": 3.591342803288027e-06, + "loss": 0.7974, + "step": 1450 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.7414265044721174, + "learning_rate": 3.5775011306401317e-06, + "loss": 0.8101, + "step": 1451 + }, + { + "epoch": 3.7437056165267917, + "grad_norm": 0.7682965899056072, + "learning_rate": 3.5636803705003174e-06, + "loss": 0.8396, + "step": 1452 + }, + { + "epoch": 3.7462879276952874, + "grad_norm": 0.7638628497129718, + "learning_rate": 3.5498805678708172e-06, + "loss": 0.8086, + "step": 1453 + }, + { + "epoch": 3.748870238863783, + "grad_norm": 0.7987728579111757, + "learning_rate": 3.5361017676856114e-06, + "loss": 0.8301, + "step": 1454 + }, + { + "epoch": 3.751452550032279, + "grad_norm": 0.7642546226790663, + "learning_rate": 3.5223440148103017e-06, + "loss": 0.8127, + "step": 1455 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.7590358311679077, + "learning_rate": 3.5086073540419594e-06, + "loss": 0.8299, + "step": 1456 + }, + { + "epoch": 3.7566171723692703, + "grad_norm": 0.7737724761627253, + "learning_rate": 3.4948918301089687e-06, + "loss": 0.7995, + "step": 1457 + }, + { + "epoch": 3.7591994835377665, + "grad_norm": 0.7893128648030869, + "learning_rate": 3.481197487670901e-06, + "loss": 0.8304, + "step": 1458 + }, + { + "epoch": 3.761781794706262, + "grad_norm": 0.7672293433531253, + "learning_rate": 3.4675243713183436e-06, + "loss": 0.8271, + "step": 1459 + }, + { + "epoch": 3.764364105874758, + "grad_norm": 0.781703741527432, + "learning_rate": 3.4538725255727855e-06, + "loss": 0.8248, + "step": 1460 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 0.7580531483398701, + "learning_rate": 3.4402419948864384e-06, + "loss": 0.7916, + "step": 1461 + }, + { + "epoch": 3.76952872821175, + "grad_norm": 0.7486908586065847, + "learning_rate": 3.426632823642123e-06, + "loss": 0.8137, + "step": 1462 + }, + { + "epoch": 3.7721110393802455, + "grad_norm": 0.7644509690799265, + "learning_rate": 3.4130450561531102e-06, + "loss": 0.8355, + "step": 1463 + }, + { + "epoch": 3.774693350548741, + "grad_norm": 0.7534559123863085, + "learning_rate": 3.3994787366629623e-06, + "loss": 0.8255, + "step": 1464 + }, + { + "epoch": 3.777275661717237, + "grad_norm": 0.7614512432818442, + "learning_rate": 3.385933909345419e-06, + "loss": 0.8115, + "step": 1465 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.786112853584683, + "learning_rate": 3.372410618304238e-06, + "loss": 0.8559, + "step": 1466 + }, + { + "epoch": 3.7824402840542284, + "grad_norm": 0.7573955890285036, + "learning_rate": 3.3589089075730474e-06, + "loss": 0.8079, + "step": 1467 + }, + { + "epoch": 3.785022595222724, + "grad_norm": 0.7339247578928705, + "learning_rate": 3.345428821115202e-06, + "loss": 0.8239, + "step": 1468 + }, + { + "epoch": 3.7876049063912203, + "grad_norm": 0.8027337173972606, + "learning_rate": 3.3319704028236553e-06, + "loss": 0.8258, + "step": 1469 + }, + { + "epoch": 3.790187217559716, + "grad_norm": 0.7602204125813788, + "learning_rate": 3.3185336965208057e-06, + "loss": 0.8267, + "step": 1470 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.7661870955667278, + "learning_rate": 3.3051187459583454e-06, + "loss": 0.8059, + "step": 1471 + }, + { + "epoch": 3.7953518398967074, + "grad_norm": 0.7519474368509446, + "learning_rate": 3.2917255948171366e-06, + "loss": 0.8056, + "step": 1472 + }, + { + "epoch": 3.7979341510652036, + "grad_norm": 0.7657682155523916, + "learning_rate": 3.2783542867070538e-06, + "loss": 0.8293, + "step": 1473 + }, + { + "epoch": 3.8005164622336993, + "grad_norm": 0.76866731372553, + "learning_rate": 3.2650048651668463e-06, + "loss": 0.847, + "step": 1474 + }, + { + "epoch": 3.803098773402195, + "grad_norm": 0.7770729452000317, + "learning_rate": 3.251677373664004e-06, + "loss": 0.8026, + "step": 1475 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.7444822909996885, + "learning_rate": 3.2383718555946098e-06, + "loss": 0.8205, + "step": 1476 + }, + { + "epoch": 3.8082633957391865, + "grad_norm": 0.7522360723786109, + "learning_rate": 3.2250883542831933e-06, + "loss": 0.7975, + "step": 1477 + }, + { + "epoch": 3.810845706907682, + "grad_norm": 0.7476561413065432, + "learning_rate": 3.211826912982591e-06, + "loss": 0.8302, + "step": 1478 + }, + { + "epoch": 3.813428018076178, + "grad_norm": 0.7624889384036899, + "learning_rate": 3.1985875748738193e-06, + "loss": 0.8336, + "step": 1479 + }, + { + "epoch": 3.816010329244674, + "grad_norm": 0.7611831427412808, + "learning_rate": 3.1853703830659223e-06, + "loss": 0.8241, + "step": 1480 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7522692175193729, + "learning_rate": 3.1721753805958245e-06, + "loss": 0.8464, + "step": 1481 + }, + { + "epoch": 3.8211749515816655, + "grad_norm": 0.7971847308682318, + "learning_rate": 3.1590026104282024e-06, + "loss": 0.8315, + "step": 1482 + }, + { + "epoch": 3.8237572627501613, + "grad_norm": 0.7769310232315048, + "learning_rate": 3.145852115455348e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 3.8263395739186574, + "grad_norm": 0.7815145076904971, + "learning_rate": 3.132723938497011e-06, + "loss": 0.8103, + "step": 1484 + }, + { + "epoch": 3.828921885087153, + "grad_norm": 0.7394387659719708, + "learning_rate": 3.1196181223002842e-06, + "loss": 0.8057, + "step": 1485 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.7552308531154759, + "learning_rate": 3.106534709539435e-06, + "loss": 0.8411, + "step": 1486 + }, + { + "epoch": 3.8340865074241446, + "grad_norm": 0.7493333935655804, + "learning_rate": 3.093473742815797e-06, + "loss": 0.8039, + "step": 1487 + }, + { + "epoch": 3.8366688185926403, + "grad_norm": 0.738912917115583, + "learning_rate": 3.0804352646576052e-06, + "loss": 0.8271, + "step": 1488 + }, + { + "epoch": 3.839251129761136, + "grad_norm": 0.7712798674043028, + "learning_rate": 3.067419317519875e-06, + "loss": 0.821, + "step": 1489 + }, + { + "epoch": 3.8418334409296317, + "grad_norm": 0.7673812637479288, + "learning_rate": 3.054425943784265e-06, + "loss": 0.8401, + "step": 1490 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 0.7690243688879774, + "learning_rate": 3.041455185758908e-06, + "loss": 0.7975, + "step": 1491 + }, + { + "epoch": 3.8469980632666236, + "grad_norm": 0.7488704099566282, + "learning_rate": 3.0285070856783206e-06, + "loss": 0.793, + "step": 1492 + }, + { + "epoch": 3.8495803744351194, + "grad_norm": 0.7468016897185966, + "learning_rate": 3.015581685703237e-06, + "loss": 0.8109, + "step": 1493 + }, + { + "epoch": 3.852162685603615, + "grad_norm": 0.7504249930796498, + "learning_rate": 3.0026790279204664e-06, + "loss": 0.8314, + "step": 1494 + }, + { + "epoch": 3.8547449967721112, + "grad_norm": 0.7557407963240177, + "learning_rate": 2.9897991543427797e-06, + "loss": 0.8327, + "step": 1495 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.7670324489918254, + "learning_rate": 2.976942106908749e-06, + "loss": 0.8292, + "step": 1496 + }, + { + "epoch": 3.8599096191091027, + "grad_norm": 0.7638293238485997, + "learning_rate": 2.9641079274826302e-06, + "loss": 0.8177, + "step": 1497 + }, + { + "epoch": 3.8624919302775984, + "grad_norm": 0.7712018435277216, + "learning_rate": 2.951296657854209e-06, + "loss": 0.8285, + "step": 1498 + }, + { + "epoch": 3.865074241446094, + "grad_norm": 0.7597575954985475, + "learning_rate": 2.938508339738683e-06, + "loss": 0.816, + "step": 1499 + }, + { + "epoch": 3.86765655261459, + "grad_norm": 0.7717927570202332, + "learning_rate": 2.9257430147765096e-06, + "loss": 0.8493, + "step": 1500 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.7649795668645112, + "learning_rate": 2.913000724533277e-06, + "loss": 0.7985, + "step": 1501 + }, + { + "epoch": 3.8728211749515817, + "grad_norm": 0.7542531439383557, + "learning_rate": 2.900281510499575e-06, + "loss": 0.8093, + "step": 1502 + }, + { + "epoch": 3.8754034861200775, + "grad_norm": 0.7519132753628803, + "learning_rate": 2.8875854140908544e-06, + "loss": 0.8137, + "step": 1503 + }, + { + "epoch": 3.877985797288573, + "grad_norm": 0.7514626395050483, + "learning_rate": 2.8749124766472858e-06, + "loss": 0.8094, + "step": 1504 + }, + { + "epoch": 3.8805681084570693, + "grad_norm": 0.7683645531072449, + "learning_rate": 2.862262739433631e-06, + "loss": 0.8132, + "step": 1505 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.7458416884658178, + "learning_rate": 2.8496362436391157e-06, + "loss": 0.8168, + "step": 1506 + }, + { + "epoch": 3.885732730794061, + "grad_norm": 0.7746722925229123, + "learning_rate": 2.8370330303772874e-06, + "loss": 0.7996, + "step": 1507 + }, + { + "epoch": 3.8883150419625565, + "grad_norm": 0.7553669840963009, + "learning_rate": 2.8244531406858765e-06, + "loss": 0.8288, + "step": 1508 + }, + { + "epoch": 3.8908973531310522, + "grad_norm": 0.7573845094295684, + "learning_rate": 2.81189661552667e-06, + "loss": 0.8374, + "step": 1509 + }, + { + "epoch": 3.893479664299548, + "grad_norm": 0.7400418799985741, + "learning_rate": 2.7993634957853843e-06, + "loss": 0.8375, + "step": 1510 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.7719250258407471, + "learning_rate": 2.7868538222715134e-06, + "loss": 0.826, + "step": 1511 + }, + { + "epoch": 3.89864428663654, + "grad_norm": 0.7563300858230262, + "learning_rate": 2.774367635718217e-06, + "loss": 0.7974, + "step": 1512 + }, + { + "epoch": 3.9012265978050356, + "grad_norm": 0.7458708691408243, + "learning_rate": 2.761904976782177e-06, + "loss": 0.8012, + "step": 1513 + }, + { + "epoch": 3.9038089089735313, + "grad_norm": 0.7442133907212378, + "learning_rate": 2.749465886043462e-06, + "loss": 0.8129, + "step": 1514 + }, + { + "epoch": 3.906391220142027, + "grad_norm": 0.7606057865370476, + "learning_rate": 2.7370504040053957e-06, + "loss": 0.7908, + "step": 1515 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7415475080933377, + "learning_rate": 2.7246585710944383e-06, + "loss": 0.8383, + "step": 1516 + }, + { + "epoch": 3.911555842479019, + "grad_norm": 0.7747366485131825, + "learning_rate": 2.7122904276600483e-06, + "loss": 0.8299, + "step": 1517 + }, + { + "epoch": 3.9141381536475146, + "grad_norm": 0.7470976085296888, + "learning_rate": 2.699946013974527e-06, + "loss": 0.8225, + "step": 1518 + }, + { + "epoch": 3.9167204648160103, + "grad_norm": 0.759056171874551, + "learning_rate": 2.68762537023293e-06, + "loss": 0.8079, + "step": 1519 + }, + { + "epoch": 3.919302775984506, + "grad_norm": 0.7626528987921768, + "learning_rate": 2.6753285365529103e-06, + "loss": 0.8272, + "step": 1520 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.7651769802741644, + "learning_rate": 2.6630555529745826e-06, + "loss": 0.8338, + "step": 1521 + }, + { + "epoch": 3.9244673983214975, + "grad_norm": 0.7774095564165737, + "learning_rate": 2.6508064594604157e-06, + "loss": 0.8203, + "step": 1522 + }, + { + "epoch": 3.9270497094899937, + "grad_norm": 0.7651143778092684, + "learning_rate": 2.638581295895075e-06, + "loss": 0.849, + "step": 1523 + }, + { + "epoch": 3.9296320206584894, + "grad_norm": 0.7903147596028517, + "learning_rate": 2.626380102085322e-06, + "loss": 0.8106, + "step": 1524 + }, + { + "epoch": 3.932214331826985, + "grad_norm": 0.7530887651415382, + "learning_rate": 2.614202917759855e-06, + "loss": 0.8333, + "step": 1525 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.77568209115188, + "learning_rate": 2.602049782569206e-06, + "loss": 0.8137, + "step": 1526 + }, + { + "epoch": 3.937378954163977, + "grad_norm": 0.7855983310276548, + "learning_rate": 2.5899207360855984e-06, + "loss": 0.7917, + "step": 1527 + }, + { + "epoch": 3.9399612653324727, + "grad_norm": 0.7531574331667908, + "learning_rate": 2.5778158178028045e-06, + "loss": 0.8178, + "step": 1528 + }, + { + "epoch": 3.9425435765009684, + "grad_norm": 0.7469480826297799, + "learning_rate": 2.5657350671360514e-06, + "loss": 0.844, + "step": 1529 + }, + { + "epoch": 3.945125887669464, + "grad_norm": 0.7762679927670401, + "learning_rate": 2.5536785234218664e-06, + "loss": 0.8234, + "step": 1530 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.7554503148115285, + "learning_rate": 2.541646225917954e-06, + "loss": 0.8214, + "step": 1531 + }, + { + "epoch": 3.9502905100064556, + "grad_norm": 0.7509360975231941, + "learning_rate": 2.529638213803065e-06, + "loss": 0.8096, + "step": 1532 + }, + { + "epoch": 3.9528728211749513, + "grad_norm": 0.71654004355707, + "learning_rate": 2.5176545261768847e-06, + "loss": 0.8168, + "step": 1533 + }, + { + "epoch": 3.9554551323434475, + "grad_norm": 0.7790891430190677, + "learning_rate": 2.5056952020598913e-06, + "loss": 0.8014, + "step": 1534 + }, + { + "epoch": 3.958037443511943, + "grad_norm": 0.7484517687807049, + "learning_rate": 2.4937602803932237e-06, + "loss": 0.8326, + "step": 1535 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 0.7746336735865199, + "learning_rate": 2.481849800038577e-06, + "loss": 0.8329, + "step": 1536 + }, + { + "epoch": 3.9632020658489346, + "grad_norm": 0.7508850795911187, + "learning_rate": 2.4699637997780503e-06, + "loss": 0.8104, + "step": 1537 + }, + { + "epoch": 3.965784377017431, + "grad_norm": 0.7624683021848369, + "learning_rate": 2.458102318314034e-06, + "loss": 0.8195, + "step": 1538 + }, + { + "epoch": 3.9683666881859265, + "grad_norm": 0.7576649154014872, + "learning_rate": 2.4462653942690895e-06, + "loss": 0.8154, + "step": 1539 + }, + { + "epoch": 3.9709489993544222, + "grad_norm": 0.7548460407193209, + "learning_rate": 2.4344530661858123e-06, + "loss": 0.8193, + "step": 1540 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.7602374654670865, + "learning_rate": 2.422665372526708e-06, + "loss": 0.8203, + "step": 1541 + }, + { + "epoch": 3.9761136216914137, + "grad_norm": 0.7675841326705145, + "learning_rate": 2.410902351674066e-06, + "loss": 0.8207, + "step": 1542 + }, + { + "epoch": 3.9786959328599094, + "grad_norm": 0.7514141818157982, + "learning_rate": 2.399164041929846e-06, + "loss": 0.7885, + "step": 1543 + }, + { + "epoch": 3.9812782440284056, + "grad_norm": 0.7642082673174962, + "learning_rate": 2.387450481515543e-06, + "loss": 0.799, + "step": 1544 + }, + { + "epoch": 3.9838605551969013, + "grad_norm": 0.7494471147548254, + "learning_rate": 2.3757617085720617e-06, + "loss": 0.8128, + "step": 1545 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7566122069144386, + "learning_rate": 2.364097761159594e-06, + "loss": 0.8212, + "step": 1546 + }, + { + "epoch": 3.9890251775338927, + "grad_norm": 0.7695050070788328, + "learning_rate": 2.3524586772575055e-06, + "loss": 0.8265, + "step": 1547 + }, + { + "epoch": 3.991607488702389, + "grad_norm": 0.767828875724427, + "learning_rate": 2.3408444947641897e-06, + "loss": 0.8107, + "step": 1548 + }, + { + "epoch": 3.9941897998708846, + "grad_norm": 0.7241738464533221, + "learning_rate": 2.3292552514969723e-06, + "loss": 0.8248, + "step": 1549 + }, + { + "epoch": 3.9967721110393803, + "grad_norm": 0.7625460770903586, + "learning_rate": 2.3176909851919593e-06, + "loss": 0.8179, + "step": 1550 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.741225630820047, + "learning_rate": 2.306151733503943e-06, + "loss": 0.7945, + "step": 1551 + }, + { + "epoch": 4.0, + "grad_norm": 1.6164851247119985, + "learning_rate": 2.294637534006251e-06, + "loss": 0.789, + "step": 1552 + }, + { + "epoch": 4.002582311168496, + "grad_norm": 1.424317513923708, + "learning_rate": 2.2831484241906456e-06, + "loss": 0.7301, + "step": 1553 + }, + { + "epoch": 4.005164622336991, + "grad_norm": 1.3624952198303995, + "learning_rate": 2.271684441467198e-06, + "loss": 0.7151, + "step": 1554 + }, + { + "epoch": 4.007746933505487, + "grad_norm": 1.2756030345247056, + "learning_rate": 2.2602456231641457e-06, + "loss": 0.73, + "step": 1555 + }, + { + "epoch": 4.010329244673983, + "grad_norm": 1.0331700783629776, + "learning_rate": 2.2488320065278034e-06, + "loss": 0.6833, + "step": 1556 + }, + { + "epoch": 4.012911555842479, + "grad_norm": 0.9404904636726831, + "learning_rate": 2.2374436287224245e-06, + "loss": 0.728, + "step": 1557 + }, + { + "epoch": 4.015493867010975, + "grad_norm": 0.9727430065578684, + "learning_rate": 2.22608052683007e-06, + "loss": 0.7489, + "step": 1558 + }, + { + "epoch": 4.018076178179471, + "grad_norm": 1.098170239940058, + "learning_rate": 2.214742737850514e-06, + "loss": 0.7356, + "step": 1559 + }, + { + "epoch": 4.020658489347967, + "grad_norm": 1.2733638094374413, + "learning_rate": 2.2034302987010938e-06, + "loss": 0.7244, + "step": 1560 + }, + { + "epoch": 4.023240800516462, + "grad_norm": 1.2876487754543966, + "learning_rate": 2.192143246216618e-06, + "loss": 0.71, + "step": 1561 + }, + { + "epoch": 4.025823111684958, + "grad_norm": 1.2629073802929212, + "learning_rate": 2.180881617149221e-06, + "loss": 0.7205, + "step": 1562 + }, + { + "epoch": 4.028405422853454, + "grad_norm": 1.179837995515697, + "learning_rate": 2.169645448168265e-06, + "loss": 0.7329, + "step": 1563 + }, + { + "epoch": 4.0309877340219495, + "grad_norm": 1.0543627752747324, + "learning_rate": 2.158434775860205e-06, + "loss": 0.7173, + "step": 1564 + }, + { + "epoch": 4.033570045190445, + "grad_norm": 0.9861615682326618, + "learning_rate": 2.1472496367284746e-06, + "loss": 0.7369, + "step": 1565 + }, + { + "epoch": 4.036152356358941, + "grad_norm": 1.0060005644443055, + "learning_rate": 2.1360900671933703e-06, + "loss": 0.7039, + "step": 1566 + }, + { + "epoch": 4.038734667527437, + "grad_norm": 0.949784297792835, + "learning_rate": 2.1249561035919364e-06, + "loss": 0.7236, + "step": 1567 + }, + { + "epoch": 4.041316978695932, + "grad_norm": 0.9737652328085534, + "learning_rate": 2.113847782177829e-06, + "loss": 0.7088, + "step": 1568 + }, + { + "epoch": 4.043899289864429, + "grad_norm": 0.9725504207142661, + "learning_rate": 2.1027651391212158e-06, + "loss": 0.7366, + "step": 1569 + }, + { + "epoch": 4.046481601032925, + "grad_norm": 0.9215401564734376, + "learning_rate": 2.091708210508654e-06, + "loss": 0.7031, + "step": 1570 + }, + { + "epoch": 4.0490639122014205, + "grad_norm": 0.902423487206708, + "learning_rate": 2.0806770323429725e-06, + "loss": 0.7369, + "step": 1571 + }, + { + "epoch": 4.051646223369916, + "grad_norm": 0.9424761496048374, + "learning_rate": 2.069671640543147e-06, + "loss": 0.7624, + "step": 1572 + }, + { + "epoch": 4.054228534538412, + "grad_norm": 0.9147025826090719, + "learning_rate": 2.0586920709441916e-06, + "loss": 0.719, + "step": 1573 + }, + { + "epoch": 4.056810845706908, + "grad_norm": 0.8911633869358179, + "learning_rate": 2.0477383592970445e-06, + "loss": 0.6934, + "step": 1574 + }, + { + "epoch": 4.059393156875403, + "grad_norm": 1.0041409401766892, + "learning_rate": 2.0368105412684393e-06, + "loss": 0.7207, + "step": 1575 + }, + { + "epoch": 4.061975468043899, + "grad_norm": 1.0810423533153977, + "learning_rate": 2.0259086524408036e-06, + "loss": 0.7488, + "step": 1576 + }, + { + "epoch": 4.064557779212395, + "grad_norm": 1.081615913030172, + "learning_rate": 2.015032728312134e-06, + "loss": 0.7308, + "step": 1577 + }, + { + "epoch": 4.0671400903808905, + "grad_norm": 0.9264001598492572, + "learning_rate": 2.0041828042958823e-06, + "loss": 0.7099, + "step": 1578 + }, + { + "epoch": 4.069722401549387, + "grad_norm": 0.9108042452749365, + "learning_rate": 1.9933589157208356e-06, + "loss": 0.706, + "step": 1579 + }, + { + "epoch": 4.072304712717883, + "grad_norm": 0.8937819117625528, + "learning_rate": 1.9825610978310127e-06, + "loss": 0.7104, + "step": 1580 + }, + { + "epoch": 4.074887023886379, + "grad_norm": 0.8825092444237356, + "learning_rate": 1.9717893857855475e-06, + "loss": 0.7053, + "step": 1581 + }, + { + "epoch": 4.077469335054874, + "grad_norm": 0.8817000599546978, + "learning_rate": 1.961043814658552e-06, + "loss": 0.7098, + "step": 1582 + }, + { + "epoch": 4.08005164622337, + "grad_norm": 0.9005717565381924, + "learning_rate": 1.950324419439035e-06, + "loss": 0.6968, + "step": 1583 + }, + { + "epoch": 4.082633957391866, + "grad_norm": 0.9122929744904504, + "learning_rate": 1.9396312350307722e-06, + "loss": 0.7119, + "step": 1584 + }, + { + "epoch": 4.0852162685603615, + "grad_norm": 0.8952175519583466, + "learning_rate": 1.9289642962521847e-06, + "loss": 0.7177, + "step": 1585 + }, + { + "epoch": 4.087798579728857, + "grad_norm": 0.8962217979338948, + "learning_rate": 1.918323637836247e-06, + "loss": 0.7047, + "step": 1586 + }, + { + "epoch": 4.090380890897353, + "grad_norm": 0.898920603548723, + "learning_rate": 1.9077092944303453e-06, + "loss": 0.7328, + "step": 1587 + }, + { + "epoch": 4.092963202065849, + "grad_norm": 0.9407672412231067, + "learning_rate": 1.8971213005961985e-06, + "loss": 0.7244, + "step": 1588 + }, + { + "epoch": 4.095545513234344, + "grad_norm": 0.9175278092820267, + "learning_rate": 1.8865596908097105e-06, + "loss": 0.7076, + "step": 1589 + }, + { + "epoch": 4.098127824402841, + "grad_norm": 0.9219067326264855, + "learning_rate": 1.8760244994608911e-06, + "loss": 0.7205, + "step": 1590 + }, + { + "epoch": 4.100710135571337, + "grad_norm": 0.9313883405997428, + "learning_rate": 1.8655157608537156e-06, + "loss": 0.7329, + "step": 1591 + }, + { + "epoch": 4.103292446739832, + "grad_norm": 0.9069473980493022, + "learning_rate": 1.855033509206029e-06, + "loss": 0.7058, + "step": 1592 + }, + { + "epoch": 4.105874757908328, + "grad_norm": 0.9042132782800456, + "learning_rate": 1.8445777786494356e-06, + "loss": 0.722, + "step": 1593 + }, + { + "epoch": 4.108457069076824, + "grad_norm": 0.8984738476555618, + "learning_rate": 1.8341486032291834e-06, + "loss": 0.6965, + "step": 1594 + }, + { + "epoch": 4.11103938024532, + "grad_norm": 0.9033341125566603, + "learning_rate": 1.823746016904049e-06, + "loss": 0.7043, + "step": 1595 + }, + { + "epoch": 4.113621691413815, + "grad_norm": 0.9024091267438483, + "learning_rate": 1.8133700535462274e-06, + "loss": 0.7181, + "step": 1596 + }, + { + "epoch": 4.116204002582311, + "grad_norm": 0.8840547702331727, + "learning_rate": 1.8030207469412374e-06, + "loss": 0.7137, + "step": 1597 + }, + { + "epoch": 4.118786313750807, + "grad_norm": 0.8791770361734527, + "learning_rate": 1.7926981307877944e-06, + "loss": 0.707, + "step": 1598 + }, + { + "epoch": 4.1213686249193024, + "grad_norm": 0.8765500683094899, + "learning_rate": 1.7824022386977014e-06, + "loss": 0.7332, + "step": 1599 + }, + { + "epoch": 4.123950936087798, + "grad_norm": 0.8727571261897714, + "learning_rate": 1.7721331041957535e-06, + "loss": 0.7026, + "step": 1600 + }, + { + "epoch": 4.126533247256295, + "grad_norm": 0.8893215641197096, + "learning_rate": 1.7618907607196112e-06, + "loss": 0.699, + "step": 1601 + }, + { + "epoch": 4.1291155584247905, + "grad_norm": 0.8809282308536279, + "learning_rate": 1.7516752416197013e-06, + "loss": 0.6937, + "step": 1602 + }, + { + "epoch": 4.131697869593286, + "grad_norm": 0.9065700149312429, + "learning_rate": 1.741486580159112e-06, + "loss": 0.7156, + "step": 1603 + }, + { + "epoch": 4.134280180761782, + "grad_norm": 0.9172825352706213, + "learning_rate": 1.7313248095134772e-06, + "loss": 0.7224, + "step": 1604 + }, + { + "epoch": 4.136862491930278, + "grad_norm": 0.9359192197464888, + "learning_rate": 1.7211899627708694e-06, + "loss": 0.7159, + "step": 1605 + }, + { + "epoch": 4.139444803098773, + "grad_norm": 0.8923723735789315, + "learning_rate": 1.711082072931689e-06, + "loss": 0.7144, + "step": 1606 + }, + { + "epoch": 4.142027114267269, + "grad_norm": 0.8698447798036731, + "learning_rate": 1.7010011729085696e-06, + "loss": 0.7183, + "step": 1607 + }, + { + "epoch": 4.144609425435765, + "grad_norm": 0.9184478130391627, + "learning_rate": 1.6909472955262596e-06, + "loss": 0.7542, + "step": 1608 + }, + { + "epoch": 4.1471917366042605, + "grad_norm": 0.8850461139260019, + "learning_rate": 1.6809204735215179e-06, + "loss": 0.7186, + "step": 1609 + }, + { + "epoch": 4.149774047772756, + "grad_norm": 0.8889238278435273, + "learning_rate": 1.6709207395430005e-06, + "loss": 0.7405, + "step": 1610 + }, + { + "epoch": 4.152356358941253, + "grad_norm": 0.9208003199238048, + "learning_rate": 1.660948126151175e-06, + "loss": 0.7124, + "step": 1611 + }, + { + "epoch": 4.154938670109749, + "grad_norm": 0.8762844910372851, + "learning_rate": 1.6510026658181866e-06, + "loss": 0.7292, + "step": 1612 + }, + { + "epoch": 4.157520981278244, + "grad_norm": 0.8940625291851263, + "learning_rate": 1.6410843909277784e-06, + "loss": 0.7186, + "step": 1613 + }, + { + "epoch": 4.16010329244674, + "grad_norm": 0.8584435328232947, + "learning_rate": 1.6311933337751652e-06, + "loss": 0.7018, + "step": 1614 + }, + { + "epoch": 4.162685603615236, + "grad_norm": 0.8889295547847345, + "learning_rate": 1.6213295265669448e-06, + "loss": 0.713, + "step": 1615 + }, + { + "epoch": 4.1652679147837315, + "grad_norm": 0.8961567659168423, + "learning_rate": 1.6114930014209763e-06, + "loss": 0.716, + "step": 1616 + }, + { + "epoch": 4.167850225952227, + "grad_norm": 0.8623292807248303, + "learning_rate": 1.601683790366293e-06, + "loss": 0.7409, + "step": 1617 + }, + { + "epoch": 4.170432537120723, + "grad_norm": 0.9014567493180559, + "learning_rate": 1.5919019253429923e-06, + "loss": 0.7147, + "step": 1618 + }, + { + "epoch": 4.173014848289219, + "grad_norm": 0.8966269088663105, + "learning_rate": 1.5821474382021128e-06, + "loss": 0.7202, + "step": 1619 + }, + { + "epoch": 4.175597159457714, + "grad_norm": 0.9039554140412117, + "learning_rate": 1.5724203607055655e-06, + "loss": 0.7208, + "step": 1620 + }, + { + "epoch": 4.17817947062621, + "grad_norm": 0.9117935626371781, + "learning_rate": 1.5627207245260046e-06, + "loss": 0.7252, + "step": 1621 + }, + { + "epoch": 4.180761781794706, + "grad_norm": 0.8838537392487884, + "learning_rate": 1.5530485612467317e-06, + "loss": 0.7143, + "step": 1622 + }, + { + "epoch": 4.183344092963202, + "grad_norm": 0.8872948835258441, + "learning_rate": 1.54340390236159e-06, + "loss": 0.6962, + "step": 1623 + }, + { + "epoch": 4.185926404131698, + "grad_norm": 0.8960284062739021, + "learning_rate": 1.5337867792748694e-06, + "loss": 0.7195, + "step": 1624 + }, + { + "epoch": 4.188508715300194, + "grad_norm": 0.8995137169581848, + "learning_rate": 1.5241972233012015e-06, + "loss": 0.6987, + "step": 1625 + }, + { + "epoch": 4.19109102646869, + "grad_norm": 0.8725877233542136, + "learning_rate": 1.5146352656654473e-06, + "loss": 0.6822, + "step": 1626 + }, + { + "epoch": 4.193673337637185, + "grad_norm": 0.9022708955863336, + "learning_rate": 1.5051009375026127e-06, + "loss": 0.7124, + "step": 1627 + }, + { + "epoch": 4.196255648805681, + "grad_norm": 0.894833962035567, + "learning_rate": 1.4955942698577341e-06, + "loss": 0.7362, + "step": 1628 + }, + { + "epoch": 4.198837959974177, + "grad_norm": 0.9297395231684169, + "learning_rate": 1.4861152936857792e-06, + "loss": 0.7272, + "step": 1629 + }, + { + "epoch": 4.2014202711426725, + "grad_norm": 0.9212227444100315, + "learning_rate": 1.476664039851554e-06, + "loss": 0.7345, + "step": 1630 + }, + { + "epoch": 4.204002582311168, + "grad_norm": 0.9040538736898476, + "learning_rate": 1.4672405391295964e-06, + "loss": 0.7202, + "step": 1631 + }, + { + "epoch": 4.206584893479664, + "grad_norm": 0.90954184793202, + "learning_rate": 1.4578448222040708e-06, + "loss": 0.7144, + "step": 1632 + }, + { + "epoch": 4.2091672046481605, + "grad_norm": 0.8765308727348899, + "learning_rate": 1.4484769196686777e-06, + "loss": 0.6932, + "step": 1633 + }, + { + "epoch": 4.211749515816656, + "grad_norm": 0.8913725709084924, + "learning_rate": 1.4391368620265522e-06, + "loss": 0.6839, + "step": 1634 + }, + { + "epoch": 4.214331826985152, + "grad_norm": 0.8949496649062495, + "learning_rate": 1.4298246796901615e-06, + "loss": 0.7081, + "step": 1635 + }, + { + "epoch": 4.216914138153648, + "grad_norm": 0.8890323192323862, + "learning_rate": 1.4205404029812043e-06, + "loss": 0.7148, + "step": 1636 + }, + { + "epoch": 4.219496449322143, + "grad_norm": 0.8898844083009926, + "learning_rate": 1.4112840621305156e-06, + "loss": 0.7055, + "step": 1637 + }, + { + "epoch": 4.222078760490639, + "grad_norm": 0.8973037287194257, + "learning_rate": 1.4020556872779723e-06, + "loss": 0.7001, + "step": 1638 + }, + { + "epoch": 4.224661071659135, + "grad_norm": 0.8881050754779889, + "learning_rate": 1.3928553084723828e-06, + "loss": 0.7029, + "step": 1639 + }, + { + "epoch": 4.227243382827631, + "grad_norm": 0.9282818296375939, + "learning_rate": 1.3836829556714027e-06, + "loss": 0.7436, + "step": 1640 + }, + { + "epoch": 4.229825693996126, + "grad_norm": 0.896945078296962, + "learning_rate": 1.3745386587414312e-06, + "loss": 0.7051, + "step": 1641 + }, + { + "epoch": 4.232408005164622, + "grad_norm": 0.8904775792068886, + "learning_rate": 1.3654224474575105e-06, + "loss": 0.75, + "step": 1642 + }, + { + "epoch": 4.234990316333118, + "grad_norm": 0.9051407253126452, + "learning_rate": 1.3563343515032312e-06, + "loss": 0.7122, + "step": 1643 + }, + { + "epoch": 4.237572627501614, + "grad_norm": 0.875475228439986, + "learning_rate": 1.3472744004706406e-06, + "loss": 0.7138, + "step": 1644 + }, + { + "epoch": 4.24015493867011, + "grad_norm": 0.9170090364957498, + "learning_rate": 1.3382426238601443e-06, + "loss": 0.7209, + "step": 1645 + }, + { + "epoch": 4.242737249838606, + "grad_norm": 0.8963557044070765, + "learning_rate": 1.3292390510803987e-06, + "loss": 0.7207, + "step": 1646 + }, + { + "epoch": 4.2453195610071015, + "grad_norm": 0.8870017097792205, + "learning_rate": 1.320263711448232e-06, + "loss": 0.7344, + "step": 1647 + }, + { + "epoch": 4.247901872175597, + "grad_norm": 0.8643621623469757, + "learning_rate": 1.3113166341885453e-06, + "loss": 0.6909, + "step": 1648 + }, + { + "epoch": 4.250484183344093, + "grad_norm": 0.88693541516387, + "learning_rate": 1.3023978484342027e-06, + "loss": 0.7172, + "step": 1649 + }, + { + "epoch": 4.253066494512589, + "grad_norm": 0.8827410284281785, + "learning_rate": 1.293507383225958e-06, + "loss": 0.6974, + "step": 1650 + }, + { + "epoch": 4.255648805681084, + "grad_norm": 0.8921892267635948, + "learning_rate": 1.2846452675123412e-06, + "loss": 0.7198, + "step": 1651 + }, + { + "epoch": 4.25823111684958, + "grad_norm": 0.8699520076203895, + "learning_rate": 1.275811530149581e-06, + "loss": 0.712, + "step": 1652 + }, + { + "epoch": 4.260813428018076, + "grad_norm": 0.9053418709356683, + "learning_rate": 1.2670061999014926e-06, + "loss": 0.711, + "step": 1653 + }, + { + "epoch": 4.263395739186572, + "grad_norm": 0.8933588527941319, + "learning_rate": 1.2582293054394034e-06, + "loss": 0.7191, + "step": 1654 + }, + { + "epoch": 4.265978050355068, + "grad_norm": 0.8957606594695622, + "learning_rate": 1.249480875342044e-06, + "loss": 0.7013, + "step": 1655 + }, + { + "epoch": 4.268560361523564, + "grad_norm": 0.8896835872883254, + "learning_rate": 1.240760938095461e-06, + "loss": 0.6909, + "step": 1656 + }, + { + "epoch": 4.27114267269206, + "grad_norm": 0.902860652986643, + "learning_rate": 1.232069522092929e-06, + "loss": 0.7023, + "step": 1657 + }, + { + "epoch": 4.273724983860555, + "grad_norm": 0.8991644670101375, + "learning_rate": 1.2234066556348524e-06, + "loss": 0.7201, + "step": 1658 + }, + { + "epoch": 4.276307295029051, + "grad_norm": 0.871953669734789, + "learning_rate": 1.2147723669286703e-06, + "loss": 0.7135, + "step": 1659 + }, + { + "epoch": 4.278889606197547, + "grad_norm": 0.8976349496395944, + "learning_rate": 1.206166684088772e-06, + "loss": 0.7142, + "step": 1660 + }, + { + "epoch": 4.2814719173660425, + "grad_norm": 0.9150642282309124, + "learning_rate": 1.1975896351364036e-06, + "loss": 0.717, + "step": 1661 + }, + { + "epoch": 4.284054228534538, + "grad_norm": 0.89830091523198, + "learning_rate": 1.189041247999575e-06, + "loss": 0.7038, + "step": 1662 + }, + { + "epoch": 4.286636539703034, + "grad_norm": 0.9002768402003223, + "learning_rate": 1.1805215505129653e-06, + "loss": 0.7171, + "step": 1663 + }, + { + "epoch": 4.28921885087153, + "grad_norm": 0.9189338614452367, + "learning_rate": 1.1720305704178436e-06, + "loss": 0.7198, + "step": 1664 + }, + { + "epoch": 4.291801162040025, + "grad_norm": 0.9241169326437726, + "learning_rate": 1.1635683353619643e-06, + "loss": 0.733, + "step": 1665 + }, + { + "epoch": 4.294383473208522, + "grad_norm": 0.9074601151507607, + "learning_rate": 1.1551348728994849e-06, + "loss": 0.7251, + "step": 1666 + }, + { + "epoch": 4.296965784377018, + "grad_norm": 0.9201495613344312, + "learning_rate": 1.1467302104908796e-06, + "loss": 0.7261, + "step": 1667 + }, + { + "epoch": 4.299548095545513, + "grad_norm": 0.8938379902505227, + "learning_rate": 1.138354375502847e-06, + "loss": 0.6994, + "step": 1668 + }, + { + "epoch": 4.302130406714009, + "grad_norm": 0.9336658672822801, + "learning_rate": 1.1300073952082147e-06, + "loss": 0.7156, + "step": 1669 + }, + { + "epoch": 4.304712717882505, + "grad_norm": 0.8831440608193399, + "learning_rate": 1.121689296785854e-06, + "loss": 0.6995, + "step": 1670 + }, + { + "epoch": 4.307295029051001, + "grad_norm": 0.8799798301503751, + "learning_rate": 1.1134001073206025e-06, + "loss": 0.7193, + "step": 1671 + }, + { + "epoch": 4.309877340219496, + "grad_norm": 0.8804550173412993, + "learning_rate": 1.1051398538031544e-06, + "loss": 0.7258, + "step": 1672 + }, + { + "epoch": 4.312459651387992, + "grad_norm": 0.9036007303009939, + "learning_rate": 1.0969085631299946e-06, + "loss": 0.708, + "step": 1673 + }, + { + "epoch": 4.315041962556488, + "grad_norm": 0.9024030379125595, + "learning_rate": 1.0887062621032951e-06, + "loss": 0.7055, + "step": 1674 + }, + { + "epoch": 4.3176242737249835, + "grad_norm": 0.8894223361973113, + "learning_rate": 1.0805329774308392e-06, + "loss": 0.726, + "step": 1675 + }, + { + "epoch": 4.32020658489348, + "grad_norm": 0.885938538418052, + "learning_rate": 1.072388735725921e-06, + "loss": 0.703, + "step": 1676 + }, + { + "epoch": 4.322788896061976, + "grad_norm": 0.9305506401372774, + "learning_rate": 1.0642735635072764e-06, + "loss": 0.7266, + "step": 1677 + }, + { + "epoch": 4.3253712072304715, + "grad_norm": 0.9113060756686157, + "learning_rate": 1.0561874871989775e-06, + "loss": 0.7206, + "step": 1678 + }, + { + "epoch": 4.327953518398967, + "grad_norm": 0.889475937075751, + "learning_rate": 1.0481305331303659e-06, + "loss": 0.7145, + "step": 1679 + }, + { + "epoch": 4.330535829567463, + "grad_norm": 0.9193184675001664, + "learning_rate": 1.0401027275359487e-06, + "loss": 0.7213, + "step": 1680 + }, + { + "epoch": 4.333118140735959, + "grad_norm": 0.9057888219841576, + "learning_rate": 1.0321040965553286e-06, + "loss": 0.7142, + "step": 1681 + }, + { + "epoch": 4.335700451904454, + "grad_norm": 0.9181347146447232, + "learning_rate": 1.0241346662331075e-06, + "loss": 0.7001, + "step": 1682 + }, + { + "epoch": 4.33828276307295, + "grad_norm": 0.8761392579902068, + "learning_rate": 1.0161944625188046e-06, + "loss": 0.7144, + "step": 1683 + }, + { + "epoch": 4.340865074241446, + "grad_norm": 0.8957507000212769, + "learning_rate": 1.008283511266781e-06, + "loss": 0.72, + "step": 1684 + }, + { + "epoch": 4.343447385409942, + "grad_norm": 0.9048968155853336, + "learning_rate": 1.0004018382361414e-06, + "loss": 0.7154, + "step": 1685 + }, + { + "epoch": 4.346029696578437, + "grad_norm": 0.9061516663607375, + "learning_rate": 9.92549469090659e-07, + "loss": 0.7138, + "step": 1686 + }, + { + "epoch": 4.348612007746934, + "grad_norm": 0.907960683381014, + "learning_rate": 9.847264293986869e-07, + "loss": 0.6933, + "step": 1687 + }, + { + "epoch": 4.35119431891543, + "grad_norm": 0.9335500032392648, + "learning_rate": 9.769327446330802e-07, + "loss": 0.7203, + "step": 1688 + }, + { + "epoch": 4.353776630083925, + "grad_norm": 0.8979953039902708, + "learning_rate": 9.691684401711143e-07, + "loss": 0.7335, + "step": 1689 + }, + { + "epoch": 4.356358941252421, + "grad_norm": 0.884673020616097, + "learning_rate": 9.614335412943887e-07, + "loss": 0.7141, + "step": 1690 + }, + { + "epoch": 4.358941252420917, + "grad_norm": 0.8749160348750349, + "learning_rate": 9.537280731887644e-07, + "loss": 0.7038, + "step": 1691 + }, + { + "epoch": 4.3615235635894125, + "grad_norm": 0.8709729980037048, + "learning_rate": 9.460520609442647e-07, + "loss": 0.6812, + "step": 1692 + }, + { + "epoch": 4.364105874757908, + "grad_norm": 0.8750487222127739, + "learning_rate": 9.384055295550032e-07, + "loss": 0.69, + "step": 1693 + }, + { + "epoch": 4.366688185926404, + "grad_norm": 0.8996853241438628, + "learning_rate": 9.307885039191011e-07, + "loss": 0.7232, + "step": 1694 + }, + { + "epoch": 4.3692704970949, + "grad_norm": 0.908802912077574, + "learning_rate": 9.232010088386067e-07, + "loss": 0.7062, + "step": 1695 + }, + { + "epoch": 4.371852808263395, + "grad_norm": 0.9168462712654659, + "learning_rate": 9.156430690194074e-07, + "loss": 0.7084, + "step": 1696 + }, + { + "epoch": 4.374435119431892, + "grad_norm": 0.9259940687964294, + "learning_rate": 9.081147090711562e-07, + "loss": 0.742, + "step": 1697 + }, + { + "epoch": 4.377017430600388, + "grad_norm": 0.9383290055652561, + "learning_rate": 9.006159535071945e-07, + "loss": 0.7364, + "step": 1698 + }, + { + "epoch": 4.3795997417688834, + "grad_norm": 0.8900989908997234, + "learning_rate": 8.93146826744462e-07, + "loss": 0.6925, + "step": 1699 + }, + { + "epoch": 4.382182052937379, + "grad_norm": 0.906245275227718, + "learning_rate": 8.8570735310343e-07, + "loss": 0.7152, + "step": 1700 + }, + { + "epoch": 4.384764364105875, + "grad_norm": 0.8817115384410557, + "learning_rate": 8.782975568080066e-07, + "loss": 0.7119, + "step": 1701 + }, + { + "epoch": 4.387346675274371, + "grad_norm": 0.8936803543044946, + "learning_rate": 8.709174619854766e-07, + "loss": 0.7221, + "step": 1702 + }, + { + "epoch": 4.389928986442866, + "grad_norm": 0.9106707340778925, + "learning_rate": 8.635670926664019e-07, + "loss": 0.7159, + "step": 1703 + }, + { + "epoch": 4.392511297611362, + "grad_norm": 0.8995573561800408, + "learning_rate": 8.562464727845621e-07, + "loss": 0.7232, + "step": 1704 + }, + { + "epoch": 4.395093608779858, + "grad_norm": 0.9242674966931874, + "learning_rate": 8.489556261768694e-07, + "loss": 0.7511, + "step": 1705 + }, + { + "epoch": 4.3976759199483535, + "grad_norm": 0.8973937221578497, + "learning_rate": 8.41694576583284e-07, + "loss": 0.7151, + "step": 1706 + }, + { + "epoch": 4.400258231116849, + "grad_norm": 0.9016096213701568, + "learning_rate": 8.344633476467456e-07, + "loss": 0.7555, + "step": 1707 + }, + { + "epoch": 4.402840542285345, + "grad_norm": 0.9070638086527365, + "learning_rate": 8.272619629130984e-07, + "loss": 0.7405, + "step": 1708 + }, + { + "epoch": 4.4054228534538415, + "grad_norm": 0.8882275237620317, + "learning_rate": 8.200904458310022e-07, + "loss": 0.6947, + "step": 1709 + }, + { + "epoch": 4.408005164622337, + "grad_norm": 0.8804791482936096, + "learning_rate": 8.129488197518687e-07, + "loss": 0.6977, + "step": 1710 + }, + { + "epoch": 4.410587475790833, + "grad_norm": 0.9272005439255128, + "learning_rate": 8.0583710792978e-07, + "loss": 0.7132, + "step": 1711 + }, + { + "epoch": 4.413169786959329, + "grad_norm": 0.8899287069527094, + "learning_rate": 7.987553335214149e-07, + "loss": 0.731, + "step": 1712 + }, + { + "epoch": 4.415752098127824, + "grad_norm": 0.8808619191876175, + "learning_rate": 7.917035195859668e-07, + "loss": 0.7265, + "step": 1713 + }, + { + "epoch": 4.41833440929632, + "grad_norm": 0.8969831722003705, + "learning_rate": 7.846816890850806e-07, + "loss": 0.7116, + "step": 1714 + }, + { + "epoch": 4.420916720464816, + "grad_norm": 0.8924810647720324, + "learning_rate": 7.776898648827647e-07, + "loss": 0.7146, + "step": 1715 + }, + { + "epoch": 4.423499031633312, + "grad_norm": 1.1018976996499141, + "learning_rate": 7.707280697453256e-07, + "loss": 0.6941, + "step": 1716 + }, + { + "epoch": 4.426081342801807, + "grad_norm": 0.894570214234641, + "learning_rate": 7.637963263412929e-07, + "loss": 0.7145, + "step": 1717 + }, + { + "epoch": 4.428663653970303, + "grad_norm": 0.907267880331535, + "learning_rate": 7.568946572413438e-07, + "loss": 0.7239, + "step": 1718 + }, + { + "epoch": 4.4312459651388, + "grad_norm": 0.904439736544676, + "learning_rate": 7.500230849182278e-07, + "loss": 0.7148, + "step": 1719 + }, + { + "epoch": 4.433828276307295, + "grad_norm": 0.9165453956707031, + "learning_rate": 7.431816317466923e-07, + "loss": 0.7276, + "step": 1720 + }, + { + "epoch": 4.436410587475791, + "grad_norm": 0.8958325943917459, + "learning_rate": 7.363703200034177e-07, + "loss": 0.7121, + "step": 1721 + }, + { + "epoch": 4.438992898644287, + "grad_norm": 0.9140196598832558, + "learning_rate": 7.295891718669423e-07, + "loss": 0.7331, + "step": 1722 + }, + { + "epoch": 4.4415752098127825, + "grad_norm": 0.8872380269740475, + "learning_rate": 7.228382094175801e-07, + "loss": 0.7001, + "step": 1723 + }, + { + "epoch": 4.444157520981278, + "grad_norm": 0.9270216958684496, + "learning_rate": 7.161174546373595e-07, + "loss": 0.7181, + "step": 1724 + }, + { + "epoch": 4.446739832149774, + "grad_norm": 0.9132455796590981, + "learning_rate": 7.094269294099509e-07, + "loss": 0.731, + "step": 1725 + }, + { + "epoch": 4.44932214331827, + "grad_norm": 0.9227817170397697, + "learning_rate": 7.027666555205915e-07, + "loss": 0.7337, + "step": 1726 + }, + { + "epoch": 4.451904454486765, + "grad_norm": 0.9210558504510359, + "learning_rate": 6.961366546560156e-07, + "loss": 0.7291, + "step": 1727 + }, + { + "epoch": 4.454486765655261, + "grad_norm": 0.9075281818010581, + "learning_rate": 6.895369484043879e-07, + "loss": 0.7321, + "step": 1728 + }, + { + "epoch": 4.457069076823757, + "grad_norm": 1.0617614842864411, + "learning_rate": 6.829675582552253e-07, + "loss": 0.6943, + "step": 1729 + }, + { + "epoch": 4.4596513879922535, + "grad_norm": 0.9007403795148768, + "learning_rate": 6.764285055993313e-07, + "loss": 0.7094, + "step": 1730 + }, + { + "epoch": 4.462233699160749, + "grad_norm": 0.9117234111423507, + "learning_rate": 6.699198117287309e-07, + "loss": 0.7385, + "step": 1731 + }, + { + "epoch": 4.464816010329245, + "grad_norm": 0.8912704657574764, + "learning_rate": 6.634414978365978e-07, + "loss": 0.7145, + "step": 1732 + }, + { + "epoch": 4.467398321497741, + "grad_norm": 0.9112230980418938, + "learning_rate": 6.569935850171749e-07, + "loss": 0.7199, + "step": 1733 + }, + { + "epoch": 4.469980632666236, + "grad_norm": 0.9068319118246875, + "learning_rate": 6.505760942657235e-07, + "loss": 0.728, + "step": 1734 + }, + { + "epoch": 4.472562943834732, + "grad_norm": 0.8698085381695577, + "learning_rate": 6.441890464784473e-07, + "loss": 0.6873, + "step": 1735 + }, + { + "epoch": 4.475145255003228, + "grad_norm": 0.8820814075457806, + "learning_rate": 6.37832462452418e-07, + "loss": 0.7087, + "step": 1736 + }, + { + "epoch": 4.4777275661717235, + "grad_norm": 0.9076300266568639, + "learning_rate": 6.315063628855178e-07, + "loss": 0.7207, + "step": 1737 + }, + { + "epoch": 4.480309877340219, + "grad_norm": 0.8761294190152139, + "learning_rate": 6.252107683763642e-07, + "loss": 0.7028, + "step": 1738 + }, + { + "epoch": 4.482892188508715, + "grad_norm": 0.9023071618115094, + "learning_rate": 6.189456994242516e-07, + "loss": 0.7548, + "step": 1739 + }, + { + "epoch": 4.485474499677212, + "grad_norm": 0.9032118146111997, + "learning_rate": 6.127111764290694e-07, + "loss": 0.7198, + "step": 1740 + }, + { + "epoch": 4.488056810845707, + "grad_norm": 0.9423906564703732, + "learning_rate": 6.065072196912569e-07, + "loss": 0.7192, + "step": 1741 + }, + { + "epoch": 4.490639122014203, + "grad_norm": 0.8871664697024927, + "learning_rate": 6.003338494117183e-07, + "loss": 0.7261, + "step": 1742 + }, + { + "epoch": 4.493221433182699, + "grad_norm": 0.8723986193214446, + "learning_rate": 5.941910856917643e-07, + "loss": 0.6919, + "step": 1743 + }, + { + "epoch": 4.4958037443511945, + "grad_norm": 0.9160786881274069, + "learning_rate": 5.880789485330484e-07, + "loss": 0.7184, + "step": 1744 + }, + { + "epoch": 4.49838605551969, + "grad_norm": 0.8749173043898083, + "learning_rate": 5.81997457837502e-07, + "loss": 0.7038, + "step": 1745 + }, + { + "epoch": 4.500968366688186, + "grad_norm": 0.8836993274435628, + "learning_rate": 5.75946633407265e-07, + "loss": 0.7058, + "step": 1746 + }, + { + "epoch": 4.503550677856682, + "grad_norm": 0.9168059326784899, + "learning_rate": 5.699264949446215e-07, + "loss": 0.7576, + "step": 1747 + }, + { + "epoch": 4.506132989025177, + "grad_norm": 0.916608005705834, + "learning_rate": 5.639370620519424e-07, + "loss": 0.7176, + "step": 1748 + }, + { + "epoch": 4.508715300193673, + "grad_norm": 0.8792129119048742, + "learning_rate": 5.579783542316175e-07, + "loss": 0.7004, + "step": 1749 + }, + { + "epoch": 4.511297611362169, + "grad_norm": 0.9102642584599099, + "learning_rate": 5.520503908859876e-07, + "loss": 0.7296, + "step": 1750 + }, + { + "epoch": 4.5138799225306645, + "grad_norm": 0.8976181656024875, + "learning_rate": 5.461531913172869e-07, + "loss": 0.7137, + "step": 1751 + }, + { + "epoch": 4.516462233699161, + "grad_norm": 0.8955819573412418, + "learning_rate": 5.40286774727582e-07, + "loss": 0.7243, + "step": 1752 + }, + { + "epoch": 4.519044544867657, + "grad_norm": 0.8848415473903943, + "learning_rate": 5.344511602186986e-07, + "loss": 0.6937, + "step": 1753 + }, + { + "epoch": 4.5216268560361526, + "grad_norm": 0.8935210617924505, + "learning_rate": 5.28646366792176e-07, + "loss": 0.7115, + "step": 1754 + }, + { + "epoch": 4.524209167204648, + "grad_norm": 0.9277553981673546, + "learning_rate": 5.228724133491903e-07, + "loss": 0.7464, + "step": 1755 + }, + { + "epoch": 4.526791478373144, + "grad_norm": 0.8697756183845172, + "learning_rate": 5.171293186904991e-07, + "loss": 0.6713, + "step": 1756 + }, + { + "epoch": 4.52937378954164, + "grad_norm": 0.8699331724645898, + "learning_rate": 5.114171015163793e-07, + "loss": 0.6981, + "step": 1757 + }, + { + "epoch": 4.531956100710135, + "grad_norm": 0.8867759757326906, + "learning_rate": 5.057357804265695e-07, + "loss": 0.713, + "step": 1758 + }, + { + "epoch": 4.534538411878631, + "grad_norm": 0.8965559347020167, + "learning_rate": 5.000853739202039e-07, + "loss": 0.7084, + "step": 1759 + }, + { + "epoch": 4.537120723047127, + "grad_norm": 0.9176654044616335, + "learning_rate": 4.944659003957564e-07, + "loss": 0.7214, + "step": 1760 + }, + { + "epoch": 4.539703034215623, + "grad_norm": 0.9221143520181255, + "learning_rate": 4.888773781509748e-07, + "loss": 0.737, + "step": 1761 + }, + { + "epoch": 4.542285345384119, + "grad_norm": 0.9322259452120046, + "learning_rate": 4.833198253828331e-07, + "loss": 0.7416, + "step": 1762 + }, + { + "epoch": 4.544867656552615, + "grad_norm": 0.9208718551935449, + "learning_rate": 4.777932601874557e-07, + "loss": 0.7487, + "step": 1763 + }, + { + "epoch": 4.547449967721111, + "grad_norm": 0.8890662647225912, + "learning_rate": 4.7229770056007707e-07, + "loss": 0.6894, + "step": 1764 + }, + { + "epoch": 4.550032278889606, + "grad_norm": 0.887652550850276, + "learning_rate": 4.66833164394962e-07, + "loss": 0.7031, + "step": 1765 + }, + { + "epoch": 4.552614590058102, + "grad_norm": 0.8951195625274412, + "learning_rate": 4.6139966948537064e-07, + "loss": 0.7419, + "step": 1766 + }, + { + "epoch": 4.555196901226598, + "grad_norm": 0.9259533456732394, + "learning_rate": 4.5599723352347857e-07, + "loss": 0.6975, + "step": 1767 + }, + { + "epoch": 4.5577792123950935, + "grad_norm": 0.9119373622565761, + "learning_rate": 4.5062587410033663e-07, + "loss": 0.727, + "step": 1768 + }, + { + "epoch": 4.560361523563589, + "grad_norm": 0.8749876771396683, + "learning_rate": 4.452856087058044e-07, + "loss": 0.6747, + "step": 1769 + }, + { + "epoch": 4.562943834732085, + "grad_norm": 0.8781088555589601, + "learning_rate": 4.3997645472849016e-07, + "loss": 0.7024, + "step": 1770 + }, + { + "epoch": 4.565526145900581, + "grad_norm": 0.8811255741349678, + "learning_rate": 4.346984294557055e-07, + "loss": 0.7078, + "step": 1771 + }, + { + "epoch": 4.568108457069076, + "grad_norm": 0.9013328171452745, + "learning_rate": 4.29451550073402e-07, + "loss": 0.7471, + "step": 1772 + }, + { + "epoch": 4.570690768237572, + "grad_norm": 0.9233092729022429, + "learning_rate": 4.2423583366611345e-07, + "loss": 0.7443, + "step": 1773 + }, + { + "epoch": 4.573273079406069, + "grad_norm": 0.8783319514354955, + "learning_rate": 4.190512972169036e-07, + "loss": 0.7247, + "step": 1774 + }, + { + "epoch": 4.5758553905745645, + "grad_norm": 0.8856387585636677, + "learning_rate": 4.13897957607311e-07, + "loss": 0.701, + "step": 1775 + }, + { + "epoch": 4.57843770174306, + "grad_norm": 0.9032296044304654, + "learning_rate": 4.0877583161729406e-07, + "loss": 0.7, + "step": 1776 + }, + { + "epoch": 4.581020012911556, + "grad_norm": 0.8704929901157932, + "learning_rate": 4.036849359251738e-07, + "loss": 0.7071, + "step": 1777 + }, + { + "epoch": 4.583602324080052, + "grad_norm": 0.9068350583367365, + "learning_rate": 3.986252871075813e-07, + "loss": 0.6992, + "step": 1778 + }, + { + "epoch": 4.586184635248547, + "grad_norm": 0.874153868567516, + "learning_rate": 3.935969016394048e-07, + "loss": 0.708, + "step": 1779 + }, + { + "epoch": 4.588766946417043, + "grad_norm": 0.8976406116845383, + "learning_rate": 3.8859979589373265e-07, + "loss": 0.7182, + "step": 1780 + }, + { + "epoch": 4.591349257585539, + "grad_norm": 0.8973977227814353, + "learning_rate": 3.836339861418059e-07, + "loss": 0.6996, + "step": 1781 + }, + { + "epoch": 4.5939315687540345, + "grad_norm": 0.9057741077447837, + "learning_rate": 3.786994885529582e-07, + "loss": 0.707, + "step": 1782 + }, + { + "epoch": 4.596513879922531, + "grad_norm": 0.9248023645258433, + "learning_rate": 3.7379631919457036e-07, + "loss": 0.7433, + "step": 1783 + }, + { + "epoch": 4.599096191091027, + "grad_norm": 0.8815428248966636, + "learning_rate": 3.6892449403200805e-07, + "loss": 0.7049, + "step": 1784 + }, + { + "epoch": 4.601678502259523, + "grad_norm": 0.8865336637004515, + "learning_rate": 3.6408402892858297e-07, + "loss": 0.7074, + "step": 1785 + }, + { + "epoch": 4.604260813428018, + "grad_norm": 0.8908085795505662, + "learning_rate": 3.592749396454931e-07, + "loss": 0.7158, + "step": 1786 + }, + { + "epoch": 4.606843124596514, + "grad_norm": 0.895409574949093, + "learning_rate": 3.5449724184176695e-07, + "loss": 0.7006, + "step": 1787 + }, + { + "epoch": 4.60942543576501, + "grad_norm": 0.8953545545983936, + "learning_rate": 3.4975095107422473e-07, + "loss": 0.7043, + "step": 1788 + }, + { + "epoch": 4.6120077469335055, + "grad_norm": 0.9024082202649676, + "learning_rate": 3.450360827974175e-07, + "loss": 0.7188, + "step": 1789 + }, + { + "epoch": 4.614590058102001, + "grad_norm": 0.8901535896655877, + "learning_rate": 3.403526523635825e-07, + "loss": 0.7044, + "step": 1790 + }, + { + "epoch": 4.617172369270497, + "grad_norm": 0.8661077661763952, + "learning_rate": 3.3570067502258887e-07, + "loss": 0.6895, + "step": 1791 + }, + { + "epoch": 4.619754680438993, + "grad_norm": 0.8685402290165503, + "learning_rate": 3.310801659218943e-07, + "loss": 0.7115, + "step": 1792 + }, + { + "epoch": 4.622336991607488, + "grad_norm": 0.8964236761531204, + "learning_rate": 3.264911401064874e-07, + "loss": 0.7268, + "step": 1793 + }, + { + "epoch": 4.624919302775984, + "grad_norm": 0.8907216569293869, + "learning_rate": 3.219336125188455e-07, + "loss": 0.7009, + "step": 1794 + }, + { + "epoch": 4.627501613944481, + "grad_norm": 0.8752296130228919, + "learning_rate": 3.174075979988811e-07, + "loss": 0.7155, + "step": 1795 + }, + { + "epoch": 4.630083925112976, + "grad_norm": 0.8970700959270853, + "learning_rate": 3.1291311128390233e-07, + "loss": 0.7261, + "step": 1796 + }, + { + "epoch": 4.632666236281472, + "grad_norm": 0.8832170790109071, + "learning_rate": 3.0845016700854827e-07, + "loss": 0.6962, + "step": 1797 + }, + { + "epoch": 4.635248547449968, + "grad_norm": 0.8737294820452377, + "learning_rate": 3.0401877970476e-07, + "loss": 0.697, + "step": 1798 + }, + { + "epoch": 4.637830858618464, + "grad_norm": 0.8664266455857452, + "learning_rate": 2.996189638017233e-07, + "loss": 0.7204, + "step": 1799 + }, + { + "epoch": 4.640413169786959, + "grad_norm": 0.8875751806701481, + "learning_rate": 2.9525073362581924e-07, + "loss": 0.7349, + "step": 1800 + }, + { + "epoch": 4.642995480955455, + "grad_norm": 0.8976875904169599, + "learning_rate": 2.909141034005891e-07, + "loss": 0.7118, + "step": 1801 + }, + { + "epoch": 4.645577792123951, + "grad_norm": 0.875180687352795, + "learning_rate": 2.86609087246672e-07, + "loss": 0.7072, + "step": 1802 + }, + { + "epoch": 4.648160103292446, + "grad_norm": 0.8837922794467638, + "learning_rate": 2.8233569918177384e-07, + "loss": 0.6872, + "step": 1803 + }, + { + "epoch": 4.650742414460942, + "grad_norm": 0.8773613611606011, + "learning_rate": 2.780939531206106e-07, + "loss": 0.6997, + "step": 1804 + }, + { + "epoch": 4.653324725629439, + "grad_norm": 0.9021810787669173, + "learning_rate": 2.73883862874873e-07, + "loss": 0.7059, + "step": 1805 + }, + { + "epoch": 4.6559070367979345, + "grad_norm": 0.8823700361393249, + "learning_rate": 2.6970544215317197e-07, + "loss": 0.7047, + "step": 1806 + }, + { + "epoch": 4.65848934796643, + "grad_norm": 0.8783238825683962, + "learning_rate": 2.655587045609975e-07, + "loss": 0.7249, + "step": 1807 + }, + { + "epoch": 4.661071659134926, + "grad_norm": 0.8827143960172458, + "learning_rate": 2.6144366360067896e-07, + "loss": 0.7159, + "step": 1808 + }, + { + "epoch": 4.663653970303422, + "grad_norm": 0.8851244634156046, + "learning_rate": 2.57360332671337e-07, + "loss": 0.7181, + "step": 1809 + }, + { + "epoch": 4.666236281471917, + "grad_norm": 0.91360975849369, + "learning_rate": 2.5330872506883595e-07, + "loss": 0.7426, + "step": 1810 + }, + { + "epoch": 4.668818592640413, + "grad_norm": 0.9075412171714544, + "learning_rate": 2.492888539857485e-07, + "loss": 0.6909, + "step": 1811 + }, + { + "epoch": 4.671400903808909, + "grad_norm": 0.888744832476072, + "learning_rate": 2.453007325113077e-07, + "loss": 0.7102, + "step": 1812 + }, + { + "epoch": 4.6739832149774045, + "grad_norm": 0.890507374481685, + "learning_rate": 2.41344373631367e-07, + "loss": 0.727, + "step": 1813 + }, + { + "epoch": 4.6765655261459, + "grad_norm": 0.8866830934511432, + "learning_rate": 2.374197902283548e-07, + "loss": 0.7308, + "step": 1814 + }, + { + "epoch": 4.679147837314396, + "grad_norm": 0.8771540551708261, + "learning_rate": 2.3352699508123579e-07, + "loss": 0.6949, + "step": 1815 + }, + { + "epoch": 4.681730148482892, + "grad_norm": 1.0503180466316975, + "learning_rate": 2.296660008654661e-07, + "loss": 0.7213, + "step": 1816 + }, + { + "epoch": 4.684312459651388, + "grad_norm": 0.8959301940497476, + "learning_rate": 2.2583682015295593e-07, + "loss": 0.7101, + "step": 1817 + }, + { + "epoch": 4.686894770819884, + "grad_norm": 0.8581736544054299, + "learning_rate": 2.2203946541202392e-07, + "loss": 0.6836, + "step": 1818 + }, + { + "epoch": 4.68947708198838, + "grad_norm": 0.8933010730764249, + "learning_rate": 2.1827394900736377e-07, + "loss": 0.7032, + "step": 1819 + }, + { + "epoch": 4.6920593931568755, + "grad_norm": 0.8824184636051327, + "learning_rate": 2.145402831999943e-07, + "loss": 0.6866, + "step": 1820 + }, + { + "epoch": 4.694641704325371, + "grad_norm": 0.8962203196556846, + "learning_rate": 2.108384801472263e-07, + "loss": 0.6891, + "step": 1821 + }, + { + "epoch": 4.697224015493867, + "grad_norm": 0.8825956696146154, + "learning_rate": 2.0716855190262118e-07, + "loss": 0.7159, + "step": 1822 + }, + { + "epoch": 4.699806326662363, + "grad_norm": 0.8978568032331434, + "learning_rate": 2.035305104159546e-07, + "loss": 0.6948, + "step": 1823 + }, + { + "epoch": 4.702388637830858, + "grad_norm": 0.9066750181863855, + "learning_rate": 1.9992436753316967e-07, + "loss": 0.7321, + "step": 1824 + }, + { + "epoch": 4.704970948999354, + "grad_norm": 0.8862969137730125, + "learning_rate": 1.963501349963448e-07, + "loss": 0.7224, + "step": 1825 + }, + { + "epoch": 4.707553260167851, + "grad_norm": 0.8963573378284775, + "learning_rate": 1.928078244436582e-07, + "loss": 0.7078, + "step": 1826 + }, + { + "epoch": 4.710135571336346, + "grad_norm": 0.8951212547195972, + "learning_rate": 1.892974474093412e-07, + "loss": 0.7324, + "step": 1827 + }, + { + "epoch": 4.712717882504842, + "grad_norm": 0.8796409018946402, + "learning_rate": 1.8581901532364722e-07, + "loss": 0.6997, + "step": 1828 + }, + { + "epoch": 4.715300193673338, + "grad_norm": 0.9084865841892311, + "learning_rate": 1.8237253951281287e-07, + "loss": 0.7176, + "step": 1829 + }, + { + "epoch": 4.717882504841834, + "grad_norm": 0.8939350444456864, + "learning_rate": 1.789580311990191e-07, + "loss": 0.7273, + "step": 1830 + }, + { + "epoch": 4.720464816010329, + "grad_norm": 0.8897734672415167, + "learning_rate": 1.7557550150035906e-07, + "loss": 0.7311, + "step": 1831 + }, + { + "epoch": 4.723047127178825, + "grad_norm": 0.9004838760057952, + "learning_rate": 1.7222496143079803e-07, + "loss": 0.735, + "step": 1832 + }, + { + "epoch": 4.725629438347321, + "grad_norm": 0.8899859768430314, + "learning_rate": 1.6890642190013906e-07, + "loss": 0.7231, + "step": 1833 + }, + { + "epoch": 4.7282117495158165, + "grad_norm": 0.8945971398194342, + "learning_rate": 1.6561989371398523e-07, + "loss": 0.7242, + "step": 1834 + }, + { + "epoch": 4.730794060684312, + "grad_norm": 0.9022295113729779, + "learning_rate": 1.6236538757370967e-07, + "loss": 0.7124, + "step": 1835 + }, + { + "epoch": 4.733376371852808, + "grad_norm": 0.9072500620579464, + "learning_rate": 1.5914291407641668e-07, + "loss": 0.7252, + "step": 1836 + }, + { + "epoch": 4.735958683021304, + "grad_norm": 0.9078114931306495, + "learning_rate": 1.5595248371490512e-07, + "loss": 0.7252, + "step": 1837 + }, + { + "epoch": 4.7385409941898, + "grad_norm": 0.8961898314266026, + "learning_rate": 1.5279410687764173e-07, + "loss": 0.7436, + "step": 1838 + }, + { + "epoch": 4.741123305358296, + "grad_norm": 0.8983212101762444, + "learning_rate": 1.4966779384871789e-07, + "loss": 0.7123, + "step": 1839 + }, + { + "epoch": 4.743705616526792, + "grad_norm": 0.8867079379295172, + "learning_rate": 1.465735548078262e-07, + "loss": 0.7091, + "step": 1840 + }, + { + "epoch": 4.746287927695287, + "grad_norm": 0.9004558245837394, + "learning_rate": 1.4351139983021623e-07, + "loss": 0.7001, + "step": 1841 + }, + { + "epoch": 4.748870238863783, + "grad_norm": 0.9069705760502946, + "learning_rate": 1.4048133888667436e-07, + "loss": 0.7132, + "step": 1842 + }, + { + "epoch": 4.751452550032279, + "grad_norm": 0.8977843363832826, + "learning_rate": 1.3748338184347842e-07, + "loss": 0.7348, + "step": 1843 + }, + { + "epoch": 4.754034861200775, + "grad_norm": 0.9077365721137113, + "learning_rate": 1.3451753846237314e-07, + "loss": 0.7221, + "step": 1844 + }, + { + "epoch": 4.75661717236927, + "grad_norm": 0.8989075753789744, + "learning_rate": 1.3158381840054025e-07, + "loss": 0.7193, + "step": 1845 + }, + { + "epoch": 4.759199483537766, + "grad_norm": 0.9186459910068699, + "learning_rate": 1.2868223121056178e-07, + "loss": 0.722, + "step": 1846 + }, + { + "epoch": 4.761781794706262, + "grad_norm": 0.9020493854829156, + "learning_rate": 1.2581278634038795e-07, + "loss": 0.7148, + "step": 1847 + }, + { + "epoch": 4.764364105874758, + "grad_norm": 0.8768801704765319, + "learning_rate": 1.229754931333127e-07, + "loss": 0.7035, + "step": 1848 + }, + { + "epoch": 4.766946417043254, + "grad_norm": 0.8844151561998117, + "learning_rate": 1.2017036082793922e-07, + "loss": 0.7184, + "step": 1849 + }, + { + "epoch": 4.76952872821175, + "grad_norm": 0.8929803701346444, + "learning_rate": 1.1739739855815224e-07, + "loss": 0.7302, + "step": 1850 + }, + { + "epoch": 4.7721110393802455, + "grad_norm": 0.883991183642243, + "learning_rate": 1.1465661535308147e-07, + "loss": 0.7293, + "step": 1851 + }, + { + "epoch": 4.774693350548741, + "grad_norm": 0.8949880710889703, + "learning_rate": 1.1194802013708151e-07, + "loss": 0.723, + "step": 1852 + }, + { + "epoch": 4.777275661717237, + "grad_norm": 0.8764502668642703, + "learning_rate": 1.0927162172969852e-07, + "loss": 0.6951, + "step": 1853 + }, + { + "epoch": 4.779857972885733, + "grad_norm": 0.8800003007012933, + "learning_rate": 1.0662742884563926e-07, + "loss": 0.7233, + "step": 1854 + }, + { + "epoch": 4.782440284054228, + "grad_norm": 0.9056650063921504, + "learning_rate": 1.0401545009474768e-07, + "loss": 0.7303, + "step": 1855 + }, + { + "epoch": 4.785022595222724, + "grad_norm": 0.8936292984984834, + "learning_rate": 1.0143569398197384e-07, + "loss": 0.7165, + "step": 1856 + }, + { + "epoch": 4.78760490639122, + "grad_norm": 0.8930843203036322, + "learning_rate": 9.888816890734399e-08, + "loss": 0.6987, + "step": 1857 + }, + { + "epoch": 4.7901872175597155, + "grad_norm": 0.8914663323073332, + "learning_rate": 9.637288316593718e-08, + "loss": 0.7257, + "step": 1858 + }, + { + "epoch": 4.792769528728211, + "grad_norm": 0.8776073044944982, + "learning_rate": 9.388984494785869e-08, + "loss": 0.692, + "step": 1859 + }, + { + "epoch": 4.795351839896708, + "grad_norm": 0.8779266479863871, + "learning_rate": 9.14390623382111e-08, + "loss": 0.7133, + "step": 1860 + }, + { + "epoch": 4.797934151065204, + "grad_norm": 0.9264245878689963, + "learning_rate": 8.902054331706545e-08, + "loss": 0.7283, + "step": 1861 + }, + { + "epoch": 4.800516462233699, + "grad_norm": 0.8900359447271092, + "learning_rate": 8.663429575944126e-08, + "loss": 0.6855, + "step": 1862 + }, + { + "epoch": 4.803098773402195, + "grad_norm": 0.8817750073930983, + "learning_rate": 8.42803274352777e-08, + "loss": 0.6943, + "step": 1863 + }, + { + "epoch": 4.805681084570691, + "grad_norm": 0.8903555513223317, + "learning_rate": 8.195864600940684e-08, + "loss": 0.6981, + "step": 1864 + }, + { + "epoch": 4.8082633957391865, + "grad_norm": 0.9144780971305131, + "learning_rate": 7.966925904153156e-08, + "loss": 0.7352, + "step": 1865 + }, + { + "epoch": 4.810845706907682, + "grad_norm": 0.9009644315118552, + "learning_rate": 7.741217398619993e-08, + "loss": 0.6861, + "step": 1866 + }, + { + "epoch": 4.813428018076178, + "grad_norm": 0.9355145906326285, + "learning_rate": 7.518739819278087e-08, + "loss": 0.7482, + "step": 1867 + }, + { + "epoch": 4.816010329244674, + "grad_norm": 0.9073134543589949, + "learning_rate": 7.29949389054374e-08, + "loss": 0.7514, + "step": 1868 + }, + { + "epoch": 4.81859264041317, + "grad_norm": 0.9117075631720867, + "learning_rate": 7.08348032631101e-08, + "loss": 0.7251, + "step": 1869 + }, + { + "epoch": 4.821174951581666, + "grad_norm": 0.9074195781202343, + "learning_rate": 6.870699829948479e-08, + "loss": 0.7186, + "step": 1870 + }, + { + "epoch": 4.823757262750162, + "grad_norm": 0.8938435858944255, + "learning_rate": 6.661153094297823e-08, + "loss": 0.7074, + "step": 1871 + }, + { + "epoch": 4.826339573918657, + "grad_norm": 0.9003531915044409, + "learning_rate": 6.454840801670803e-08, + "loss": 0.7319, + "step": 1872 + }, + { + "epoch": 4.828921885087153, + "grad_norm": 0.8820415069306509, + "learning_rate": 6.25176362384794e-08, + "loss": 0.712, + "step": 1873 + }, + { + "epoch": 4.831504196255649, + "grad_norm": 0.8995431946768817, + "learning_rate": 6.051922222075179e-08, + "loss": 0.7358, + "step": 1874 + }, + { + "epoch": 4.834086507424145, + "grad_norm": 0.9146524578848008, + "learning_rate": 5.855317247062786e-08, + "loss": 0.708, + "step": 1875 + }, + { + "epoch": 4.83666881859264, + "grad_norm": 0.882452405450629, + "learning_rate": 5.6619493389824534e-08, + "loss": 0.6995, + "step": 1876 + }, + { + "epoch": 4.839251129761136, + "grad_norm": 0.903967291714597, + "learning_rate": 5.4718191274659716e-08, + "loss": 0.7299, + "step": 1877 + }, + { + "epoch": 4.841833440929632, + "grad_norm": 0.8789763679697778, + "learning_rate": 5.284927231602344e-08, + "loss": 0.6955, + "step": 1878 + }, + { + "epoch": 4.8444157520981275, + "grad_norm": 0.8987948069283417, + "learning_rate": 5.101274259936451e-08, + "loss": 0.7204, + "step": 1879 + }, + { + "epoch": 4.846998063266623, + "grad_norm": 0.9053366952624305, + "learning_rate": 4.92086081046661e-08, + "loss": 0.7246, + "step": 1880 + }, + { + "epoch": 4.84958037443512, + "grad_norm": 0.8949590025267126, + "learning_rate": 4.7436874706431324e-08, + "loss": 0.7101, + "step": 1881 + }, + { + "epoch": 4.8521626856036155, + "grad_norm": 0.9214195129359755, + "learning_rate": 4.569754817365657e-08, + "loss": 0.7246, + "step": 1882 + }, + { + "epoch": 4.854744996772111, + "grad_norm": 0.9089598728157908, + "learning_rate": 4.399063416982263e-08, + "loss": 0.7218, + "step": 1883 + }, + { + "epoch": 4.857327307940607, + "grad_norm": 0.901720805129725, + "learning_rate": 4.2316138252866954e-08, + "loss": 0.6872, + "step": 1884 + }, + { + "epoch": 4.859909619109103, + "grad_norm": 0.9104687149335616, + "learning_rate": 4.067406587516809e-08, + "loss": 0.7451, + "step": 1885 + }, + { + "epoch": 4.862491930277598, + "grad_norm": 0.9041958158253693, + "learning_rate": 3.9064422383534587e-08, + "loss": 0.7049, + "step": 1886 + }, + { + "epoch": 4.865074241446094, + "grad_norm": 0.8939398515698603, + "learning_rate": 3.748721301917724e-08, + "loss": 0.7071, + "step": 1887 + }, + { + "epoch": 4.86765655261459, + "grad_norm": 0.9161924402984971, + "learning_rate": 3.5942442917699107e-08, + "loss": 0.7442, + "step": 1888 + }, + { + "epoch": 4.870238863783086, + "grad_norm": 0.8963507291873432, + "learning_rate": 3.443011710907662e-08, + "loss": 0.726, + "step": 1889 + }, + { + "epoch": 4.872821174951581, + "grad_norm": 0.8879121159936676, + "learning_rate": 3.295024051764406e-08, + "loss": 0.6938, + "step": 1890 + }, + { + "epoch": 4.875403486120078, + "grad_norm": 0.875832944883481, + "learning_rate": 3.150281796207466e-08, + "loss": 0.6997, + "step": 1891 + }, + { + "epoch": 4.877985797288574, + "grad_norm": 0.8977974322344513, + "learning_rate": 3.008785415536841e-08, + "loss": 0.7159, + "step": 1892 + }, + { + "epoch": 4.880568108457069, + "grad_norm": 0.8748881414978601, + "learning_rate": 2.8705353704836515e-08, + "loss": 0.7027, + "step": 1893 + }, + { + "epoch": 4.883150419625565, + "grad_norm": 0.88445635221765, + "learning_rate": 2.73553211120825e-08, + "loss": 0.7245, + "step": 1894 + }, + { + "epoch": 4.885732730794061, + "grad_norm": 0.9134713261429174, + "learning_rate": 2.6037760772991138e-08, + "loss": 0.7239, + "step": 1895 + }, + { + "epoch": 4.8883150419625565, + "grad_norm": 0.8985059705700046, + "learning_rate": 2.4752676977713997e-08, + "loss": 0.7245, + "step": 1896 + }, + { + "epoch": 4.890897353131052, + "grad_norm": 0.8885305233658356, + "learning_rate": 2.3500073910655007e-08, + "loss": 0.74, + "step": 1897 + }, + { + "epoch": 4.893479664299548, + "grad_norm": 0.8823177052515677, + "learning_rate": 2.2279955650456043e-08, + "loss": 0.6953, + "step": 1898 + }, + { + "epoch": 4.896061975468044, + "grad_norm": 0.9143103720474103, + "learning_rate": 2.109232616998247e-08, + "loss": 0.7159, + "step": 1899 + }, + { + "epoch": 4.898644286636539, + "grad_norm": 0.9122168970845141, + "learning_rate": 1.993718933631428e-08, + "loss": 0.7356, + "step": 1900 + }, + { + "epoch": 4.901226597805035, + "grad_norm": 0.8855771980386666, + "learning_rate": 1.8814548910730535e-08, + "loss": 0.7129, + "step": 1901 + }, + { + "epoch": 4.903808908973531, + "grad_norm": 0.9057306349994061, + "learning_rate": 1.7724408548697168e-08, + "loss": 0.718, + "step": 1902 + }, + { + "epoch": 4.906391220142027, + "grad_norm": 0.9014437329192154, + "learning_rate": 1.6666771799855875e-08, + "loss": 0.7178, + "step": 1903 + }, + { + "epoch": 4.908973531310523, + "grad_norm": 0.8787325490430078, + "learning_rate": 1.5641642108011888e-08, + "loss": 0.698, + "step": 1904 + }, + { + "epoch": 4.911555842479019, + "grad_norm": 0.9212584947084932, + "learning_rate": 1.4649022811122904e-08, + "loss": 0.7462, + "step": 1905 + }, + { + "epoch": 4.914138153647515, + "grad_norm": 0.8828205242032983, + "learning_rate": 1.368891714129017e-08, + "loss": 0.6942, + "step": 1906 + }, + { + "epoch": 4.91672046481601, + "grad_norm": 0.9012685033665765, + "learning_rate": 1.2761328224744074e-08, + "loss": 0.711, + "step": 1907 + }, + { + "epoch": 4.919302775984506, + "grad_norm": 0.8922139187625531, + "learning_rate": 1.1866259081837473e-08, + "loss": 0.7111, + "step": 1908 + }, + { + "epoch": 4.921885087153002, + "grad_norm": 0.9214550856281982, + "learning_rate": 1.100371262703459e-08, + "loss": 0.7151, + "step": 1909 + }, + { + "epoch": 4.9244673983214975, + "grad_norm": 0.8940406772236208, + "learning_rate": 1.0173691668901031e-08, + "loss": 0.7364, + "step": 1910 + }, + { + "epoch": 4.927049709489993, + "grad_norm": 0.9145036949820405, + "learning_rate": 9.376198910094892e-09, + "loss": 0.7231, + "step": 1911 + }, + { + "epoch": 4.92963202065849, + "grad_norm": 0.8932180687439902, + "learning_rate": 8.611236947357881e-09, + "loss": 0.7347, + "step": 1912 + }, + { + "epoch": 4.9322143318269855, + "grad_norm": 0.9074743226465335, + "learning_rate": 7.878808271507554e-09, + "loss": 0.7205, + "step": 1913 + }, + { + "epoch": 4.934796642995481, + "grad_norm": 0.9340148563069326, + "learning_rate": 7.178915267429531e-09, + "loss": 0.7412, + "step": 1914 + }, + { + "epoch": 4.937378954163977, + "grad_norm": 0.9029720647465063, + "learning_rate": 6.5115602140686244e-09, + "loss": 0.706, + "step": 1915 + }, + { + "epoch": 4.939961265332473, + "grad_norm": 0.9024330652166745, + "learning_rate": 5.876745284421059e-09, + "loss": 0.7179, + "step": 1916 + }, + { + "epoch": 4.942543576500968, + "grad_norm": 0.905536650405652, + "learning_rate": 5.27447254552782e-09, + "loss": 0.7265, + "step": 1917 + }, + { + "epoch": 4.945125887669464, + "grad_norm": 0.9411331572422978, + "learning_rate": 4.704743958467984e-09, + "loss": 0.7411, + "step": 1918 + }, + { + "epoch": 4.94770819883796, + "grad_norm": 0.8844024471969056, + "learning_rate": 4.1675613783565e-09, + "loss": 0.7076, + "step": 1919 + }, + { + "epoch": 4.950290510006456, + "grad_norm": 0.8830101926273661, + "learning_rate": 3.6629265543275393e-09, + "loss": 0.7213, + "step": 1920 + }, + { + "epoch": 4.952872821174951, + "grad_norm": 0.9108539534557583, + "learning_rate": 3.190841129542266e-09, + "loss": 0.7139, + "step": 1921 + }, + { + "epoch": 4.955455132343447, + "grad_norm": 0.9188851667256828, + "learning_rate": 2.7513066411699597e-09, + "loss": 0.7048, + "step": 1922 + }, + { + "epoch": 4.958037443511943, + "grad_norm": 0.9011821980131381, + "learning_rate": 2.344324520396901e-09, + "loss": 0.7166, + "step": 1923 + }, + { + "epoch": 4.960619754680439, + "grad_norm": 0.9260828637057613, + "learning_rate": 1.9698960924074973e-09, + "loss": 0.7124, + "step": 1924 + }, + { + "epoch": 4.963202065848935, + "grad_norm": 0.9176208831919505, + "learning_rate": 1.6280225763931623e-09, + "loss": 0.7192, + "step": 1925 + }, + { + "epoch": 4.965784377017431, + "grad_norm": 0.8999670316918704, + "learning_rate": 1.3187050855367755e-09, + "loss": 0.7011, + "step": 1926 + }, + { + "epoch": 4.9683666881859265, + "grad_norm": 0.9002005545182482, + "learning_rate": 1.0419446270193423e-09, + "loss": 0.7271, + "step": 1927 + }, + { + "epoch": 4.970948999354422, + "grad_norm": 0.8792229281282367, + "learning_rate": 7.977421020088916e-10, + "loss": 0.6954, + "step": 1928 + }, + { + "epoch": 4.973531310522918, + "grad_norm": 0.9030675678611226, + "learning_rate": 5.860983056604763e-10, + "loss": 0.717, + "step": 1929 + }, + { + "epoch": 4.976113621691414, + "grad_norm": 0.8822396526894568, + "learning_rate": 4.0701392711506307e-10, + "loss": 0.6956, + "step": 1930 + }, + { + "epoch": 4.978695932859909, + "grad_norm": 0.8758112231639331, + "learning_rate": 2.60489549495091e-10, + "loss": 0.6989, + "step": 1931 + }, + { + "epoch": 4.981278244028405, + "grad_norm": 0.9008898902977136, + "learning_rate": 1.4652564990336183e-10, + "loss": 0.7116, + "step": 1932 + }, + { + "epoch": 4.983860555196901, + "grad_norm": 0.8627458403348608, + "learning_rate": 6.512259942192955e-11, + "loss": 0.7134, + "step": 1933 + }, + { + "epoch": 4.9864428663653975, + "grad_norm": 0.8843096004745279, + "learning_rate": 1.6280663108769745e-11, + "loss": 0.6674, + "step": 1934 + }, + { + "epoch": 4.989025177533893, + "grad_norm": 0.8707183390390197, + "learning_rate": 0.0, + "loss": 0.6915, + "step": 1935 + }, + { + "epoch": 4.989025177533893, + "step": 1935, + "total_flos": 3.212141825011745e+18, + "train_loss": 0.9760797875796178, + "train_runtime": 14099.2726, + "train_samples_per_second": 17.57, + "train_steps_per_second": 0.137 + } + ], + "logging_steps": 1, + "max_steps": 1935, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.212141825011745e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}