diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,16 +1,16 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.22680412371134, + "epoch": 0.02577319587628866, "eval_steps": 97, - "global_step": 820, + "global_step": 5, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005154639175257732, - "grad_norm": 0.8343322277069092, + "grad_norm": 0.8365415930747986, "learning_rate": 2.0000000000000003e-06, "loss": 0.9672, "step": 1 @@ -18,5807 +18,38 @@ { "epoch": 0.005154639175257732, "eval_loss": 0.986353874206543, - "eval_runtime": 22.8548, - "eval_samples_per_second": 7.176, - "eval_steps_per_second": 1.794, + "eval_runtime": 22.9621, + "eval_samples_per_second": 7.142, + "eval_steps_per_second": 1.786, "step": 1 }, { "epoch": 0.010309278350515464, - "grad_norm": 0.7854290008544922, + "grad_norm": 0.7841001749038696, "learning_rate": 4.000000000000001e-06, "loss": 1.0214, "step": 2 }, { "epoch": 0.015463917525773196, - "grad_norm": 0.7100067734718323, + "grad_norm": 0.6986056566238403, "learning_rate": 6e-06, - "loss": 0.8938, + "loss": 0.8946, "step": 3 }, { "epoch": 0.020618556701030927, - "grad_norm": 0.80195152759552, + "grad_norm": 0.8076366782188416, "learning_rate": 8.000000000000001e-06, - "loss": 0.901, + "loss": 0.904, "step": 4 }, { "epoch": 0.02577319587628866, - "grad_norm": 1.1374644041061401, + "grad_norm": 1.1367769241333008, "learning_rate": 1e-05, - "loss": 1.0694, + "loss": 1.0636, "step": 5 - }, - { - "epoch": 0.030927835051546393, - "grad_norm": 1.244362473487854, - "learning_rate": 1.2e-05, - "loss": 1.1299, - "step": 6 - }, - { - "epoch": 0.03608247422680412, - "grad_norm": 0.7648528218269348, - "learning_rate": 1.4000000000000001e-05, - "loss": 0.9204, - "step": 7 - }, - { - "epoch": 0.041237113402061855, - "grad_norm": 0.6700325608253479, - "learning_rate": 1.6000000000000003e-05, - "loss": 0.6981, - "step": 8 - }, - { - "epoch": 0.04639175257731959, - "grad_norm": 0.9419430494308472, - "learning_rate": 1.8e-05, - "loss": 0.8936, - "step": 9 - }, - { - "epoch": 0.05154639175257732, - "grad_norm": 0.8192828893661499, - "learning_rate": 2e-05, - "loss": 0.9367, - "step": 10 - }, - { - "epoch": 0.05670103092783505, - "grad_norm": 1.0335086584091187, - "learning_rate": 2.2000000000000003e-05, - "loss": 0.9871, - "step": 11 - }, - { - "epoch": 0.061855670103092786, - "grad_norm": 0.7089415788650513, - "learning_rate": 2.4e-05, - "loss": 1.061, - "step": 12 - }, - { - "epoch": 0.06701030927835051, - "grad_norm": 0.6549484133720398, - "learning_rate": 2.6000000000000002e-05, - "loss": 0.8456, - "step": 13 - }, - { - "epoch": 0.07216494845360824, - "grad_norm": 0.7343222498893738, - "learning_rate": 2.8000000000000003e-05, - "loss": 0.8595, - "step": 14 - }, - { - "epoch": 0.07731958762886598, - "grad_norm": 0.7335950136184692, - "learning_rate": 3e-05, - "loss": 0.9044, - "step": 15 - }, - { - "epoch": 0.08247422680412371, - "grad_norm": 0.6129910945892334, - "learning_rate": 3.2000000000000005e-05, - "loss": 0.9111, - "step": 16 - }, - { - "epoch": 0.08762886597938144, - "grad_norm": 0.6522359848022461, - "learning_rate": 3.4000000000000007e-05, - "loss": 0.9435, - "step": 17 - }, - { - "epoch": 0.09278350515463918, - "grad_norm": 0.4705953299999237, - "learning_rate": 3.6e-05, - "loss": 0.8463, - "step": 18 - }, - { - "epoch": 0.0979381443298969, - "grad_norm": 0.42669880390167236, - "learning_rate": 3.8e-05, - "loss": 0.8329, - "step": 19 - }, - { - "epoch": 0.10309278350515463, - "grad_norm": 0.6905720233917236, - "learning_rate": 4e-05, - "loss": 1.1096, - "step": 20 - }, - { - "epoch": 0.10824742268041238, - "grad_norm": 0.3870203495025635, - "learning_rate": 4.2e-05, - "loss": 0.8613, - "step": 21 - }, - { - "epoch": 0.1134020618556701, - "grad_norm": 0.5222039818763733, - "learning_rate": 4.4000000000000006e-05, - "loss": 0.8606, - "step": 22 - }, - { - "epoch": 0.11855670103092783, - "grad_norm": 0.3936106264591217, - "learning_rate": 4.600000000000001e-05, - "loss": 0.9014, - "step": 23 - }, - { - "epoch": 0.12371134020618557, - "grad_norm": 0.3917650878429413, - "learning_rate": 4.8e-05, - "loss": 0.9613, - "step": 24 - }, - { - "epoch": 0.12886597938144329, - "grad_norm": 0.371128112077713, - "learning_rate": 5e-05, - "loss": 0.8298, - "step": 25 - }, - { - "epoch": 0.13402061855670103, - "grad_norm": 0.44909828901290894, - "learning_rate": 5.2000000000000004e-05, - "loss": 0.8887, - "step": 26 - }, - { - "epoch": 0.13917525773195877, - "grad_norm": 0.5001224875450134, - "learning_rate": 5.4000000000000005e-05, - "loss": 0.8943, - "step": 27 - }, - { - "epoch": 0.14432989690721648, - "grad_norm": 0.4336227774620056, - "learning_rate": 5.6000000000000006e-05, - "loss": 0.8709, - "step": 28 - }, - { - "epoch": 0.14948453608247422, - "grad_norm": 0.36210742592811584, - "learning_rate": 5.8e-05, - "loss": 0.7933, - "step": 29 - }, - { - "epoch": 0.15463917525773196, - "grad_norm": 0.560114324092865, - "learning_rate": 6e-05, - "loss": 0.9966, - "step": 30 - }, - { - "epoch": 0.15979381443298968, - "grad_norm": 0.3248458802700043, - "learning_rate": 6.2e-05, - "loss": 0.6494, - "step": 31 - }, - { - "epoch": 0.16494845360824742, - "grad_norm": 0.51533442735672, - "learning_rate": 6.400000000000001e-05, - "loss": 0.9284, - "step": 32 - }, - { - "epoch": 0.17010309278350516, - "grad_norm": 0.4753219187259674, - "learning_rate": 6.6e-05, - "loss": 0.722, - "step": 33 - }, - { - "epoch": 0.17525773195876287, - "grad_norm": 0.42082515358924866, - "learning_rate": 6.800000000000001e-05, - "loss": 0.8662, - "step": 34 - }, - { - "epoch": 0.18041237113402062, - "grad_norm": 0.5010572075843811, - "learning_rate": 7e-05, - "loss": 0.8844, - "step": 35 - }, - { - "epoch": 0.18556701030927836, - "grad_norm": 0.3900461792945862, - "learning_rate": 7.2e-05, - "loss": 0.7872, - "step": 36 - }, - { - "epoch": 0.19072164948453607, - "grad_norm": 0.3964938223361969, - "learning_rate": 7.4e-05, - "loss": 0.7122, - "step": 37 - }, - { - "epoch": 0.1958762886597938, - "grad_norm": 0.44170987606048584, - "learning_rate": 7.6e-05, - "loss": 0.7552, - "step": 38 - }, - { - "epoch": 0.20103092783505155, - "grad_norm": 0.42387211322784424, - "learning_rate": 7.800000000000001e-05, - "loss": 0.9254, - "step": 39 - }, - { - "epoch": 0.20618556701030927, - "grad_norm": 0.4199470281600952, - "learning_rate": 8e-05, - "loss": 0.7172, - "step": 40 - }, - { - "epoch": 0.211340206185567, - "grad_norm": 0.5539634227752686, - "learning_rate": 8.2e-05, - "loss": 0.8196, - "step": 41 - }, - { - "epoch": 0.21649484536082475, - "grad_norm": 0.422987699508667, - "learning_rate": 8.4e-05, - "loss": 0.8814, - "step": 42 - }, - { - "epoch": 0.22164948453608246, - "grad_norm": 0.431035578250885, - "learning_rate": 8.6e-05, - "loss": 0.9498, - "step": 43 - }, - { - "epoch": 0.2268041237113402, - "grad_norm": 0.36294519901275635, - "learning_rate": 8.800000000000001e-05, - "loss": 0.6622, - "step": 44 - }, - { - "epoch": 0.23195876288659795, - "grad_norm": 0.43537455797195435, - "learning_rate": 9e-05, - "loss": 0.7567, - "step": 45 - }, - { - "epoch": 0.23711340206185566, - "grad_norm": 0.513528048992157, - "learning_rate": 9.200000000000001e-05, - "loss": 1.104, - "step": 46 - }, - { - "epoch": 0.2422680412371134, - "grad_norm": 0.5338684916496277, - "learning_rate": 9.4e-05, - "loss": 0.8215, - "step": 47 - }, - { - "epoch": 0.24742268041237114, - "grad_norm": 0.4168243408203125, - "learning_rate": 9.6e-05, - "loss": 0.8727, - "step": 48 - }, - { - "epoch": 0.25257731958762886, - "grad_norm": 0.4411843419075012, - "learning_rate": 9.8e-05, - "loss": 0.9479, - "step": 49 - }, - { - "epoch": 0.25773195876288657, - "grad_norm": 0.43782272934913635, - "learning_rate": 0.0001, - "loss": 0.9748, - "step": 50 - }, - { - "epoch": 0.26288659793814434, - "grad_norm": 0.37000253796577454, - "learning_rate": 9.999970848314005e-05, - "loss": 0.6958, - "step": 51 - }, - { - "epoch": 0.26804123711340205, - "grad_norm": 0.4658275544643402, - "learning_rate": 9.999883393595947e-05, - "loss": 0.8385, - "step": 52 - }, - { - "epoch": 0.27319587628865977, - "grad_norm": 0.37418484687805176, - "learning_rate": 9.999737636865609e-05, - "loss": 0.7309, - "step": 53 - }, - { - "epoch": 0.27835051546391754, - "grad_norm": 0.42213866114616394, - "learning_rate": 9.99953357982261e-05, - "loss": 0.7723, - "step": 54 - }, - { - "epoch": 0.28350515463917525, - "grad_norm": 0.45320671796798706, - "learning_rate": 9.999271224846396e-05, - "loss": 0.8041, - "step": 55 - }, - { - "epoch": 0.28865979381443296, - "grad_norm": 0.34378930926322937, - "learning_rate": 9.998950574996199e-05, - "loss": 0.6543, - "step": 56 - }, - { - "epoch": 0.29381443298969073, - "grad_norm": 0.5220825672149658, - "learning_rate": 9.998571634011015e-05, - "loss": 0.931, - "step": 57 - }, - { - "epoch": 0.29896907216494845, - "grad_norm": 0.3490442931652069, - "learning_rate": 9.998134406309554e-05, - "loss": 0.748, - "step": 58 - }, - { - "epoch": 0.30412371134020616, - "grad_norm": 0.35038453340530396, - "learning_rate": 9.99763889699018e-05, - "loss": 0.6783, - "step": 59 - }, - { - "epoch": 0.30927835051546393, - "grad_norm": 0.4343213140964508, - "learning_rate": 9.99708511183087e-05, - "loss": 0.8773, - "step": 60 - }, - { - "epoch": 0.31443298969072164, - "grad_norm": 0.3930794596672058, - "learning_rate": 9.996473057289132e-05, - "loss": 0.8495, - "step": 61 - }, - { - "epoch": 0.31958762886597936, - "grad_norm": 0.41221436858177185, - "learning_rate": 9.995802740501933e-05, - "loss": 0.8531, - "step": 62 - }, - { - "epoch": 0.3247422680412371, - "grad_norm": 0.4284820854663849, - "learning_rate": 9.99507416928562e-05, - "loss": 0.7711, - "step": 63 - }, - { - "epoch": 0.32989690721649484, - "grad_norm": 0.3693944215774536, - "learning_rate": 9.994287352135825e-05, - "loss": 0.8896, - "step": 64 - }, - { - "epoch": 0.33505154639175255, - "grad_norm": 0.3671146631240845, - "learning_rate": 9.993442298227365e-05, - "loss": 0.8124, - "step": 65 - }, - { - "epoch": 0.3402061855670103, - "grad_norm": 0.4267442524433136, - "learning_rate": 9.99253901741414e-05, - "loss": 0.7468, - "step": 66 - }, - { - "epoch": 0.34536082474226804, - "grad_norm": 0.392914742231369, - "learning_rate": 9.991577520229014e-05, - "loss": 0.9241, - "step": 67 - }, - { - "epoch": 0.35051546391752575, - "grad_norm": 0.3600424528121948, - "learning_rate": 9.99055781788369e-05, - "loss": 0.7838, - "step": 68 - }, - { - "epoch": 0.3556701030927835, - "grad_norm": 0.38290566205978394, - "learning_rate": 9.989479922268588e-05, - "loss": 0.6526, - "step": 69 - }, - { - "epoch": 0.36082474226804123, - "grad_norm": 0.38870134949684143, - "learning_rate": 9.988343845952697e-05, - "loss": 0.8359, - "step": 70 - }, - { - "epoch": 0.36597938144329895, - "grad_norm": 0.4147457480430603, - "learning_rate": 9.98714960218343e-05, - "loss": 0.82, - "step": 71 - }, - { - "epoch": 0.3711340206185567, - "grad_norm": 0.3855450451374054, - "learning_rate": 9.985897204886481e-05, - "loss": 0.797, - "step": 72 - }, - { - "epoch": 0.37628865979381443, - "grad_norm": 0.4507073163986206, - "learning_rate": 9.98458666866564e-05, - "loss": 0.8844, - "step": 73 - }, - { - "epoch": 0.38144329896907214, - "grad_norm": 0.44775328040122986, - "learning_rate": 9.983218008802648e-05, - "loss": 0.8082, - "step": 74 - }, - { - "epoch": 0.3865979381443299, - "grad_norm": 0.41650065779685974, - "learning_rate": 9.981791241257e-05, - "loss": 0.8983, - "step": 75 - }, - { - "epoch": 0.3917525773195876, - "grad_norm": 0.3322182595729828, - "learning_rate": 9.98030638266577e-05, - "loss": 0.6726, - "step": 76 - }, - { - "epoch": 0.39690721649484534, - "grad_norm": 0.4027014374732971, - "learning_rate": 9.978763450343407e-05, - "loss": 0.9871, - "step": 77 - }, - { - "epoch": 0.4020618556701031, - "grad_norm": 0.45693254470825195, - "learning_rate": 9.977162462281544e-05, - "loss": 0.8502, - "step": 78 - }, - { - "epoch": 0.4072164948453608, - "grad_norm": 0.3622555136680603, - "learning_rate": 9.975503437148783e-05, - "loss": 0.7157, - "step": 79 - }, - { - "epoch": 0.41237113402061853, - "grad_norm": 0.4588169455528259, - "learning_rate": 9.973786394290474e-05, - "loss": 0.898, - "step": 80 - }, - { - "epoch": 0.4175257731958763, - "grad_norm": 0.3862941265106201, - "learning_rate": 9.972011353728496e-05, - "loss": 0.6313, - "step": 81 - }, - { - "epoch": 0.422680412371134, - "grad_norm": 0.421906441450119, - "learning_rate": 9.970178336161018e-05, - "loss": 0.8687, - "step": 82 - }, - { - "epoch": 0.42783505154639173, - "grad_norm": 0.43780192732810974, - "learning_rate": 9.968287362962264e-05, - "loss": 0.8284, - "step": 83 - }, - { - "epoch": 0.4329896907216495, - "grad_norm": 0.3814310133457184, - "learning_rate": 9.96633845618225e-05, - "loss": 0.6429, - "step": 84 - }, - { - "epoch": 0.4381443298969072, - "grad_norm": 0.38887372612953186, - "learning_rate": 9.96433163854655e-05, - "loss": 0.8692, - "step": 85 - }, - { - "epoch": 0.44329896907216493, - "grad_norm": 0.42212334275245667, - "learning_rate": 9.962266933456008e-05, - "loss": 0.8881, - "step": 86 - }, - { - "epoch": 0.4484536082474227, - "grad_norm": 0.4047574996948242, - "learning_rate": 9.96014436498648e-05, - "loss": 0.7942, - "step": 87 - }, - { - "epoch": 0.4536082474226804, - "grad_norm": 0.3563002347946167, - "learning_rate": 9.957963957888542e-05, - "loss": 0.702, - "step": 88 - }, - { - "epoch": 0.4587628865979381, - "grad_norm": 0.383470743894577, - "learning_rate": 9.955725737587214e-05, - "loss": 0.7593, - "step": 89 - }, - { - "epoch": 0.4639175257731959, - "grad_norm": 0.4369049668312073, - "learning_rate": 9.953429730181653e-05, - "loss": 0.8449, - "step": 90 - }, - { - "epoch": 0.4690721649484536, - "grad_norm": 0.5086058378219604, - "learning_rate": 9.951075962444856e-05, - "loss": 0.9816, - "step": 91 - }, - { - "epoch": 0.4742268041237113, - "grad_norm": 0.4870317578315735, - "learning_rate": 9.94866446182334e-05, - "loss": 0.9004, - "step": 92 - }, - { - "epoch": 0.4793814432989691, - "grad_norm": 0.4313347339630127, - "learning_rate": 9.94619525643683e-05, - "loss": 0.9499, - "step": 93 - }, - { - "epoch": 0.4845360824742268, - "grad_norm": 0.3745776116847992, - "learning_rate": 9.943668375077925e-05, - "loss": 0.819, - "step": 94 - }, - { - "epoch": 0.4896907216494845, - "grad_norm": 0.3991529643535614, - "learning_rate": 9.941083847211765e-05, - "loss": 0.7799, - "step": 95 - }, - { - "epoch": 0.4948453608247423, - "grad_norm": 0.4022909998893738, - "learning_rate": 9.938441702975689e-05, - "loss": 0.8125, - "step": 96 - }, - { - "epoch": 0.5, - "grad_norm": 0.3985065221786499, - "learning_rate": 9.93574197317888e-05, - "loss": 0.7959, - "step": 97 - }, - { - "epoch": 0.5, - "eval_loss": 0.802886426448822, - "eval_runtime": 23.123, - "eval_samples_per_second": 7.092, - "eval_steps_per_second": 1.773, - "step": 97 - }, - { - "epoch": 0.5051546391752577, - "grad_norm": 0.5668719410896301, - "learning_rate": 9.93298468930201e-05, - "loss": 0.9617, - "step": 98 - }, - { - "epoch": 0.5103092783505154, - "grad_norm": 0.3678329885005951, - "learning_rate": 9.930169883496867e-05, - "loss": 0.7494, - "step": 99 - }, - { - "epoch": 0.5154639175257731, - "grad_norm": 0.4211786985397339, - "learning_rate": 9.927297588585984e-05, - "loss": 0.6733, - "step": 100 - }, - { - "epoch": 0.520618556701031, - "grad_norm": 0.4105909764766693, - "learning_rate": 9.924367838062259e-05, - "loss": 0.6879, - "step": 101 - }, - { - "epoch": 0.5257731958762887, - "grad_norm": 0.4837935268878937, - "learning_rate": 9.921380666088558e-05, - "loss": 0.8099, - "step": 102 - }, - { - "epoch": 0.5309278350515464, - "grad_norm": 0.3713263273239136, - "learning_rate": 9.91833610749732e-05, - "loss": 0.8607, - "step": 103 - }, - { - "epoch": 0.5360824742268041, - "grad_norm": 0.3924923241138458, - "learning_rate": 9.915234197790152e-05, - "loss": 0.7541, - "step": 104 - }, - { - "epoch": 0.5412371134020618, - "grad_norm": 0.4768580198287964, - "learning_rate": 9.912074973137412e-05, - "loss": 0.9051, - "step": 105 - }, - { - "epoch": 0.5463917525773195, - "grad_norm": 0.42937901616096497, - "learning_rate": 9.908858470377793e-05, - "loss": 0.7494, - "step": 106 - }, - { - "epoch": 0.5515463917525774, - "grad_norm": 0.391781747341156, - "learning_rate": 9.905584727017884e-05, - "loss": 0.8718, - "step": 107 - }, - { - "epoch": 0.5567010309278351, - "grad_norm": 0.35565879940986633, - "learning_rate": 9.90225378123174e-05, - "loss": 0.7339, - "step": 108 - }, - { - "epoch": 0.5618556701030928, - "grad_norm": 0.33125853538513184, - "learning_rate": 9.898865671860438e-05, - "loss": 0.7042, - "step": 109 - }, - { - "epoch": 0.5670103092783505, - "grad_norm": 0.511025607585907, - "learning_rate": 9.895420438411616e-05, - "loss": 0.7971, - "step": 110 - }, - { - "epoch": 0.5721649484536082, - "grad_norm": 0.41539090871810913, - "learning_rate": 9.891918121059019e-05, - "loss": 0.8643, - "step": 111 - }, - { - "epoch": 0.5773195876288659, - "grad_norm": 0.371518611907959, - "learning_rate": 9.888358760642029e-05, - "loss": 0.6514, - "step": 112 - }, - { - "epoch": 0.5824742268041238, - "grad_norm": 0.5038712620735168, - "learning_rate": 9.884742398665191e-05, - "loss": 0.8113, - "step": 113 - }, - { - "epoch": 0.5876288659793815, - "grad_norm": 0.3781215250492096, - "learning_rate": 9.881069077297723e-05, - "loss": 0.702, - "step": 114 - }, - { - "epoch": 0.5927835051546392, - "grad_norm": 0.4231729805469513, - "learning_rate": 9.877338839373032e-05, - "loss": 0.7659, - "step": 115 - }, - { - "epoch": 0.5979381443298969, - "grad_norm": 0.4518745541572571, - "learning_rate": 9.873551728388203e-05, - "loss": 0.7877, - "step": 116 - }, - { - "epoch": 0.6030927835051546, - "grad_norm": 0.4570085406303406, - "learning_rate": 9.869707788503508e-05, - "loss": 0.8543, - "step": 117 - }, - { - "epoch": 0.6082474226804123, - "grad_norm": 0.38847261667251587, - "learning_rate": 9.865807064541877e-05, - "loss": 0.787, - "step": 118 - }, - { - "epoch": 0.6134020618556701, - "grad_norm": 0.3872878849506378, - "learning_rate": 9.861849601988383e-05, - "loss": 0.7859, - "step": 119 - }, - { - "epoch": 0.6185567010309279, - "grad_norm": 0.4313697814941406, - "learning_rate": 9.857835446989707e-05, - "loss": 0.7365, - "step": 120 - }, - { - "epoch": 0.6237113402061856, - "grad_norm": 0.4301742911338806, - "learning_rate": 9.853764646353605e-05, - "loss": 0.9206, - "step": 121 - }, - { - "epoch": 0.6288659793814433, - "grad_norm": 0.3453035354614258, - "learning_rate": 9.849637247548356e-05, - "loss": 0.6391, - "step": 122 - }, - { - "epoch": 0.634020618556701, - "grad_norm": 0.49422284960746765, - "learning_rate": 9.845453298702216e-05, - "loss": 0.8156, - "step": 123 - }, - { - "epoch": 0.6391752577319587, - "grad_norm": 0.37356287240982056, - "learning_rate": 9.841212848602846e-05, - "loss": 0.688, - "step": 124 - }, - { - "epoch": 0.6443298969072165, - "grad_norm": 0.47690337896347046, - "learning_rate": 9.836915946696759e-05, - "loss": 0.741, - "step": 125 - }, - { - "epoch": 0.6494845360824743, - "grad_norm": 0.5058567523956299, - "learning_rate": 9.832562643088724e-05, - "loss": 0.8239, - "step": 126 - }, - { - "epoch": 0.654639175257732, - "grad_norm": 0.36732959747314453, - "learning_rate": 9.828152988541201e-05, - "loss": 0.6655, - "step": 127 - }, - { - "epoch": 0.6597938144329897, - "grad_norm": 0.3805343508720398, - "learning_rate": 9.823687034473735e-05, - "loss": 0.7894, - "step": 128 - }, - { - "epoch": 0.6649484536082474, - "grad_norm": 0.36144956946372986, - "learning_rate": 9.81916483296236e-05, - "loss": 0.553, - "step": 129 - }, - { - "epoch": 0.6701030927835051, - "grad_norm": 0.38662460446357727, - "learning_rate": 9.814586436738998e-05, - "loss": 0.8187, - "step": 130 - }, - { - "epoch": 0.6752577319587629, - "grad_norm": 0.4534616470336914, - "learning_rate": 9.809951899190835e-05, - "loss": 0.9291, - "step": 131 - }, - { - "epoch": 0.6804123711340206, - "grad_norm": 0.42224985361099243, - "learning_rate": 9.805261274359705e-05, - "loss": 0.8359, - "step": 132 - }, - { - "epoch": 0.6855670103092784, - "grad_norm": 0.3735229969024658, - "learning_rate": 9.800514616941457e-05, - "loss": 0.8374, - "step": 133 - }, - { - "epoch": 0.6907216494845361, - "grad_norm": 0.3265180289745331, - "learning_rate": 9.795711982285316e-05, - "loss": 0.5942, - "step": 134 - }, - { - "epoch": 0.6958762886597938, - "grad_norm": 0.41515135765075684, - "learning_rate": 9.790853426393245e-05, - "loss": 0.7797, - "step": 135 - }, - { - "epoch": 0.7010309278350515, - "grad_norm": 0.41013866662979126, - "learning_rate": 9.785939005919278e-05, - "loss": 0.7919, - "step": 136 - }, - { - "epoch": 0.7061855670103093, - "grad_norm": 0.3023189902305603, - "learning_rate": 9.780968778168874e-05, - "loss": 0.6237, - "step": 137 - }, - { - "epoch": 0.711340206185567, - "grad_norm": 0.5121697187423706, - "learning_rate": 9.77594280109824e-05, - "loss": 0.834, - "step": 138 - }, - { - "epoch": 0.7164948453608248, - "grad_norm": 0.3422345519065857, - "learning_rate": 9.77086113331366e-05, - "loss": 0.7347, - "step": 139 - }, - { - "epoch": 0.7216494845360825, - "grad_norm": 0.4234163165092468, - "learning_rate": 9.765723834070804e-05, - "loss": 0.6087, - "step": 140 - }, - { - "epoch": 0.7268041237113402, - "grad_norm": 0.4280671775341034, - "learning_rate": 9.760530963274048e-05, - "loss": 0.6604, - "step": 141 - }, - { - "epoch": 0.7319587628865979, - "grad_norm": 0.4124312698841095, - "learning_rate": 9.755282581475769e-05, - "loss": 0.7191, - "step": 142 - }, - { - "epoch": 0.7371134020618557, - "grad_norm": 0.42981845140457153, - "learning_rate": 9.749978749875635e-05, - "loss": 0.7464, - "step": 143 - }, - { - "epoch": 0.7422680412371134, - "grad_norm": 0.44360974431037903, - "learning_rate": 9.744619530319899e-05, - "loss": 0.7628, - "step": 144 - }, - { - "epoch": 0.7474226804123711, - "grad_norm": 0.43546900153160095, - "learning_rate": 9.739204985300679e-05, - "loss": 0.8613, - "step": 145 - }, - { - "epoch": 0.7525773195876289, - "grad_norm": 0.3809047043323517, - "learning_rate": 9.733735177955219e-05, - "loss": 0.7123, - "step": 146 - }, - { - "epoch": 0.7577319587628866, - "grad_norm": 0.48975661396980286, - "learning_rate": 9.728210172065162e-05, - "loss": 0.9697, - "step": 147 - }, - { - "epoch": 0.7628865979381443, - "grad_norm": 0.4534354507923126, - "learning_rate": 9.722630032055803e-05, - "loss": 0.8023, - "step": 148 - }, - { - "epoch": 0.7680412371134021, - "grad_norm": 0.45175039768218994, - "learning_rate": 9.716994822995338e-05, - "loss": 0.9383, - "step": 149 - }, - { - "epoch": 0.7731958762886598, - "grad_norm": 0.35632631182670593, - "learning_rate": 9.711304610594104e-05, - "loss": 0.6622, - "step": 150 - }, - { - "epoch": 0.7783505154639175, - "grad_norm": 0.41617223620414734, - "learning_rate": 9.705559461203815e-05, - "loss": 0.8882, - "step": 151 - }, - { - "epoch": 0.7835051546391752, - "grad_norm": 0.4077165126800537, - "learning_rate": 9.699759441816787e-05, - "loss": 0.7558, - "step": 152 - }, - { - "epoch": 0.788659793814433, - "grad_norm": 0.42368316650390625, - "learning_rate": 9.69390462006516e-05, - "loss": 0.7681, - "step": 153 - }, - { - "epoch": 0.7938144329896907, - "grad_norm": 0.3329184055328369, - "learning_rate": 9.687995064220102e-05, - "loss": 0.7748, - "step": 154 - }, - { - "epoch": 0.7989690721649485, - "grad_norm": 0.45332592725753784, - "learning_rate": 9.682030843191022e-05, - "loss": 0.8737, - "step": 155 - }, - { - "epoch": 0.8041237113402062, - "grad_norm": 0.4276467561721802, - "learning_rate": 9.676012026524755e-05, - "loss": 0.7728, - "step": 156 - }, - { - "epoch": 0.8092783505154639, - "grad_norm": 0.4873369038105011, - "learning_rate": 9.669938684404766e-05, - "loss": 0.965, - "step": 157 - }, - { - "epoch": 0.8144329896907216, - "grad_norm": 0.33252307772636414, - "learning_rate": 9.663810887650318e-05, - "loss": 0.6058, - "step": 158 - }, - { - "epoch": 0.8195876288659794, - "grad_norm": 0.397871196269989, - "learning_rate": 9.657628707715655e-05, - "loss": 0.6472, - "step": 159 - }, - { - "epoch": 0.8247422680412371, - "grad_norm": 0.46122172474861145, - "learning_rate": 9.651392216689165e-05, - "loss": 0.9086, - "step": 160 - }, - { - "epoch": 0.8298969072164949, - "grad_norm": 0.39958837628364563, - "learning_rate": 9.645101487292539e-05, - "loss": 0.7856, - "step": 161 - }, - { - "epoch": 0.8350515463917526, - "grad_norm": 0.4407312273979187, - "learning_rate": 9.638756592879922e-05, - "loss": 0.7794, - "step": 162 - }, - { - "epoch": 0.8402061855670103, - "grad_norm": 0.388991117477417, - "learning_rate": 9.632357607437065e-05, - "loss": 0.7394, - "step": 163 - }, - { - "epoch": 0.845360824742268, - "grad_norm": 0.3550252616405487, - "learning_rate": 9.625904605580452e-05, - "loss": 0.6471, - "step": 164 - }, - { - "epoch": 0.8505154639175257, - "grad_norm": 0.37362176179885864, - "learning_rate": 9.619397662556435e-05, - "loss": 0.7905, - "step": 165 - }, - { - "epoch": 0.8556701030927835, - "grad_norm": 0.4304026961326599, - "learning_rate": 9.612836854240358e-05, - "loss": 0.8113, - "step": 166 - }, - { - "epoch": 0.8608247422680413, - "grad_norm": 0.43434932827949524, - "learning_rate": 9.606222257135675e-05, - "loss": 0.7418, - "step": 167 - }, - { - "epoch": 0.865979381443299, - "grad_norm": 0.4242919683456421, - "learning_rate": 9.599553948373045e-05, - "loss": 0.5827, - "step": 168 - }, - { - "epoch": 0.8711340206185567, - "grad_norm": 0.4583244025707245, - "learning_rate": 9.592832005709448e-05, - "loss": 0.816, - "step": 169 - }, - { - "epoch": 0.8762886597938144, - "grad_norm": 0.42892760038375854, - "learning_rate": 9.586056507527266e-05, - "loss": 0.8334, - "step": 170 - }, - { - "epoch": 0.8814432989690721, - "grad_norm": 0.4265868663787842, - "learning_rate": 9.579227532833377e-05, - "loss": 0.8701, - "step": 171 - }, - { - "epoch": 0.8865979381443299, - "grad_norm": 0.49999022483825684, - "learning_rate": 9.572345161258235e-05, - "loss": 0.8467, - "step": 172 - }, - { - "epoch": 0.8917525773195877, - "grad_norm": 0.3935216963291168, - "learning_rate": 9.565409473054932e-05, - "loss": 0.6476, - "step": 173 - }, - { - "epoch": 0.8969072164948454, - "grad_norm": 0.3507629930973053, - "learning_rate": 9.558420549098268e-05, - "loss": 0.883, - "step": 174 - }, - { - "epoch": 0.9020618556701031, - "grad_norm": 0.453337162733078, - "learning_rate": 9.551378470883812e-05, - "loss": 0.753, - "step": 175 - }, - { - "epoch": 0.9072164948453608, - "grad_norm": 0.438959002494812, - "learning_rate": 9.544283320526943e-05, - "loss": 0.8533, - "step": 176 - }, - { - "epoch": 0.9123711340206185, - "grad_norm": 0.43038681149482727, - "learning_rate": 9.537135180761903e-05, - "loss": 0.8053, - "step": 177 - }, - { - "epoch": 0.9175257731958762, - "grad_norm": 0.5258646011352539, - "learning_rate": 9.52993413494082e-05, - "loss": 0.9231, - "step": 178 - }, - { - "epoch": 0.9226804123711341, - "grad_norm": 0.3199097216129303, - "learning_rate": 9.522680267032742e-05, - "loss": 0.6158, - "step": 179 - }, - { - "epoch": 0.9278350515463918, - "grad_norm": 0.4209578335285187, - "learning_rate": 9.515373661622664e-05, - "loss": 0.8393, - "step": 180 - }, - { - "epoch": 0.9329896907216495, - "grad_norm": 0.47767141461372375, - "learning_rate": 9.508014403910533e-05, - "loss": 0.8761, - "step": 181 - }, - { - "epoch": 0.9381443298969072, - "grad_norm": 0.386308878660202, - "learning_rate": 9.500602579710256e-05, - "loss": 0.8358, - "step": 182 - }, - { - "epoch": 0.9432989690721649, - "grad_norm": 0.41348996758461, - "learning_rate": 9.4931382754487e-05, - "loss": 0.7701, - "step": 183 - }, - { - "epoch": 0.9484536082474226, - "grad_norm": 0.37899458408355713, - "learning_rate": 9.485621578164689e-05, - "loss": 0.6727, - "step": 184 - }, - { - "epoch": 0.9536082474226805, - "grad_norm": 0.45421937108039856, - "learning_rate": 9.478052575507982e-05, - "loss": 0.7223, - "step": 185 - }, - { - "epoch": 0.9587628865979382, - "grad_norm": 0.3540864884853363, - "learning_rate": 9.470431355738257e-05, - "loss": 0.7068, - "step": 186 - }, - { - "epoch": 0.9639175257731959, - "grad_norm": 0.3731318712234497, - "learning_rate": 9.46275800772407e-05, - "loss": 0.8276, - "step": 187 - }, - { - "epoch": 0.9690721649484536, - "grad_norm": 0.3973620533943176, - "learning_rate": 9.45503262094184e-05, - "loss": 0.7225, - "step": 188 - }, - { - "epoch": 0.9742268041237113, - "grad_norm": 0.4032728672027588, - "learning_rate": 9.447255285474783e-05, - "loss": 0.7378, - "step": 189 - }, - { - "epoch": 0.979381443298969, - "grad_norm": 0.4338615834712982, - "learning_rate": 9.439426092011875e-05, - "loss": 0.9921, - "step": 190 - }, - { - "epoch": 0.9845360824742269, - "grad_norm": 0.38775742053985596, - "learning_rate": 9.431545131846797e-05, - "loss": 0.8594, - "step": 191 - }, - { - "epoch": 0.9896907216494846, - "grad_norm": 0.5204732418060303, - "learning_rate": 9.423612496876855e-05, - "loss": 0.7099, - "step": 192 - }, - { - "epoch": 0.9948453608247423, - "grad_norm": 0.3858681917190552, - "learning_rate": 9.415628279601923e-05, - "loss": 0.7499, - "step": 193 - }, - { - "epoch": 1.0, - "grad_norm": 0.5360407829284668, - "learning_rate": 9.407592573123358e-05, - "loss": 0.747, - "step": 194 - }, - { - "epoch": 1.0, - "eval_loss": 0.7822192311286926, - "eval_runtime": 23.1976, - "eval_samples_per_second": 7.07, - "eval_steps_per_second": 1.767, - "step": 194 - }, - { - "epoch": 1.0051546391752577, - "grad_norm": 0.36987513303756714, - "learning_rate": 9.39950547114292e-05, - "loss": 0.8177, - "step": 195 - }, - { - "epoch": 1.0103092783505154, - "grad_norm": 0.38242754340171814, - "learning_rate": 9.39136706796167e-05, - "loss": 0.6535, - "step": 196 - }, - { - "epoch": 1.0154639175257731, - "grad_norm": 0.39804112911224365, - "learning_rate": 9.383177458478878e-05, - "loss": 0.7294, - "step": 197 - }, - { - "epoch": 1.0206185567010309, - "grad_norm": 0.4012157917022705, - "learning_rate": 9.374936738190914e-05, - "loss": 0.7149, - "step": 198 - }, - { - "epoch": 1.0257731958762886, - "grad_norm": 0.45552578568458557, - "learning_rate": 9.366645003190132e-05, - "loss": 0.7603, - "step": 199 - }, - { - "epoch": 1.0309278350515463, - "grad_norm": 0.4522833824157715, - "learning_rate": 9.358302350163757e-05, - "loss": 0.9213, - "step": 200 - }, - { - "epoch": 1.0360824742268042, - "grad_norm": 0.4032951295375824, - "learning_rate": 9.349908876392748e-05, - "loss": 0.6416, - "step": 201 - }, - { - "epoch": 1.041237113402062, - "grad_norm": 0.48034724593162537, - "learning_rate": 9.341464679750669e-05, - "loss": 0.7987, - "step": 202 - }, - { - "epoch": 1.0463917525773196, - "grad_norm": 0.4653926193714142, - "learning_rate": 9.33296985870255e-05, - "loss": 0.7531, - "step": 203 - }, - { - "epoch": 1.0515463917525774, - "grad_norm": 0.3766481280326843, - "learning_rate": 9.32442451230373e-05, - "loss": 0.6282, - "step": 204 - }, - { - "epoch": 1.056701030927835, - "grad_norm": 0.4457147717475891, - "learning_rate": 9.315828740198714e-05, - "loss": 0.8084, - "step": 205 - }, - { - "epoch": 1.0618556701030928, - "grad_norm": 0.3701726496219635, - "learning_rate": 9.30718264262e-05, - "loss": 0.8328, - "step": 206 - }, - { - "epoch": 1.0670103092783505, - "grad_norm": 0.3876953721046448, - "learning_rate": 9.298486320386919e-05, - "loss": 0.6424, - "step": 207 - }, - { - "epoch": 1.0721649484536082, - "grad_norm": 0.4125080704689026, - "learning_rate": 9.289739874904449e-05, - "loss": 0.6097, - "step": 208 - }, - { - "epoch": 1.077319587628866, - "grad_norm": 0.38787776231765747, - "learning_rate": 9.280943408162046e-05, - "loss": 0.6664, - "step": 209 - }, - { - "epoch": 1.0824742268041236, - "grad_norm": 0.44620412588119507, - "learning_rate": 9.272097022732443e-05, - "loss": 0.7523, - "step": 210 - }, - { - "epoch": 1.0876288659793814, - "grad_norm": 0.4375419616699219, - "learning_rate": 9.263200821770461e-05, - "loss": 0.7148, - "step": 211 - }, - { - "epoch": 1.0927835051546393, - "grad_norm": 0.5171458721160889, - "learning_rate": 9.254254909011804e-05, - "loss": 0.8076, - "step": 212 - }, - { - "epoch": 1.097938144329897, - "grad_norm": 0.4271126091480255, - "learning_rate": 9.245259388771845e-05, - "loss": 0.7198, - "step": 213 - }, - { - "epoch": 1.1030927835051547, - "grad_norm": 0.4279671311378479, - "learning_rate": 9.236214365944418e-05, - "loss": 0.7592, - "step": 214 - }, - { - "epoch": 1.1082474226804124, - "grad_norm": 0.3776322901248932, - "learning_rate": 9.22711994600059e-05, - "loss": 0.6371, - "step": 215 - }, - { - "epoch": 1.1134020618556701, - "grad_norm": 0.40393704175949097, - "learning_rate": 9.217976234987428e-05, - "loss": 0.7749, - "step": 216 - }, - { - "epoch": 1.1185567010309279, - "grad_norm": 0.3984552025794983, - "learning_rate": 9.208783339526773e-05, - "loss": 0.5648, - "step": 217 - }, - { - "epoch": 1.1237113402061856, - "grad_norm": 0.37290528416633606, - "learning_rate": 9.199541366813982e-05, - "loss": 0.7259, - "step": 218 - }, - { - "epoch": 1.1288659793814433, - "grad_norm": 0.4922865629196167, - "learning_rate": 9.190250424616693e-05, - "loss": 0.7142, - "step": 219 - }, - { - "epoch": 1.134020618556701, - "grad_norm": 0.45908308029174805, - "learning_rate": 9.180910621273555e-05, - "loss": 0.6767, - "step": 220 - }, - { - "epoch": 1.1391752577319587, - "grad_norm": 0.4036223888397217, - "learning_rate": 9.171522065692975e-05, - "loss": 0.7196, - "step": 221 - }, - { - "epoch": 1.1443298969072164, - "grad_norm": 0.4769727885723114, - "learning_rate": 9.162084867351842e-05, - "loss": 0.6734, - "step": 222 - }, - { - "epoch": 1.1494845360824741, - "grad_norm": 0.35914334654808044, - "learning_rate": 9.152599136294253e-05, - "loss": 0.5459, - "step": 223 - }, - { - "epoch": 1.1546391752577319, - "grad_norm": 0.49013835191726685, - "learning_rate": 9.14306498313023e-05, - "loss": 0.7468, - "step": 224 - }, - { - "epoch": 1.1597938144329896, - "grad_norm": 0.47017601132392883, - "learning_rate": 9.133482519034428e-05, - "loss": 0.6986, - "step": 225 - }, - { - "epoch": 1.1649484536082475, - "grad_norm": 0.43976277112960815, - "learning_rate": 9.123851855744843e-05, - "loss": 0.7512, - "step": 226 - }, - { - "epoch": 1.1701030927835052, - "grad_norm": 0.42843174934387207, - "learning_rate": 9.114173105561501e-05, - "loss": 0.7984, - "step": 227 - }, - { - "epoch": 1.175257731958763, - "grad_norm": 0.3842880129814148, - "learning_rate": 9.104446381345159e-05, - "loss": 0.7076, - "step": 228 - }, - { - "epoch": 1.1804123711340206, - "grad_norm": 0.39397862553596497, - "learning_rate": 9.094671796515978e-05, - "loss": 0.6256, - "step": 229 - }, - { - "epoch": 1.1855670103092784, - "grad_norm": 0.5267696976661682, - "learning_rate": 9.08484946505221e-05, - "loss": 0.7114, - "step": 230 - }, - { - "epoch": 1.190721649484536, - "grad_norm": 0.41206100583076477, - "learning_rate": 9.074979501488867e-05, - "loss": 0.5898, - "step": 231 - }, - { - "epoch": 1.1958762886597938, - "grad_norm": 0.4033416211605072, - "learning_rate": 9.065062020916377e-05, - "loss": 0.604, - "step": 232 - }, - { - "epoch": 1.2010309278350515, - "grad_norm": 0.43909236788749695, - "learning_rate": 9.055097138979252e-05, - "loss": 0.7177, - "step": 233 - }, - { - "epoch": 1.2061855670103092, - "grad_norm": 0.4308321475982666, - "learning_rate": 9.045084971874738e-05, - "loss": 0.6634, - "step": 234 - }, - { - "epoch": 1.211340206185567, - "grad_norm": 0.5138210654258728, - "learning_rate": 9.035025636351452e-05, - "loss": 0.6342, - "step": 235 - }, - { - "epoch": 1.2164948453608249, - "grad_norm": 0.44698837399482727, - "learning_rate": 9.024919249708035e-05, - "loss": 0.792, - "step": 236 - }, - { - "epoch": 1.2216494845360826, - "grad_norm": 0.5599440932273865, - "learning_rate": 9.014765929791768e-05, - "loss": 0.7513, - "step": 237 - }, - { - "epoch": 1.2268041237113403, - "grad_norm": 0.4447702467441559, - "learning_rate": 9.004565794997209e-05, - "loss": 0.6762, - "step": 238 - }, - { - "epoch": 1.231958762886598, - "grad_norm": 0.40888068079948425, - "learning_rate": 8.994318964264809e-05, - "loss": 0.6168, - "step": 239 - }, - { - "epoch": 1.2371134020618557, - "grad_norm": 0.48757925629615784, - "learning_rate": 8.984025557079523e-05, - "loss": 0.7511, - "step": 240 - }, - { - "epoch": 1.2422680412371134, - "grad_norm": 0.522798478603363, - "learning_rate": 8.973685693469423e-05, - "loss": 0.7837, - "step": 241 - }, - { - "epoch": 1.2474226804123711, - "grad_norm": 0.46510541439056396, - "learning_rate": 8.963299494004291e-05, - "loss": 0.7429, - "step": 242 - }, - { - "epoch": 1.2525773195876289, - "grad_norm": 0.3879891037940979, - "learning_rate": 8.952867079794218e-05, - "loss": 0.5545, - "step": 243 - }, - { - "epoch": 1.2577319587628866, - "grad_norm": 0.4209011197090149, - "learning_rate": 8.942388572488187e-05, - "loss": 0.9004, - "step": 244 - }, - { - "epoch": 1.2628865979381443, - "grad_norm": 0.462494820356369, - "learning_rate": 8.931864094272663e-05, - "loss": 0.5892, - "step": 245 - }, - { - "epoch": 1.268041237113402, - "grad_norm": 0.5652675032615662, - "learning_rate": 8.921293767870157e-05, - "loss": 0.7347, - "step": 246 - }, - { - "epoch": 1.2731958762886597, - "grad_norm": 0.45603305101394653, - "learning_rate": 8.910677716537806e-05, - "loss": 0.8051, - "step": 247 - }, - { - "epoch": 1.2783505154639174, - "grad_norm": 0.4945577085018158, - "learning_rate": 8.900016064065923e-05, - "loss": 0.7267, - "step": 248 - }, - { - "epoch": 1.2835051546391751, - "grad_norm": 0.5541755557060242, - "learning_rate": 8.889308934776572e-05, - "loss": 0.8397, - "step": 249 - }, - { - "epoch": 1.2886597938144329, - "grad_norm": 0.5028125643730164, - "learning_rate": 8.8785564535221e-05, - "loss": 0.8193, - "step": 250 - }, - { - "epoch": 1.2938144329896908, - "grad_norm": 0.48722466826438904, - "learning_rate": 8.867758745683687e-05, - "loss": 0.8063, - "step": 251 - }, - { - "epoch": 1.2989690721649485, - "grad_norm": 0.5241202116012573, - "learning_rate": 8.85691593716989e-05, - "loss": 0.8016, - "step": 252 - }, - { - "epoch": 1.3041237113402062, - "grad_norm": 0.5034804344177246, - "learning_rate": 8.84602815441517e-05, - "loss": 0.7144, - "step": 253 - }, - { - "epoch": 1.309278350515464, - "grad_norm": 0.4778522849082947, - "learning_rate": 8.835095524378414e-05, - "loss": 0.8033, - "step": 254 - }, - { - "epoch": 1.3144329896907216, - "grad_norm": 0.4429955780506134, - "learning_rate": 8.824118174541464e-05, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.3195876288659794, - "grad_norm": 0.5443170666694641, - "learning_rate": 8.81309623290762e-05, - "loss": 0.8507, - "step": 256 - }, - { - "epoch": 1.324742268041237, - "grad_norm": 0.47870996594429016, - "learning_rate": 8.802029828000156e-05, - "loss": 0.8128, - "step": 257 - }, - { - "epoch": 1.3298969072164948, - "grad_norm": 0.48450103402137756, - "learning_rate": 8.790919088860814e-05, - "loss": 0.7608, - "step": 258 - }, - { - "epoch": 1.3350515463917525, - "grad_norm": 0.5307578444480896, - "learning_rate": 8.779764145048308e-05, - "loss": 0.7209, - "step": 259 - }, - { - "epoch": 1.3402061855670104, - "grad_norm": 0.530427098274231, - "learning_rate": 8.768565126636806e-05, - "loss": 0.712, - "step": 260 - }, - { - "epoch": 1.3453608247422681, - "grad_norm": 0.48835986852645874, - "learning_rate": 8.757322164214413e-05, - "loss": 0.6929, - "step": 261 - }, - { - "epoch": 1.3505154639175259, - "grad_norm": 0.52711021900177, - "learning_rate": 8.746035388881655e-05, - "loss": 0.8256, - "step": 262 - }, - { - "epoch": 1.3556701030927836, - "grad_norm": 0.48932960629463196, - "learning_rate": 8.734704932249944e-05, - "loss": 0.9334, - "step": 263 - }, - { - "epoch": 1.3608247422680413, - "grad_norm": 0.5296959280967712, - "learning_rate": 8.723330926440045e-05, - "loss": 0.8442, - "step": 264 - }, - { - "epoch": 1.365979381443299, - "grad_norm": 0.4768982529640198, - "learning_rate": 8.711913504080534e-05, - "loss": 0.6466, - "step": 265 - }, - { - "epoch": 1.3711340206185567, - "grad_norm": 0.4151005744934082, - "learning_rate": 8.70045279830626e-05, - "loss": 0.6588, - "step": 266 - }, - { - "epoch": 1.3762886597938144, - "grad_norm": 0.4385485053062439, - "learning_rate": 8.688948942756778e-05, - "loss": 0.6547, - "step": 267 - }, - { - "epoch": 1.3814432989690721, - "grad_norm": 0.5252705812454224, - "learning_rate": 8.677402071574805e-05, - "loss": 0.6968, - "step": 268 - }, - { - "epoch": 1.3865979381443299, - "grad_norm": 0.5336004495620728, - "learning_rate": 8.665812319404643e-05, - "loss": 0.6702, - "step": 269 - }, - { - "epoch": 1.3917525773195876, - "grad_norm": 0.5621787905693054, - "learning_rate": 8.654179821390621e-05, - "loss": 0.8236, - "step": 270 - }, - { - "epoch": 1.3969072164948453, - "grad_norm": 0.3868958353996277, - "learning_rate": 8.642504713175508e-05, - "loss": 0.7168, - "step": 271 - }, - { - "epoch": 1.402061855670103, - "grad_norm": 0.5103661417961121, - "learning_rate": 8.630787130898943e-05, - "loss": 0.8211, - "step": 272 - }, - { - "epoch": 1.4072164948453607, - "grad_norm": 0.46545475721359253, - "learning_rate": 8.619027211195836e-05, - "loss": 0.7592, - "step": 273 - }, - { - "epoch": 1.4123711340206184, - "grad_norm": 0.4790486991405487, - "learning_rate": 8.607225091194779e-05, - "loss": 0.6835, - "step": 274 - }, - { - "epoch": 1.4175257731958764, - "grad_norm": 0.4155111610889435, - "learning_rate": 8.595380908516454e-05, - "loss": 0.5178, - "step": 275 - }, - { - "epoch": 1.422680412371134, - "grad_norm": 0.49269962310791016, - "learning_rate": 8.583494801272018e-05, - "loss": 0.7367, - "step": 276 - }, - { - "epoch": 1.4278350515463918, - "grad_norm": 0.4321769177913666, - "learning_rate": 8.571566908061497e-05, - "loss": 0.6714, - "step": 277 - }, - { - "epoch": 1.4329896907216495, - "grad_norm": 0.5094105005264282, - "learning_rate": 8.559597367972168e-05, - "loss": 0.7646, - "step": 278 - }, - { - "epoch": 1.4381443298969072, - "grad_norm": 0.5336481928825378, - "learning_rate": 8.547586320576945e-05, - "loss": 0.7603, - "step": 279 - }, - { - "epoch": 1.443298969072165, - "grad_norm": 0.46947407722473145, - "learning_rate": 8.535533905932738e-05, - "loss": 0.6531, - "step": 280 - }, - { - "epoch": 1.4484536082474226, - "grad_norm": 0.5053247213363647, - "learning_rate": 8.52344026457883e-05, - "loss": 0.753, - "step": 281 - }, - { - "epoch": 1.4536082474226804, - "grad_norm": 0.5084853172302246, - "learning_rate": 8.511305537535237e-05, - "loss": 0.6503, - "step": 282 - }, - { - "epoch": 1.458762886597938, - "grad_norm": 0.49093201756477356, - "learning_rate": 8.499129866301057e-05, - "loss": 0.689, - "step": 283 - }, - { - "epoch": 1.463917525773196, - "grad_norm": 0.4668569564819336, - "learning_rate": 8.48691339285283e-05, - "loss": 0.726, - "step": 284 - }, - { - "epoch": 1.4690721649484537, - "grad_norm": 0.5379082560539246, - "learning_rate": 8.474656259642873e-05, - "loss": 0.7205, - "step": 285 - }, - { - "epoch": 1.4742268041237114, - "grad_norm": 0.6103803515434265, - "learning_rate": 8.46235860959763e-05, - "loss": 0.8544, - "step": 286 - }, - { - "epoch": 1.4793814432989691, - "grad_norm": 0.5668134093284607, - "learning_rate": 8.450020586115987e-05, - "loss": 0.7428, - "step": 287 - }, - { - "epoch": 1.4845360824742269, - "grad_norm": 0.4947795271873474, - "learning_rate": 8.437642333067625e-05, - "loss": 0.7288, - "step": 288 - }, - { - "epoch": 1.4896907216494846, - "grad_norm": 0.683354377746582, - "learning_rate": 8.42522399479132e-05, - "loss": 0.7857, - "step": 289 - }, - { - "epoch": 1.4948453608247423, - "grad_norm": 0.42545896768569946, - "learning_rate": 8.412765716093272e-05, - "loss": 0.6486, - "step": 290 - }, - { - "epoch": 1.5, - "grad_norm": 0.5766370296478271, - "learning_rate": 8.40026764224541e-05, - "loss": 0.8488, - "step": 291 - }, - { - "epoch": 1.5, - "eval_loss": 0.7756565809249878, - "eval_runtime": 23.1361, - "eval_samples_per_second": 7.088, - "eval_steps_per_second": 1.772, - "step": 291 - }, - { - "epoch": 1.5051546391752577, - "grad_norm": 0.47217854857444763, - "learning_rate": 8.387729918983706e-05, - "loss": 0.5576, - "step": 292 - }, - { - "epoch": 1.5103092783505154, - "grad_norm": 0.5849595665931702, - "learning_rate": 8.375152692506468e-05, - "loss": 0.9562, - "step": 293 - }, - { - "epoch": 1.5154639175257731, - "grad_norm": 0.5576279163360596, - "learning_rate": 8.362536109472636e-05, - "loss": 0.8353, - "step": 294 - }, - { - "epoch": 1.5206185567010309, - "grad_norm": 0.5278394222259521, - "learning_rate": 8.349880317000082e-05, - "loss": 0.6668, - "step": 295 - }, - { - "epoch": 1.5257731958762886, - "grad_norm": 0.47538498044013977, - "learning_rate": 8.337185462663878e-05, - "loss": 0.6829, - "step": 296 - }, - { - "epoch": 1.5309278350515463, - "grad_norm": 0.5417651534080505, - "learning_rate": 8.32445169449459e-05, - "loss": 0.9411, - "step": 297 - }, - { - "epoch": 1.536082474226804, - "grad_norm": 0.516247034072876, - "learning_rate": 8.311679160976539e-05, - "loss": 0.8907, - "step": 298 - }, - { - "epoch": 1.5412371134020617, - "grad_norm": 0.5147285461425781, - "learning_rate": 8.29886801104608e-05, - "loss": 0.7747, - "step": 299 - }, - { - "epoch": 1.5463917525773194, - "grad_norm": 0.4982938766479492, - "learning_rate": 8.286018394089863e-05, - "loss": 0.7649, - "step": 300 - }, - { - "epoch": 1.5515463917525774, - "grad_norm": 0.5095208883285522, - "learning_rate": 8.273130459943086e-05, - "loss": 0.7414, - "step": 301 - }, - { - "epoch": 1.556701030927835, - "grad_norm": 0.6313266754150391, - "learning_rate": 8.260204358887754e-05, - "loss": 0.9327, - "step": 302 - }, - { - "epoch": 1.5618556701030928, - "grad_norm": 0.46112972497940063, - "learning_rate": 8.247240241650918e-05, - "loss": 0.6451, - "step": 303 - }, - { - "epoch": 1.5670103092783505, - "grad_norm": 0.5244841575622559, - "learning_rate": 8.234238259402935e-05, - "loss": 0.7223, - "step": 304 - }, - { - "epoch": 1.5721649484536082, - "grad_norm": 0.4885084927082062, - "learning_rate": 8.221198563755682e-05, - "loss": 0.6624, - "step": 305 - }, - { - "epoch": 1.577319587628866, - "grad_norm": 0.47980961203575134, - "learning_rate": 8.208121306760805e-05, - "loss": 0.7423, - "step": 306 - }, - { - "epoch": 1.5824742268041239, - "grad_norm": 0.4056549072265625, - "learning_rate": 8.195006640907942e-05, - "loss": 0.588, - "step": 307 - }, - { - "epoch": 1.5876288659793816, - "grad_norm": 0.5350937843322754, - "learning_rate": 8.181854719122939e-05, - "loss": 0.742, - "step": 308 - }, - { - "epoch": 1.5927835051546393, - "grad_norm": 0.555401086807251, - "learning_rate": 8.168665694766073e-05, - "loss": 0.8262, - "step": 309 - }, - { - "epoch": 1.597938144329897, - "grad_norm": 0.4927338659763336, - "learning_rate": 8.155439721630264e-05, - "loss": 0.7753, - "step": 310 - }, - { - "epoch": 1.6030927835051547, - "grad_norm": 0.4449557363986969, - "learning_rate": 8.142176953939279e-05, - "loss": 0.6069, - "step": 311 - }, - { - "epoch": 1.6082474226804124, - "grad_norm": 0.4355365037918091, - "learning_rate": 8.128877546345933e-05, - "loss": 0.5786, - "step": 312 - }, - { - "epoch": 1.6134020618556701, - "grad_norm": 0.5652406215667725, - "learning_rate": 8.115541653930286e-05, - "loss": 0.648, - "step": 313 - }, - { - "epoch": 1.6185567010309279, - "grad_norm": 0.4682961702346802, - "learning_rate": 8.102169432197842e-05, - "loss": 0.6993, - "step": 314 - }, - { - "epoch": 1.6237113402061856, - "grad_norm": 0.4309576451778412, - "learning_rate": 8.088761037077718e-05, - "loss": 0.843, - "step": 315 - }, - { - "epoch": 1.6288659793814433, - "grad_norm": 0.4121072292327881, - "learning_rate": 8.075316624920848e-05, - "loss": 0.569, - "step": 316 - }, - { - "epoch": 1.634020618556701, - "grad_norm": 0.5206684470176697, - "learning_rate": 8.061836352498145e-05, - "loss": 0.7025, - "step": 317 - }, - { - "epoch": 1.6391752577319587, - "grad_norm": 0.5165379643440247, - "learning_rate": 8.048320376998673e-05, - "loss": 0.6356, - "step": 318 - }, - { - "epoch": 1.6443298969072164, - "grad_norm": 0.5222402811050415, - "learning_rate": 8.034768856027826e-05, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.6494845360824741, - "grad_norm": 0.6080288887023926, - "learning_rate": 8.021181947605473e-05, - "loss": 0.7947, - "step": 320 - }, - { - "epoch": 1.6546391752577319, - "grad_norm": 0.47382211685180664, - "learning_rate": 8.007559810164133e-05, - "loss": 0.7139, - "step": 321 - }, - { - "epoch": 1.6597938144329896, - "grad_norm": 0.568452775478363, - "learning_rate": 7.993902602547113e-05, - "loss": 0.6434, - "step": 322 - }, - { - "epoch": 1.6649484536082473, - "grad_norm": 0.5808526277542114, - "learning_rate": 7.980210484006666e-05, - "loss": 0.7782, - "step": 323 - }, - { - "epoch": 1.670103092783505, - "grad_norm": 0.4880453050136566, - "learning_rate": 7.966483614202128e-05, - "loss": 0.7838, - "step": 324 - }, - { - "epoch": 1.675257731958763, - "grad_norm": 0.4492298662662506, - "learning_rate": 7.952722153198054e-05, - "loss": 0.5719, - "step": 325 - }, - { - "epoch": 1.6804123711340206, - "grad_norm": 0.5690252184867859, - "learning_rate": 7.938926261462366e-05, - "loss": 0.7023, - "step": 326 - }, - { - "epoch": 1.6855670103092784, - "grad_norm": 0.4502432644367218, - "learning_rate": 7.925096099864464e-05, - "loss": 0.5499, - "step": 327 - }, - { - "epoch": 1.690721649484536, - "grad_norm": 0.5307354927062988, - "learning_rate": 7.911231829673356e-05, - "loss": 0.7422, - "step": 328 - }, - { - "epoch": 1.6958762886597938, - "grad_norm": 0.45356300473213196, - "learning_rate": 7.897333612555785e-05, - "loss": 0.5429, - "step": 329 - }, - { - "epoch": 1.7010309278350515, - "grad_norm": 0.5033542513847351, - "learning_rate": 7.883401610574336e-05, - "loss": 0.7098, - "step": 330 - }, - { - "epoch": 1.7061855670103094, - "grad_norm": 0.5916339159011841, - "learning_rate": 7.869435986185547e-05, - "loss": 0.8832, - "step": 331 - }, - { - "epoch": 1.7113402061855671, - "grad_norm": 0.49946147203445435, - "learning_rate": 7.855436902238017e-05, - "loss": 0.8333, - "step": 332 - }, - { - "epoch": 1.7164948453608249, - "grad_norm": 0.4719221293926239, - "learning_rate": 7.841404521970505e-05, - "loss": 0.56, - "step": 333 - }, - { - "epoch": 1.7216494845360826, - "grad_norm": 0.4418396055698395, - "learning_rate": 7.82733900901003e-05, - "loss": 0.4716, - "step": 334 - }, - { - "epoch": 1.7268041237113403, - "grad_norm": 0.45411449670791626, - "learning_rate": 7.813240527369959e-05, - "loss": 0.6581, - "step": 335 - }, - { - "epoch": 1.731958762886598, - "grad_norm": 0.48833197355270386, - "learning_rate": 7.799109241448091e-05, - "loss": 0.652, - "step": 336 - }, - { - "epoch": 1.7371134020618557, - "grad_norm": 0.5444779396057129, - "learning_rate": 7.784945316024756e-05, - "loss": 0.7405, - "step": 337 - }, - { - "epoch": 1.7422680412371134, - "grad_norm": 0.4810863733291626, - "learning_rate": 7.770748916260875e-05, - "loss": 0.7171, - "step": 338 - }, - { - "epoch": 1.7474226804123711, - "grad_norm": 0.49916332960128784, - "learning_rate": 7.756520207696041e-05, - "loss": 0.7158, - "step": 339 - }, - { - "epoch": 1.7525773195876289, - "grad_norm": 0.46805036067962646, - "learning_rate": 7.742259356246593e-05, - "loss": 0.6331, - "step": 340 - }, - { - "epoch": 1.7577319587628866, - "grad_norm": 0.48011356592178345, - "learning_rate": 7.727966528203678e-05, - "loss": 0.591, - "step": 341 - }, - { - "epoch": 1.7628865979381443, - "grad_norm": 0.4629339277744293, - "learning_rate": 7.71364189023131e-05, - "loss": 0.7329, - "step": 342 - }, - { - "epoch": 1.768041237113402, - "grad_norm": 0.5280289649963379, - "learning_rate": 7.699285609364424e-05, - "loss": 0.711, - "step": 343 - }, - { - "epoch": 1.7731958762886597, - "grad_norm": 0.6236256957054138, - "learning_rate": 7.68489785300694e-05, - "loss": 0.826, - "step": 344 - }, - { - "epoch": 1.7783505154639174, - "grad_norm": 0.49681684374809265, - "learning_rate": 7.670478788929802e-05, - "loss": 0.8561, - "step": 345 - }, - { - "epoch": 1.7835051546391751, - "grad_norm": 0.4826050400733948, - "learning_rate": 7.656028585269018e-05, - "loss": 0.6096, - "step": 346 - }, - { - "epoch": 1.7886597938144329, - "grad_norm": 0.513830840587616, - "learning_rate": 7.641547410523709e-05, - "loss": 0.6318, - "step": 347 - }, - { - "epoch": 1.7938144329896906, - "grad_norm": 0.4502808153629303, - "learning_rate": 7.627035433554138e-05, - "loss": 0.696, - "step": 348 - }, - { - "epoch": 1.7989690721649485, - "grad_norm": 0.5793465375900269, - "learning_rate": 7.612492823579745e-05, - "loss": 0.6974, - "step": 349 - }, - { - "epoch": 1.8041237113402062, - "grad_norm": 0.4899560809135437, - "learning_rate": 7.597919750177168e-05, - "loss": 0.6863, - "step": 350 - }, - { - "epoch": 1.809278350515464, - "grad_norm": 0.527481198310852, - "learning_rate": 7.583316383278273e-05, - "loss": 0.6467, - "step": 351 - }, - { - "epoch": 1.8144329896907216, - "grad_norm": 0.5642676949501038, - "learning_rate": 7.568682893168164e-05, - "loss": 0.71, - "step": 352 - }, - { - "epoch": 1.8195876288659794, - "grad_norm": 0.46447527408599854, - "learning_rate": 7.554019450483208e-05, - "loss": 0.791, - "step": 353 - }, - { - "epoch": 1.824742268041237, - "grad_norm": 0.5443849563598633, - "learning_rate": 7.539326226209031e-05, - "loss": 0.6473, - "step": 354 - }, - { - "epoch": 1.829896907216495, - "grad_norm": 0.4533563554286957, - "learning_rate": 7.524603391678541e-05, - "loss": 0.8136, - "step": 355 - }, - { - "epoch": 1.8350515463917527, - "grad_norm": 0.5911179780960083, - "learning_rate": 7.509851118569915e-05, - "loss": 0.6774, - "step": 356 - }, - { - "epoch": 1.8402061855670104, - "grad_norm": 0.38583889603614807, - "learning_rate": 7.495069578904608e-05, - "loss": 0.5882, - "step": 357 - }, - { - "epoch": 1.8453608247422681, - "grad_norm": 0.492844820022583, - "learning_rate": 7.48025894504534e-05, - "loss": 0.6018, - "step": 358 - }, - { - "epoch": 1.8505154639175259, - "grad_norm": 0.5465859770774841, - "learning_rate": 7.465419389694092e-05, - "loss": 0.7754, - "step": 359 - }, - { - "epoch": 1.8556701030927836, - "grad_norm": 0.5601467490196228, - "learning_rate": 7.450551085890087e-05, - "loss": 0.8233, - "step": 360 - }, - { - "epoch": 1.8608247422680413, - "grad_norm": 0.4624144434928894, - "learning_rate": 7.435654207007773e-05, - "loss": 0.6372, - "step": 361 - }, - { - "epoch": 1.865979381443299, - "grad_norm": 0.5944560766220093, - "learning_rate": 7.420728926754803e-05, - "loss": 0.8245, - "step": 362 - }, - { - "epoch": 1.8711340206185567, - "grad_norm": 0.44903764128685, - "learning_rate": 7.405775419170014e-05, - "loss": 0.5653, - "step": 363 - }, - { - "epoch": 1.8762886597938144, - "grad_norm": 0.401335746049881, - "learning_rate": 7.390793858621386e-05, - "loss": 0.658, - "step": 364 - }, - { - "epoch": 1.8814432989690721, - "grad_norm": 0.5450597405433655, - "learning_rate": 7.375784419804019e-05, - "loss": 0.6493, - "step": 365 - }, - { - "epoch": 1.8865979381443299, - "grad_norm": 0.4412764608860016, - "learning_rate": 7.360747277738094e-05, - "loss": 0.6466, - "step": 366 - }, - { - "epoch": 1.8917525773195876, - "grad_norm": 0.503523588180542, - "learning_rate": 7.345682607766826e-05, - "loss": 0.8579, - "step": 367 - }, - { - "epoch": 1.8969072164948453, - "grad_norm": 0.48517486453056335, - "learning_rate": 7.330590585554428e-05, - "loss": 0.7231, - "step": 368 - }, - { - "epoch": 1.902061855670103, - "grad_norm": 0.39123615622520447, - "learning_rate": 7.315471387084056e-05, - "loss": 0.5414, - "step": 369 - }, - { - "epoch": 1.9072164948453607, - "grad_norm": 0.5755649209022522, - "learning_rate": 7.300325188655761e-05, - "loss": 0.8224, - "step": 370 - }, - { - "epoch": 1.9123711340206184, - "grad_norm": 0.5162664651870728, - "learning_rate": 7.285152166884432e-05, - "loss": 0.7323, - "step": 371 - }, - { - "epoch": 1.9175257731958761, - "grad_norm": 0.47386038303375244, - "learning_rate": 7.269952498697734e-05, - "loss": 0.7753, - "step": 372 - }, - { - "epoch": 1.922680412371134, - "grad_norm": 0.5395263433456421, - "learning_rate": 7.25472636133405e-05, - "loss": 0.623, - "step": 373 - }, - { - "epoch": 1.9278350515463918, - "grad_norm": 0.5538620352745056, - "learning_rate": 7.23947393234041e-05, - "loss": 0.8091, - "step": 374 - }, - { - "epoch": 1.9329896907216495, - "grad_norm": 0.615933358669281, - "learning_rate": 7.224195389570422e-05, - "loss": 0.7262, - "step": 375 - }, - { - "epoch": 1.9381443298969072, - "grad_norm": 0.6319516897201538, - "learning_rate": 7.208890911182197e-05, - "loss": 0.9344, - "step": 376 - }, - { - "epoch": 1.943298969072165, - "grad_norm": 0.5680528879165649, - "learning_rate": 7.193560675636277e-05, - "loss": 0.8644, - "step": 377 - }, - { - "epoch": 1.9484536082474226, - "grad_norm": 0.5286463499069214, - "learning_rate": 7.178204861693545e-05, - "loss": 0.7695, - "step": 378 - }, - { - "epoch": 1.9536082474226806, - "grad_norm": 0.43873798847198486, - "learning_rate": 7.162823648413151e-05, - "loss": 0.6283, - "step": 379 - }, - { - "epoch": 1.9587628865979383, - "grad_norm": 0.4513707458972931, - "learning_rate": 7.14741721515041e-05, - "loss": 0.6292, - "step": 380 - }, - { - "epoch": 1.963917525773196, - "grad_norm": 0.4606168568134308, - "learning_rate": 7.131985741554728e-05, - "loss": 0.6355, - "step": 381 - }, - { - "epoch": 1.9690721649484537, - "grad_norm": 0.5343955159187317, - "learning_rate": 7.116529407567489e-05, - "loss": 0.7579, - "step": 382 - }, - { - "epoch": 1.9742268041237114, - "grad_norm": 0.5064263343811035, - "learning_rate": 7.101048393419977e-05, - "loss": 0.6727, - "step": 383 - }, - { - "epoch": 1.9793814432989691, - "grad_norm": 0.4762507677078247, - "learning_rate": 7.085542879631253e-05, - "loss": 0.7425, - "step": 384 - }, - { - "epoch": 1.9845360824742269, - "grad_norm": 0.44575661420822144, - "learning_rate": 7.070013047006068e-05, - "loss": 0.6514, - "step": 385 - }, - { - "epoch": 1.9896907216494846, - "grad_norm": 0.44127196073532104, - "learning_rate": 7.054459076632743e-05, - "loss": 0.7233, - "step": 386 - }, - { - "epoch": 1.9948453608247423, - "grad_norm": 0.44311100244522095, - "learning_rate": 7.038881149881058e-05, - "loss": 0.672, - "step": 387 - }, - { - "epoch": 2.0, - "grad_norm": 0.6681380867958069, - "learning_rate": 7.02327944840015e-05, - "loss": 0.7942, - "step": 388 - }, - { - "epoch": 2.0, - "eval_loss": 0.7649276256561279, - "eval_runtime": 23.0997, - "eval_samples_per_second": 7.1, - "eval_steps_per_second": 1.775, - "step": 388 - }, - { - "epoch": 2.0051546391752577, - "grad_norm": 0.4572388827800751, - "learning_rate": 7.007654154116377e-05, - "loss": 0.6377, - "step": 389 - }, - { - "epoch": 2.0103092783505154, - "grad_norm": 0.42942386865615845, - "learning_rate": 6.992005449231208e-05, - "loss": 0.6381, - "step": 390 - }, - { - "epoch": 2.015463917525773, - "grad_norm": 0.5012478828430176, - "learning_rate": 6.976333516219096e-05, - "loss": 0.6658, - "step": 391 - }, - { - "epoch": 2.020618556701031, - "grad_norm": 0.5113167762756348, - "learning_rate": 6.960638537825352e-05, - "loss": 0.5872, - "step": 392 - }, - { - "epoch": 2.0257731958762886, - "grad_norm": 0.5024546980857849, - "learning_rate": 6.944920697064004e-05, - "loss": 0.7016, - "step": 393 - }, - { - "epoch": 2.0309278350515463, - "grad_norm": 0.5500175952911377, - "learning_rate": 6.929180177215678e-05, - "loss": 0.6198, - "step": 394 - }, - { - "epoch": 2.036082474226804, - "grad_norm": 0.468348890542984, - "learning_rate": 6.91341716182545e-05, - "loss": 0.6204, - "step": 395 - }, - { - "epoch": 2.0412371134020617, - "grad_norm": 0.5198677778244019, - "learning_rate": 6.897631834700709e-05, - "loss": 0.5496, - "step": 396 - }, - { - "epoch": 2.0463917525773194, - "grad_norm": 0.5778740644454956, - "learning_rate": 6.881824379909017e-05, - "loss": 0.6795, - "step": 397 - }, - { - "epoch": 2.051546391752577, - "grad_norm": 0.44100233912467957, - "learning_rate": 6.865994981775957e-05, - "loss": 0.5305, - "step": 398 - }, - { - "epoch": 2.056701030927835, - "grad_norm": 0.5090631246566772, - "learning_rate": 6.850143824882986e-05, - "loss": 0.5958, - "step": 399 - }, - { - "epoch": 2.0618556701030926, - "grad_norm": 0.577616810798645, - "learning_rate": 6.834271094065283e-05, - "loss": 0.7075, - "step": 400 - }, - { - "epoch": 2.0670103092783507, - "grad_norm": 0.5909703373908997, - "learning_rate": 6.818376974409593e-05, - "loss": 0.629, - "step": 401 - }, - { - "epoch": 2.0721649484536084, - "grad_norm": 0.5738269686698914, - "learning_rate": 6.802461651252073e-05, - "loss": 0.7301, - "step": 402 - }, - { - "epoch": 2.077319587628866, - "grad_norm": 0.49154719710350037, - "learning_rate": 6.786525310176123e-05, - "loss": 0.6386, - "step": 403 - }, - { - "epoch": 2.082474226804124, - "grad_norm": 0.5145403742790222, - "learning_rate": 6.770568137010226e-05, - "loss": 0.6848, - "step": 404 - }, - { - "epoch": 2.0876288659793816, - "grad_norm": 0.496865838766098, - "learning_rate": 6.754590317825785e-05, - "loss": 0.605, - "step": 405 - }, - { - "epoch": 2.0927835051546393, - "grad_norm": 0.6585299372673035, - "learning_rate": 6.738592038934946e-05, - "loss": 0.7737, - "step": 406 - }, - { - "epoch": 2.097938144329897, - "grad_norm": 0.5088334679603577, - "learning_rate": 6.722573486888427e-05, - "loss": 0.6252, - "step": 407 - }, - { - "epoch": 2.1030927835051547, - "grad_norm": 0.5195786356925964, - "learning_rate": 6.706534848473352e-05, - "loss": 0.78, - "step": 408 - }, - { - "epoch": 2.1082474226804124, - "grad_norm": 0.6322928071022034, - "learning_rate": 6.69047631071106e-05, - "loss": 0.7107, - "step": 409 - }, - { - "epoch": 2.11340206185567, - "grad_norm": 0.5255240797996521, - "learning_rate": 6.674398060854931e-05, - "loss": 0.7664, - "step": 410 - }, - { - "epoch": 2.118556701030928, - "grad_norm": 0.5459422469139099, - "learning_rate": 6.658300286388203e-05, - "loss": 0.6449, - "step": 411 - }, - { - "epoch": 2.1237113402061856, - "grad_norm": 0.3948241174221039, - "learning_rate": 6.642183175021779e-05, - "loss": 0.5072, - "step": 412 - }, - { - "epoch": 2.1288659793814433, - "grad_norm": 0.6196532845497131, - "learning_rate": 6.62604691469205e-05, - "loss": 0.7733, - "step": 413 - }, - { - "epoch": 2.134020618556701, - "grad_norm": 0.6511134505271912, - "learning_rate": 6.609891693558692e-05, - "loss": 0.5305, - "step": 414 - }, - { - "epoch": 2.1391752577319587, - "grad_norm": 0.6522036194801331, - "learning_rate": 6.59371770000248e-05, - "loss": 0.7147, - "step": 415 - }, - { - "epoch": 2.1443298969072164, - "grad_norm": 0.532319188117981, - "learning_rate": 6.577525122623084e-05, - "loss": 0.6214, - "step": 416 - }, - { - "epoch": 2.149484536082474, - "grad_norm": 0.5139403343200684, - "learning_rate": 6.561314150236882e-05, - "loss": 0.551, - "step": 417 - }, - { - "epoch": 2.154639175257732, - "grad_norm": 0.6814868450164795, - "learning_rate": 6.545084971874738e-05, - "loss": 0.6843, - "step": 418 - }, - { - "epoch": 2.1597938144329896, - "grad_norm": 0.6147984862327576, - "learning_rate": 6.528837776779819e-05, - "loss": 0.5977, - "step": 419 - }, - { - "epoch": 2.1649484536082473, - "grad_norm": 0.5623170137405396, - "learning_rate": 6.51257275440538e-05, - "loss": 0.6759, - "step": 420 - }, - { - "epoch": 2.170103092783505, - "grad_norm": 0.491099089384079, - "learning_rate": 6.496290094412546e-05, - "loss": 0.5957, - "step": 421 - }, - { - "epoch": 2.1752577319587627, - "grad_norm": 0.6292416453361511, - "learning_rate": 6.479989986668118e-05, - "loss": 0.6852, - "step": 422 - }, - { - "epoch": 2.1804123711340204, - "grad_norm": 0.5607630610466003, - "learning_rate": 6.463672621242342e-05, - "loss": 0.5422, - "step": 423 - }, - { - "epoch": 2.1855670103092786, - "grad_norm": 0.5499788522720337, - "learning_rate": 6.447338188406704e-05, - "loss": 0.7426, - "step": 424 - }, - { - "epoch": 2.1907216494845363, - "grad_norm": 0.5566446781158447, - "learning_rate": 6.430986878631707e-05, - "loss": 0.518, - "step": 425 - }, - { - "epoch": 2.195876288659794, - "grad_norm": 0.5721368789672852, - "learning_rate": 6.41461888258465e-05, - "loss": 0.5814, - "step": 426 - }, - { - "epoch": 2.2010309278350517, - "grad_norm": 0.6252114176750183, - "learning_rate": 6.398234391127406e-05, - "loss": 0.8248, - "step": 427 - }, - { - "epoch": 2.2061855670103094, - "grad_norm": 0.5612536668777466, - "learning_rate": 6.381833595314195e-05, - "loss": 0.6407, - "step": 428 - }, - { - "epoch": 2.211340206185567, - "grad_norm": 0.6003265380859375, - "learning_rate": 6.365416686389358e-05, - "loss": 0.7016, - "step": 429 - }, - { - "epoch": 2.216494845360825, - "grad_norm": 0.4541587829589844, - "learning_rate": 6.348983855785121e-05, - "loss": 0.5283, - "step": 430 - }, - { - "epoch": 2.2216494845360826, - "grad_norm": 0.5990555882453918, - "learning_rate": 6.332535295119377e-05, - "loss": 0.6052, - "step": 431 - }, - { - "epoch": 2.2268041237113403, - "grad_norm": 0.5104555487632751, - "learning_rate": 6.31607119619343e-05, - "loss": 0.551, - "step": 432 - }, - { - "epoch": 2.231958762886598, - "grad_norm": 0.5985530018806458, - "learning_rate": 6.299591750989779e-05, - "loss": 0.6921, - "step": 433 - }, - { - "epoch": 2.2371134020618557, - "grad_norm": 0.5501850843429565, - "learning_rate": 6.283097151669869e-05, - "loss": 0.6221, - "step": 434 - }, - { - "epoch": 2.2422680412371134, - "grad_norm": 0.6366755366325378, - "learning_rate": 6.266587590571852e-05, - "loss": 0.8114, - "step": 435 - }, - { - "epoch": 2.247422680412371, - "grad_norm": 0.6541351675987244, - "learning_rate": 6.250063260208346e-05, - "loss": 0.7157, - "step": 436 - }, - { - "epoch": 2.252577319587629, - "grad_norm": 0.4696384370326996, - "learning_rate": 6.233524353264187e-05, - "loss": 0.5158, - "step": 437 - }, - { - "epoch": 2.2577319587628866, - "grad_norm": 0.5908606648445129, - "learning_rate": 6.216971062594179e-05, - "loss": 0.5988, - "step": 438 - }, - { - "epoch": 2.2628865979381443, - "grad_norm": 0.6051414608955383, - "learning_rate": 6.200403581220861e-05, - "loss": 0.5305, - "step": 439 - }, - { - "epoch": 2.268041237113402, - "grad_norm": 0.544257402420044, - "learning_rate": 6.183822102332234e-05, - "loss": 0.5912, - "step": 440 - }, - { - "epoch": 2.2731958762886597, - "grad_norm": 0.6186553239822388, - "learning_rate": 6.167226819279528e-05, - "loss": 0.7164, - "step": 441 - }, - { - "epoch": 2.2783505154639174, - "grad_norm": 0.6269963979721069, - "learning_rate": 6.150617925574933e-05, - "loss": 0.7621, - "step": 442 - }, - { - "epoch": 2.283505154639175, - "grad_norm": 0.5552319288253784, - "learning_rate": 6.13399561488935e-05, - "loss": 0.6343, - "step": 443 - }, - { - "epoch": 2.288659793814433, - "grad_norm": 0.5796776413917542, - "learning_rate": 6.117360081050136e-05, - "loss": 0.6439, - "step": 444 - }, - { - "epoch": 2.2938144329896906, - "grad_norm": 0.7522923946380615, - "learning_rate": 6.1007115180388285e-05, - "loss": 0.716, - "step": 445 - }, - { - "epoch": 2.2989690721649483, - "grad_norm": 0.6246132850646973, - "learning_rate": 6.0840501199889046e-05, - "loss": 0.5752, - "step": 446 - }, - { - "epoch": 2.304123711340206, - "grad_norm": 0.6404234766960144, - "learning_rate": 6.067376081183499e-05, - "loss": 0.6072, - "step": 447 - }, - { - "epoch": 2.3092783505154637, - "grad_norm": 0.5800334811210632, - "learning_rate": 6.050689596053151e-05, - "loss": 0.6807, - "step": 448 - }, - { - "epoch": 2.3144329896907214, - "grad_norm": 0.5738640427589417, - "learning_rate": 6.0339908591735296e-05, - "loss": 0.6723, - "step": 449 - }, - { - "epoch": 2.319587628865979, - "grad_norm": 0.5659841895103455, - "learning_rate": 6.01728006526317e-05, - "loss": 0.5924, - "step": 450 - }, - { - "epoch": 2.3247422680412373, - "grad_norm": 0.6528278589248657, - "learning_rate": 6.0005574091811964e-05, - "loss": 0.5736, - "step": 451 - }, - { - "epoch": 2.329896907216495, - "grad_norm": 0.5550704002380371, - "learning_rate": 5.9838230859250586e-05, - "loss": 0.5671, - "step": 452 - }, - { - "epoch": 2.3350515463917527, - "grad_norm": 0.5788085460662842, - "learning_rate": 5.967077290628249e-05, - "loss": 0.6304, - "step": 453 - }, - { - "epoch": 2.3402061855670104, - "grad_norm": 0.5700084567070007, - "learning_rate": 5.950320218558037e-05, - "loss": 0.6385, - "step": 454 - }, - { - "epoch": 2.345360824742268, - "grad_norm": 0.7324642539024353, - "learning_rate": 5.9335520651131814e-05, - "loss": 0.6061, - "step": 455 - }, - { - "epoch": 2.350515463917526, - "grad_norm": 0.5059091448783875, - "learning_rate": 5.9167730258216627e-05, - "loss": 0.5316, - "step": 456 - }, - { - "epoch": 2.3556701030927836, - "grad_norm": 0.678951621055603, - "learning_rate": 5.899983296338392e-05, - "loss": 0.6013, - "step": 457 - }, - { - "epoch": 2.3608247422680413, - "grad_norm": 0.6194437742233276, - "learning_rate": 5.8831830724429384e-05, - "loss": 0.5817, - "step": 458 - }, - { - "epoch": 2.365979381443299, - "grad_norm": 0.6131486892700195, - "learning_rate": 5.866372550037242e-05, - "loss": 0.5523, - "step": 459 - }, - { - "epoch": 2.3711340206185567, - "grad_norm": 0.5392377376556396, - "learning_rate": 5.849551925143334e-05, - "loss": 0.6512, - "step": 460 - }, - { - "epoch": 2.3762886597938144, - "grad_norm": 0.6425219774246216, - "learning_rate": 5.8327213939010414e-05, - "loss": 0.6352, - "step": 461 - }, - { - "epoch": 2.381443298969072, - "grad_norm": 0.5376563668251038, - "learning_rate": 5.815881152565712e-05, - "loss": 0.6266, - "step": 462 - }, - { - "epoch": 2.38659793814433, - "grad_norm": 0.6099573969841003, - "learning_rate": 5.799031397505913e-05, - "loss": 0.6225, - "step": 463 - }, - { - "epoch": 2.3917525773195876, - "grad_norm": 0.5236539244651794, - "learning_rate": 5.782172325201155e-05, - "loss": 0.6142, - "step": 464 - }, - { - "epoch": 2.3969072164948453, - "grad_norm": 0.5796985626220703, - "learning_rate": 5.7653041322395895e-05, - "loss": 0.5774, - "step": 465 - }, - { - "epoch": 2.402061855670103, - "grad_norm": 0.6779174208641052, - "learning_rate": 5.748427015315722e-05, - "loss": 0.7775, - "step": 466 - }, - { - "epoch": 2.4072164948453607, - "grad_norm": 0.5431979894638062, - "learning_rate": 5.7315411712281186e-05, - "loss": 0.5797, - "step": 467 - }, - { - "epoch": 2.4123711340206184, - "grad_norm": 0.5350040197372437, - "learning_rate": 5.714646796877108e-05, - "loss": 0.5288, - "step": 468 - }, - { - "epoch": 2.417525773195876, - "grad_norm": 0.5466572642326355, - "learning_rate": 5.697744089262491e-05, - "loss": 0.5502, - "step": 469 - }, - { - "epoch": 2.422680412371134, - "grad_norm": 0.6810032725334167, - "learning_rate": 5.680833245481234e-05, - "loss": 0.581, - "step": 470 - }, - { - "epoch": 2.4278350515463916, - "grad_norm": 0.6316937208175659, - "learning_rate": 5.6639144627251816e-05, - "loss": 0.7774, - "step": 471 - }, - { - "epoch": 2.4329896907216497, - "grad_norm": 0.5792400240898132, - "learning_rate": 5.646987938278753e-05, - "loss": 0.5606, - "step": 472 - }, - { - "epoch": 2.4381443298969074, - "grad_norm": 0.5614747405052185, - "learning_rate": 5.630053869516635e-05, - "loss": 0.5707, - "step": 473 - }, - { - "epoch": 2.443298969072165, - "grad_norm": 0.6040056347846985, - "learning_rate": 5.6131124539014926e-05, - "loss": 0.5709, - "step": 474 - }, - { - "epoch": 2.448453608247423, - "grad_norm": 0.5701522827148438, - "learning_rate": 5.596163888981656e-05, - "loss": 0.6838, - "step": 475 - }, - { - "epoch": 2.4536082474226806, - "grad_norm": 0.6044667363166809, - "learning_rate": 5.5792083723888225e-05, - "loss": 0.6885, - "step": 476 - }, - { - "epoch": 2.4587628865979383, - "grad_norm": 0.5631812214851379, - "learning_rate": 5.5622461018357486e-05, - "loss": 0.6584, - "step": 477 - }, - { - "epoch": 2.463917525773196, - "grad_norm": 0.5131292939186096, - "learning_rate": 5.5452772751139496e-05, - "loss": 0.4903, - "step": 478 - }, - { - "epoch": 2.4690721649484537, - "grad_norm": 0.7719168066978455, - "learning_rate": 5.5283020900913886e-05, - "loss": 0.673, - "step": 479 - }, - { - "epoch": 2.4742268041237114, - "grad_norm": 0.7306579947471619, - "learning_rate": 5.511320744710171e-05, - "loss": 0.9067, - "step": 480 - }, - { - "epoch": 2.479381443298969, - "grad_norm": 0.7012883424758911, - "learning_rate": 5.494333436984238e-05, - "loss": 0.6512, - "step": 481 - }, - { - "epoch": 2.484536082474227, - "grad_norm": 0.5695425868034363, - "learning_rate": 5.477340364997051e-05, - "loss": 0.6639, - "step": 482 - }, - { - "epoch": 2.4896907216494846, - "grad_norm": 0.561549723148346, - "learning_rate": 5.460341726899291e-05, - "loss": 0.6025, - "step": 483 - }, - { - "epoch": 2.4948453608247423, - "grad_norm": 0.6385191679000854, - "learning_rate": 5.4433377209065414e-05, - "loss": 0.6205, - "step": 484 - }, - { - "epoch": 2.5, - "grad_norm": 0.5434442758560181, - "learning_rate": 5.4263285452969806e-05, - "loss": 0.5898, - "step": 485 - }, - { - "epoch": 2.5, - "eval_loss": 0.7724881768226624, - "eval_runtime": 23.1178, - "eval_samples_per_second": 7.094, - "eval_steps_per_second": 1.774, - "step": 485 - }, - { - "epoch": 2.5051546391752577, - "grad_norm": 0.6231464147567749, - "learning_rate": 5.409314398409067e-05, - "loss": 0.5591, - "step": 486 - }, - { - "epoch": 2.5103092783505154, - "grad_norm": 0.5185215473175049, - "learning_rate": 5.392295478639225e-05, - "loss": 0.6953, - "step": 487 - }, - { - "epoch": 2.515463917525773, - "grad_norm": 0.6416254639625549, - "learning_rate": 5.3752719844395405e-05, - "loss": 0.7254, - "step": 488 - }, - { - "epoch": 2.520618556701031, - "grad_norm": 0.5942103266716003, - "learning_rate": 5.358244114315434e-05, - "loss": 0.629, - "step": 489 - }, - { - "epoch": 2.5257731958762886, - "grad_norm": 0.5755306482315063, - "learning_rate": 5.341212066823355e-05, - "loss": 0.6762, - "step": 490 - }, - { - "epoch": 2.5309278350515463, - "grad_norm": 0.6905494332313538, - "learning_rate": 5.324176040568465e-05, - "loss": 0.6028, - "step": 491 - }, - { - "epoch": 2.536082474226804, - "grad_norm": 0.6803380846977234, - "learning_rate": 5.307136234202318e-05, - "loss": 0.6943, - "step": 492 - }, - { - "epoch": 2.5412371134020617, - "grad_norm": 0.5133246779441833, - "learning_rate": 5.290092846420548e-05, - "loss": 0.6591, - "step": 493 - }, - { - "epoch": 2.5463917525773194, - "grad_norm": 0.5719724893569946, - "learning_rate": 5.27304607596055e-05, - "loss": 0.7672, - "step": 494 - }, - { - "epoch": 2.551546391752577, - "grad_norm": 0.5757763981819153, - "learning_rate": 5.255996121599167e-05, - "loss": 0.683, - "step": 495 - }, - { - "epoch": 2.556701030927835, - "grad_norm": 0.6311424374580383, - "learning_rate": 5.2389431821503606e-05, - "loss": 0.6142, - "step": 496 - }, - { - "epoch": 2.5618556701030926, - "grad_norm": 0.653990626335144, - "learning_rate": 5.221887456462907e-05, - "loss": 0.7109, - "step": 497 - }, - { - "epoch": 2.5670103092783503, - "grad_norm": 0.6488648653030396, - "learning_rate": 5.2048291434180716e-05, - "loss": 0.6484, - "step": 498 - }, - { - "epoch": 2.572164948453608, - "grad_norm": 0.6430789828300476, - "learning_rate": 5.1877684419272875e-05, - "loss": 0.6893, - "step": 499 - }, - { - "epoch": 2.5773195876288657, - "grad_norm": 0.701564610004425, - "learning_rate": 5.1707055509298396e-05, - "loss": 0.6976, - "step": 500 - }, - { - "epoch": 2.582474226804124, - "grad_norm": 0.6223208904266357, - "learning_rate": 5.153640669390546e-05, - "loss": 0.7153, - "step": 501 - }, - { - "epoch": 2.5876288659793816, - "grad_norm": 0.7090323567390442, - "learning_rate": 5.1365739962974304e-05, - "loss": 0.6964, - "step": 502 - }, - { - "epoch": 2.5927835051546393, - "grad_norm": 0.6134834885597229, - "learning_rate": 5.119505730659413e-05, - "loss": 0.5904, - "step": 503 - }, - { - "epoch": 2.597938144329897, - "grad_norm": 0.6040003895759583, - "learning_rate": 5.102436071503982e-05, - "loss": 0.5459, - "step": 504 - }, - { - "epoch": 2.6030927835051547, - "grad_norm": 0.6588661670684814, - "learning_rate": 5.0853652178748746e-05, - "loss": 0.5639, - "step": 505 - }, - { - "epoch": 2.6082474226804124, - "grad_norm": 0.6262742280960083, - "learning_rate": 5.068293368829755e-05, - "loss": 0.6609, - "step": 506 - }, - { - "epoch": 2.61340206185567, - "grad_norm": 0.6228057742118835, - "learning_rate": 5.0512207234379004e-05, - "loss": 0.5804, - "step": 507 - }, - { - "epoch": 2.618556701030928, - "grad_norm": 0.6566773056983948, - "learning_rate": 5.0341474807778663e-05, - "loss": 0.7437, - "step": 508 - }, - { - "epoch": 2.6237113402061856, - "grad_norm": 0.5632613301277161, - "learning_rate": 5.017073839935178e-05, - "loss": 0.6294, - "step": 509 - }, - { - "epoch": 2.6288659793814433, - "grad_norm": 0.7143430113792419, - "learning_rate": 5e-05, - "loss": 0.6383, - "step": 510 - }, - { - "epoch": 2.634020618556701, - "grad_norm": 0.5768882036209106, - "learning_rate": 4.982926160064823e-05, - "loss": 0.569, - "step": 511 - }, - { - "epoch": 2.6391752577319587, - "grad_norm": 0.6064475178718567, - "learning_rate": 4.965852519222134e-05, - "loss": 0.5433, - "step": 512 - }, - { - "epoch": 2.6443298969072164, - "grad_norm": 0.6445603966712952, - "learning_rate": 4.948779276562101e-05, - "loss": 0.623, - "step": 513 - }, - { - "epoch": 2.649484536082474, - "grad_norm": 0.6420986652374268, - "learning_rate": 4.9317066311702456e-05, - "loss": 0.7876, - "step": 514 - }, - { - "epoch": 2.654639175257732, - "grad_norm": 0.7606147527694702, - "learning_rate": 4.9146347821251266e-05, - "loss": 0.6335, - "step": 515 - }, - { - "epoch": 2.6597938144329896, - "grad_norm": 0.6954572200775146, - "learning_rate": 4.89756392849602e-05, - "loss": 0.7617, - "step": 516 - }, - { - "epoch": 2.6649484536082473, - "grad_norm": 0.7447741031646729, - "learning_rate": 4.880494269340588e-05, - "loss": 0.6967, - "step": 517 - }, - { - "epoch": 2.670103092783505, - "grad_norm": 0.553658127784729, - "learning_rate": 4.863426003702572e-05, - "loss": 0.6838, - "step": 518 - }, - { - "epoch": 2.675257731958763, - "grad_norm": 0.5830461978912354, - "learning_rate": 4.8463593306094555e-05, - "loss": 0.5642, - "step": 519 - }, - { - "epoch": 2.680412371134021, - "grad_norm": 0.4478597342967987, - "learning_rate": 4.829294449070161e-05, - "loss": 0.5929, - "step": 520 - }, - { - "epoch": 2.6855670103092786, - "grad_norm": 0.5738862752914429, - "learning_rate": 4.8122315580727136e-05, - "loss": 0.5355, - "step": 521 - }, - { - "epoch": 2.6907216494845363, - "grad_norm": 0.6429448127746582, - "learning_rate": 4.795170856581929e-05, - "loss": 0.7282, - "step": 522 - }, - { - "epoch": 2.695876288659794, - "grad_norm": 0.6432612538337708, - "learning_rate": 4.778112543537094e-05, - "loss": 0.7107, - "step": 523 - }, - { - "epoch": 2.7010309278350517, - "grad_norm": 0.6002117991447449, - "learning_rate": 4.7610568178496405e-05, - "loss": 0.5271, - "step": 524 - }, - { - "epoch": 2.7061855670103094, - "grad_norm": 0.6462201476097107, - "learning_rate": 4.744003878400835e-05, - "loss": 0.6236, - "step": 525 - }, - { - "epoch": 2.711340206185567, - "grad_norm": 0.6209389567375183, - "learning_rate": 4.726953924039451e-05, - "loss": 0.6871, - "step": 526 - }, - { - "epoch": 2.716494845360825, - "grad_norm": 0.5495082139968872, - "learning_rate": 4.709907153579454e-05, - "loss": 0.5909, - "step": 527 - }, - { - "epoch": 2.7216494845360826, - "grad_norm": 0.6679218411445618, - "learning_rate": 4.692863765797683e-05, - "loss": 0.6767, - "step": 528 - }, - { - "epoch": 2.7268041237113403, - "grad_norm": 0.5396760702133179, - "learning_rate": 4.675823959431535e-05, - "loss": 0.5304, - "step": 529 - }, - { - "epoch": 2.731958762886598, - "grad_norm": 0.6522016525268555, - "learning_rate": 4.658787933176646e-05, - "loss": 0.5896, - "step": 530 - }, - { - "epoch": 2.7371134020618557, - "grad_norm": 0.6318926811218262, - "learning_rate": 4.641755885684566e-05, - "loss": 0.8534, - "step": 531 - }, - { - "epoch": 2.7422680412371134, - "grad_norm": 0.6719669103622437, - "learning_rate": 4.624728015560461e-05, - "loss": 0.703, - "step": 532 - }, - { - "epoch": 2.747422680412371, - "grad_norm": 0.7230966091156006, - "learning_rate": 4.607704521360776e-05, - "loss": 0.7014, - "step": 533 - }, - { - "epoch": 2.752577319587629, - "grad_norm": 0.5768046379089355, - "learning_rate": 4.590685601590936e-05, - "loss": 0.5369, - "step": 534 - }, - { - "epoch": 2.7577319587628866, - "grad_norm": 0.7714840769767761, - "learning_rate": 4.57367145470302e-05, - "loss": 0.6511, - "step": 535 - }, - { - "epoch": 2.7628865979381443, - "grad_norm": 0.660317063331604, - "learning_rate": 4.5566622790934604e-05, - "loss": 0.6605, - "step": 536 - }, - { - "epoch": 2.768041237113402, - "grad_norm": 0.6877678632736206, - "learning_rate": 4.5396582731007095e-05, - "loss": 0.7078, - "step": 537 - }, - { - "epoch": 2.7731958762886597, - "grad_norm": 0.6704768538475037, - "learning_rate": 4.52265963500295e-05, - "loss": 0.6474, - "step": 538 - }, - { - "epoch": 2.7783505154639174, - "grad_norm": 0.7046148777008057, - "learning_rate": 4.505666563015763e-05, - "loss": 0.5439, - "step": 539 - }, - { - "epoch": 2.783505154639175, - "grad_norm": 0.6655017733573914, - "learning_rate": 4.4886792552898286e-05, - "loss": 0.8095, - "step": 540 - }, - { - "epoch": 2.788659793814433, - "grad_norm": 0.7060091495513916, - "learning_rate": 4.471697909908613e-05, - "loss": 0.66, - "step": 541 - }, - { - "epoch": 2.7938144329896906, - "grad_norm": 0.6830212473869324, - "learning_rate": 4.454722724886051e-05, - "loss": 0.7703, - "step": 542 - }, - { - "epoch": 2.7989690721649483, - "grad_norm": 0.7285298109054565, - "learning_rate": 4.437753898164254e-05, - "loss": 0.7986, - "step": 543 - }, - { - "epoch": 2.804123711340206, - "grad_norm": 0.6340286731719971, - "learning_rate": 4.420791627611179e-05, - "loss": 0.5987, - "step": 544 - }, - { - "epoch": 2.8092783505154637, - "grad_norm": 0.6459646224975586, - "learning_rate": 4.403836111018346e-05, - "loss": 0.7153, - "step": 545 - }, - { - "epoch": 2.8144329896907214, - "grad_norm": 0.5636230707168579, - "learning_rate": 4.3868875460985085e-05, - "loss": 0.5108, - "step": 546 - }, - { - "epoch": 2.819587628865979, - "grad_norm": 0.65835040807724, - "learning_rate": 4.369946130483364e-05, - "loss": 0.6884, - "step": 547 - }, - { - "epoch": 2.824742268041237, - "grad_norm": 0.7029833197593689, - "learning_rate": 4.353012061721249e-05, - "loss": 0.69, - "step": 548 - }, - { - "epoch": 2.829896907216495, - "grad_norm": 0.8330891728401184, - "learning_rate": 4.336085537274818e-05, - "loss": 0.7947, - "step": 549 - }, - { - "epoch": 2.8350515463917527, - "grad_norm": 0.5834147334098816, - "learning_rate": 4.319166754518768e-05, - "loss": 0.5213, - "step": 550 - }, - { - "epoch": 2.8402061855670104, - "grad_norm": 0.7061026692390442, - "learning_rate": 4.3022559107375106e-05, - "loss": 0.7841, - "step": 551 - }, - { - "epoch": 2.845360824742268, - "grad_norm": 0.5975331664085388, - "learning_rate": 4.285353203122893e-05, - "loss": 0.6662, - "step": 552 - }, - { - "epoch": 2.850515463917526, - "grad_norm": 0.6245263814926147, - "learning_rate": 4.268458828771883e-05, - "loss": 0.6272, - "step": 553 - }, - { - "epoch": 2.8556701030927836, - "grad_norm": 0.611504852771759, - "learning_rate": 4.251572984684281e-05, - "loss": 0.6917, - "step": 554 - }, - { - "epoch": 2.8608247422680413, - "grad_norm": 0.7087809443473816, - "learning_rate": 4.234695867760412e-05, - "loss": 0.6055, - "step": 555 - }, - { - "epoch": 2.865979381443299, - "grad_norm": 0.6241830587387085, - "learning_rate": 4.2178276747988446e-05, - "loss": 0.7149, - "step": 556 - }, - { - "epoch": 2.8711340206185567, - "grad_norm": 0.6535329222679138, - "learning_rate": 4.200968602494087e-05, - "loss": 0.628, - "step": 557 - }, - { - "epoch": 2.8762886597938144, - "grad_norm": 0.515200138092041, - "learning_rate": 4.18411884743429e-05, - "loss": 0.5361, - "step": 558 - }, - { - "epoch": 2.881443298969072, - "grad_norm": 0.7569003105163574, - "learning_rate": 4.16727860609896e-05, - "loss": 0.6474, - "step": 559 - }, - { - "epoch": 2.88659793814433, - "grad_norm": 0.6560523509979248, - "learning_rate": 4.150448074856667e-05, - "loss": 0.7475, - "step": 560 - }, - { - "epoch": 2.8917525773195876, - "grad_norm": 0.7731671333312988, - "learning_rate": 4.1336274499627596e-05, - "loss": 0.7221, - "step": 561 - }, - { - "epoch": 2.8969072164948453, - "grad_norm": 0.748393714427948, - "learning_rate": 4.1168169275570635e-05, - "loss": 0.6456, - "step": 562 - }, - { - "epoch": 2.902061855670103, - "grad_norm": 0.5793806910514832, - "learning_rate": 4.1000167036616113e-05, - "loss": 0.6022, - "step": 563 - }, - { - "epoch": 2.9072164948453607, - "grad_norm": 0.6097270250320435, - "learning_rate": 4.083226974178339e-05, - "loss": 0.6723, - "step": 564 - }, - { - "epoch": 2.9123711340206184, - "grad_norm": 0.7190105319023132, - "learning_rate": 4.066447934886819e-05, - "loss": 0.6841, - "step": 565 - }, - { - "epoch": 2.917525773195876, - "grad_norm": 0.781724750995636, - "learning_rate": 4.049679781441965e-05, - "loss": 0.6372, - "step": 566 - }, - { - "epoch": 2.9226804123711343, - "grad_norm": 0.5826302766799927, - "learning_rate": 4.0329227093717515e-05, - "loss": 0.617, - "step": 567 - }, - { - "epoch": 2.927835051546392, - "grad_norm": 0.5542965531349182, - "learning_rate": 4.016176914074944e-05, - "loss": 0.6842, - "step": 568 - }, - { - "epoch": 2.9329896907216497, - "grad_norm": 0.6216481328010559, - "learning_rate": 3.999442590818804e-05, - "loss": 0.7328, - "step": 569 - }, - { - "epoch": 2.9381443298969074, - "grad_norm": 0.7261362075805664, - "learning_rate": 3.982719934736832e-05, - "loss": 0.7162, - "step": 570 - }, - { - "epoch": 2.943298969072165, - "grad_norm": 0.8381164073944092, - "learning_rate": 3.9660091408264716e-05, - "loss": 0.6958, - "step": 571 - }, - { - "epoch": 2.948453608247423, - "grad_norm": 0.5935752391815186, - "learning_rate": 3.949310403946849e-05, - "loss": 0.6205, - "step": 572 - }, - { - "epoch": 2.9536082474226806, - "grad_norm": 0.7071570754051208, - "learning_rate": 3.9326239188165025e-05, - "loss": 0.7817, - "step": 573 - }, - { - "epoch": 2.9587628865979383, - "grad_norm": 0.6648340821266174, - "learning_rate": 3.915949880011096e-05, - "loss": 0.7229, - "step": 574 - }, - { - "epoch": 2.963917525773196, - "grad_norm": 0.5541516542434692, - "learning_rate": 3.899288481961173e-05, - "loss": 0.5545, - "step": 575 - }, - { - "epoch": 2.9690721649484537, - "grad_norm": 0.7823721766471863, - "learning_rate": 3.8826399189498654e-05, - "loss": 0.784, - "step": 576 - }, - { - "epoch": 2.9742268041237114, - "grad_norm": 0.7309272885322571, - "learning_rate": 3.86600438511065e-05, - "loss": 0.7443, - "step": 577 - }, - { - "epoch": 2.979381443298969, - "grad_norm": 0.6938033699989319, - "learning_rate": 3.8493820744250685e-05, - "loss": 0.7407, - "step": 578 - }, - { - "epoch": 2.984536082474227, - "grad_norm": 0.49541375041007996, - "learning_rate": 3.832773180720475e-05, - "loss": 0.6053, - "step": 579 - }, - { - "epoch": 2.9896907216494846, - "grad_norm": 0.6114165782928467, - "learning_rate": 3.8161778976677666e-05, - "loss": 0.5553, - "step": 580 - }, - { - "epoch": 2.9948453608247423, - "grad_norm": 0.7064359188079834, - "learning_rate": 3.79959641877914e-05, - "loss": 0.5511, - "step": 581 - }, - { - "epoch": 3.0, - "grad_norm": 0.6850186586380005, - "learning_rate": 3.783028937405821e-05, - "loss": 0.5452, - "step": 582 - }, - { - "epoch": 3.0, - "eval_loss": 0.7680767178535461, - "eval_runtime": 23.1292, - "eval_samples_per_second": 7.091, - "eval_steps_per_second": 1.773, - "step": 582 - }, - { - "epoch": 3.0051546391752577, - "grad_norm": 0.592870831489563, - "learning_rate": 3.766475646735815e-05, - "loss": 0.6334, - "step": 583 - }, - { - "epoch": 3.0103092783505154, - "grad_norm": 0.7153403162956238, - "learning_rate": 3.7499367397916555e-05, - "loss": 0.6852, - "step": 584 - }, - { - "epoch": 3.015463917525773, - "grad_norm": 0.6092976927757263, - "learning_rate": 3.733412409428148e-05, - "loss": 0.5763, - "step": 585 - }, - { - "epoch": 3.020618556701031, - "grad_norm": 0.6821876168251038, - "learning_rate": 3.716902848330133e-05, - "loss": 0.5305, - "step": 586 - }, - { - "epoch": 3.0257731958762886, - "grad_norm": 0.6120645403862, - "learning_rate": 3.7004082490102226e-05, - "loss": 0.6668, - "step": 587 - }, - { - "epoch": 3.0309278350515463, - "grad_norm": 0.5387507081031799, - "learning_rate": 3.6839288038065734e-05, - "loss": 0.5388, - "step": 588 - }, - { - "epoch": 3.036082474226804, - "grad_norm": 0.6281033158302307, - "learning_rate": 3.667464704880625e-05, - "loss": 0.6401, - "step": 589 - }, - { - "epoch": 3.0412371134020617, - "grad_norm": 0.7044920921325684, - "learning_rate": 3.651016144214878e-05, - "loss": 0.5057, - "step": 590 - }, - { - "epoch": 3.0463917525773194, - "grad_norm": 0.7941780090332031, - "learning_rate": 3.634583313610644e-05, - "loss": 0.8192, - "step": 591 - }, - { - "epoch": 3.051546391752577, - "grad_norm": 0.7709544897079468, - "learning_rate": 3.618166404685805e-05, - "loss": 0.4853, - "step": 592 - }, - { - "epoch": 3.056701030927835, - "grad_norm": 0.5864732265472412, - "learning_rate": 3.601765608872595e-05, - "loss": 0.6387, - "step": 593 - }, - { - "epoch": 3.0618556701030926, - "grad_norm": 0.8262513875961304, - "learning_rate": 3.585381117415349e-05, - "loss": 0.602, - "step": 594 - }, - { - "epoch": 3.0670103092783507, - "grad_norm": 0.6160711050033569, - "learning_rate": 3.5690131213682944e-05, - "loss": 0.4651, - "step": 595 - }, - { - "epoch": 3.0721649484536084, - "grad_norm": 0.7176710367202759, - "learning_rate": 3.5526618115932975e-05, - "loss": 0.8127, - "step": 596 - }, - { - "epoch": 3.077319587628866, - "grad_norm": 0.7125350832939148, - "learning_rate": 3.53632737875766e-05, - "loss": 0.4828, - "step": 597 - }, - { - "epoch": 3.082474226804124, - "grad_norm": 0.7212585210800171, - "learning_rate": 3.5200100133318834e-05, - "loss": 0.5628, - "step": 598 - }, - { - "epoch": 3.0876288659793816, - "grad_norm": 0.5740189552307129, - "learning_rate": 3.5037099055874536e-05, - "loss": 0.4883, - "step": 599 - }, - { - "epoch": 3.0927835051546393, - "grad_norm": 0.7745830416679382, - "learning_rate": 3.487427245594622e-05, - "loss": 0.566, - "step": 600 - }, - { - "epoch": 3.097938144329897, - "grad_norm": 0.6833390593528748, - "learning_rate": 3.47116222322018e-05, - "loss": 0.5527, - "step": 601 - }, - { - "epoch": 3.1030927835051547, - "grad_norm": 0.6211826205253601, - "learning_rate": 3.4549150281252636e-05, - "loss": 0.5606, - "step": 602 - }, - { - "epoch": 3.1082474226804124, - "grad_norm": 0.6371477246284485, - "learning_rate": 3.4386858497631205e-05, - "loss": 0.5087, - "step": 603 - }, - { - "epoch": 3.11340206185567, - "grad_norm": 0.688470721244812, - "learning_rate": 3.422474877376917e-05, - "loss": 0.5553, - "step": 604 - }, - { - "epoch": 3.118556701030928, - "grad_norm": 0.6175742149353027, - "learning_rate": 3.406282299997521e-05, - "loss": 0.6158, - "step": 605 - }, - { - "epoch": 3.1237113402061856, - "grad_norm": 0.7345418334007263, - "learning_rate": 3.3901083064413095e-05, - "loss": 0.6047, - "step": 606 - }, - { - "epoch": 3.1288659793814433, - "grad_norm": 0.5087317824363708, - "learning_rate": 3.3739530853079516e-05, - "loss": 0.4364, - "step": 607 - }, - { - "epoch": 3.134020618556701, - "grad_norm": 0.6300753355026245, - "learning_rate": 3.357816824978222e-05, - "loss": 0.5034, - "step": 608 - }, - { - "epoch": 3.1391752577319587, - "grad_norm": 0.6126660704612732, - "learning_rate": 3.341699713611799e-05, - "loss": 0.5927, - "step": 609 - }, - { - "epoch": 3.1443298969072164, - "grad_norm": 0.7594387531280518, - "learning_rate": 3.325601939145069e-05, - "loss": 0.5538, - "step": 610 - }, - { - "epoch": 3.149484536082474, - "grad_norm": 0.7265405654907227, - "learning_rate": 3.309523689288941e-05, - "loss": 0.5529, - "step": 611 - }, - { - "epoch": 3.154639175257732, - "grad_norm": 0.6953518986701965, - "learning_rate": 3.293465151526649e-05, - "loss": 0.6362, - "step": 612 - }, - { - "epoch": 3.1597938144329896, - "grad_norm": 0.6807132363319397, - "learning_rate": 3.277426513111575e-05, - "loss": 0.6603, - "step": 613 - }, - { - "epoch": 3.1649484536082473, - "grad_norm": 0.6613560914993286, - "learning_rate": 3.261407961065056e-05, - "loss": 0.6489, - "step": 614 - }, - { - "epoch": 3.170103092783505, - "grad_norm": 0.6142688989639282, - "learning_rate": 3.245409682174217e-05, - "loss": 0.4752, - "step": 615 - }, - { - "epoch": 3.1752577319587627, - "grad_norm": 0.6799762845039368, - "learning_rate": 3.229431862989775e-05, - "loss": 0.6954, - "step": 616 - }, - { - "epoch": 3.1804123711340204, - "grad_norm": 0.732398509979248, - "learning_rate": 3.2134746898238774e-05, - "loss": 0.5572, - "step": 617 - }, - { - "epoch": 3.1855670103092786, - "grad_norm": 0.7211857438087463, - "learning_rate": 3.197538348747927e-05, - "loss": 0.5869, - "step": 618 - }, - { - "epoch": 3.1907216494845363, - "grad_norm": 0.5751710534095764, - "learning_rate": 3.181623025590405e-05, - "loss": 0.4213, - "step": 619 - }, - { - "epoch": 3.195876288659794, - "grad_norm": 0.4845646917819977, - "learning_rate": 3.165728905934718e-05, - "loss": 0.3849, - "step": 620 - }, - { - "epoch": 3.2010309278350517, - "grad_norm": 0.7036471366882324, - "learning_rate": 3.149856175117014e-05, - "loss": 0.4576, - "step": 621 - }, - { - "epoch": 3.2061855670103094, - "grad_norm": 0.7593416571617126, - "learning_rate": 3.134005018224044e-05, - "loss": 0.6383, - "step": 622 - }, - { - "epoch": 3.211340206185567, - "grad_norm": 0.8135774731636047, - "learning_rate": 3.118175620090983e-05, - "loss": 0.5818, - "step": 623 - }, - { - "epoch": 3.216494845360825, - "grad_norm": 0.6283600330352783, - "learning_rate": 3.1023681652992926e-05, - "loss": 0.6409, - "step": 624 - }, - { - "epoch": 3.2216494845360826, - "grad_norm": 0.7765624523162842, - "learning_rate": 3.086582838174551e-05, - "loss": 0.6392, - "step": 625 - }, - { - "epoch": 3.2268041237113403, - "grad_norm": 0.6492854356765747, - "learning_rate": 3.070819822784323e-05, - "loss": 0.5593, - "step": 626 - }, - { - "epoch": 3.231958762886598, - "grad_norm": 0.8030527234077454, - "learning_rate": 3.055079302935997e-05, - "loss": 0.548, - "step": 627 - }, - { - "epoch": 3.2371134020618557, - "grad_norm": 0.7499942779541016, - "learning_rate": 3.0393614621746498e-05, - "loss": 0.5565, - "step": 628 - }, - { - "epoch": 3.2422680412371134, - "grad_norm": 0.7912932634353638, - "learning_rate": 3.023666483780905e-05, - "loss": 0.6234, - "step": 629 - }, - { - "epoch": 3.247422680412371, - "grad_norm": 0.6806544661521912, - "learning_rate": 3.007994550768793e-05, - "loss": 0.5514, - "step": 630 - }, - { - "epoch": 3.252577319587629, - "grad_norm": 0.7280873656272888, - "learning_rate": 2.9923458458836258e-05, - "loss": 0.6465, - "step": 631 - }, - { - "epoch": 3.2577319587628866, - "grad_norm": 0.7153916954994202, - "learning_rate": 2.9767205515998518e-05, - "loss": 0.4821, - "step": 632 - }, - { - "epoch": 3.2628865979381443, - "grad_norm": 0.9249881505966187, - "learning_rate": 2.9611188501189435e-05, - "loss": 0.6613, - "step": 633 - }, - { - "epoch": 3.268041237113402, - "grad_norm": 0.7101225256919861, - "learning_rate": 2.9455409233672592e-05, - "loss": 0.5088, - "step": 634 - }, - { - "epoch": 3.2731958762886597, - "grad_norm": 0.7100300192832947, - "learning_rate": 2.929986952993933e-05, - "loss": 0.6544, - "step": 635 - }, - { - "epoch": 3.2783505154639174, - "grad_norm": 0.8734927177429199, - "learning_rate": 2.9144571203687476e-05, - "loss": 0.5974, - "step": 636 - }, - { - "epoch": 3.283505154639175, - "grad_norm": 0.7215682864189148, - "learning_rate": 2.8989516065800238e-05, - "loss": 0.565, - "step": 637 - }, - { - "epoch": 3.288659793814433, - "grad_norm": 0.7901187539100647, - "learning_rate": 2.8834705924325118e-05, - "loss": 0.6572, - "step": 638 - }, - { - "epoch": 3.2938144329896906, - "grad_norm": 0.6137844920158386, - "learning_rate": 2.8680142584452742e-05, - "loss": 0.5265, - "step": 639 - }, - { - "epoch": 3.2989690721649483, - "grad_norm": 0.7701372504234314, - "learning_rate": 2.8525827848495913e-05, - "loss": 0.68, - "step": 640 - }, - { - "epoch": 3.304123711340206, - "grad_norm": 0.681596040725708, - "learning_rate": 2.83717635158685e-05, - "loss": 0.6126, - "step": 641 - }, - { - "epoch": 3.3092783505154637, - "grad_norm": 0.7642390131950378, - "learning_rate": 2.8217951383064544e-05, - "loss": 0.7179, - "step": 642 - }, - { - "epoch": 3.3144329896907214, - "grad_norm": 0.6880099773406982, - "learning_rate": 2.8064393243637222e-05, - "loss": 0.5507, - "step": 643 - }, - { - "epoch": 3.319587628865979, - "grad_norm": 0.6608781814575195, - "learning_rate": 2.791109088817803e-05, - "loss": 0.4494, - "step": 644 - }, - { - "epoch": 3.3247422680412373, - "grad_norm": 0.685088038444519, - "learning_rate": 2.7758046104295797e-05, - "loss": 0.6755, - "step": 645 - }, - { - "epoch": 3.329896907216495, - "grad_norm": 0.8195577263832092, - "learning_rate": 2.760526067659591e-05, - "loss": 0.5689, - "step": 646 - }, - { - "epoch": 3.3350515463917527, - "grad_norm": 0.801682710647583, - "learning_rate": 2.7452736386659516e-05, - "loss": 0.7178, - "step": 647 - }, - { - "epoch": 3.3402061855670104, - "grad_norm": 0.653991162776947, - "learning_rate": 2.7300475013022663e-05, - "loss": 0.571, - "step": 648 - }, - { - "epoch": 3.345360824742268, - "grad_norm": 0.6898659467697144, - "learning_rate": 2.7148478331155702e-05, - "loss": 0.5027, - "step": 649 - }, - { - "epoch": 3.350515463917526, - "grad_norm": 0.7034061551094055, - "learning_rate": 2.6996748113442394e-05, - "loss": 0.4802, - "step": 650 - }, - { - "epoch": 3.3556701030927836, - "grad_norm": 0.6520366072654724, - "learning_rate": 2.6845286129159464e-05, - "loss": 0.4855, - "step": 651 - }, - { - "epoch": 3.3608247422680413, - "grad_norm": 0.6832021474838257, - "learning_rate": 2.669409414445574e-05, - "loss": 0.5634, - "step": 652 - }, - { - "epoch": 3.365979381443299, - "grad_norm": 0.8851689696311951, - "learning_rate": 2.6543173922331743e-05, - "loss": 0.7024, - "step": 653 - }, - { - "epoch": 3.3711340206185567, - "grad_norm": 0.6908465623855591, - "learning_rate": 2.639252722261908e-05, - "loss": 0.5088, - "step": 654 - }, - { - "epoch": 3.3762886597938144, - "grad_norm": 0.7850653529167175, - "learning_rate": 2.624215580195981e-05, - "loss": 0.5754, - "step": 655 - }, - { - "epoch": 3.381443298969072, - "grad_norm": 0.7114818096160889, - "learning_rate": 2.6092061413786156e-05, - "loss": 0.5193, - "step": 656 - }, - { - "epoch": 3.38659793814433, - "grad_norm": 0.7497783899307251, - "learning_rate": 2.5942245808299886e-05, - "loss": 0.5521, - "step": 657 - }, - { - "epoch": 3.3917525773195876, - "grad_norm": 0.7602093815803528, - "learning_rate": 2.5792710732451997e-05, - "loss": 0.6786, - "step": 658 - }, - { - "epoch": 3.3969072164948453, - "grad_norm": 0.6801767945289612, - "learning_rate": 2.56434579299223e-05, - "loss": 0.6006, - "step": 659 - }, - { - "epoch": 3.402061855670103, - "grad_norm": 0.8054990172386169, - "learning_rate": 2.5494489141099153e-05, - "loss": 0.6026, - "step": 660 - }, - { - "epoch": 3.4072164948453607, - "grad_norm": 0.5966134071350098, - "learning_rate": 2.534580610305909e-05, - "loss": 0.4467, - "step": 661 - }, - { - "epoch": 3.4123711340206184, - "grad_norm": 0.6399171352386475, - "learning_rate": 2.5197410549546595e-05, - "loss": 0.3739, - "step": 662 - }, - { - "epoch": 3.417525773195876, - "grad_norm": 0.6846553683280945, - "learning_rate": 2.5049304210953933e-05, - "loss": 0.5254, - "step": 663 - }, - { - "epoch": 3.422680412371134, - "grad_norm": 0.7457582950592041, - "learning_rate": 2.4901488814300856e-05, - "loss": 0.5307, - "step": 664 - }, - { - "epoch": 3.4278350515463916, - "grad_norm": 0.7888783812522888, - "learning_rate": 2.4753966083214615e-05, - "loss": 0.6, - "step": 665 - }, - { - "epoch": 3.4329896907216497, - "grad_norm": 0.7287904024124146, - "learning_rate": 2.4606737737909697e-05, - "loss": 0.5551, - "step": 666 - }, - { - "epoch": 3.4381443298969074, - "grad_norm": 0.852834939956665, - "learning_rate": 2.4459805495167942e-05, - "loss": 0.651, - "step": 667 - }, - { - "epoch": 3.443298969072165, - "grad_norm": 0.7840898633003235, - "learning_rate": 2.4313171068318357e-05, - "loss": 0.6072, - "step": 668 - }, - { - "epoch": 3.448453608247423, - "grad_norm": 0.8575794696807861, - "learning_rate": 2.4166836167217283e-05, - "loss": 0.6781, - "step": 669 - }, - { - "epoch": 3.4536082474226806, - "grad_norm": 0.6471691131591797, - "learning_rate": 2.4020802498228335e-05, - "loss": 0.5674, - "step": 670 - }, - { - "epoch": 3.4587628865979383, - "grad_norm": 0.7855519652366638, - "learning_rate": 2.3875071764202563e-05, - "loss": 0.5972, - "step": 671 - }, - { - "epoch": 3.463917525773196, - "grad_norm": 0.8715437054634094, - "learning_rate": 2.3729645664458638e-05, - "loss": 0.6441, - "step": 672 - }, - { - "epoch": 3.4690721649484537, - "grad_norm": 0.6959824562072754, - "learning_rate": 2.3584525894762928e-05, - "loss": 0.4333, - "step": 673 - }, - { - "epoch": 3.4742268041237114, - "grad_norm": 0.6880579590797424, - "learning_rate": 2.3439714147309845e-05, - "loss": 0.5053, - "step": 674 - }, - { - "epoch": 3.479381443298969, - "grad_norm": 0.7788439393043518, - "learning_rate": 2.329521211070199e-05, - "loss": 0.5467, - "step": 675 - }, - { - "epoch": 3.484536082474227, - "grad_norm": 0.7182348966598511, - "learning_rate": 2.3151021469930613e-05, - "loss": 0.6482, - "step": 676 - }, - { - "epoch": 3.4896907216494846, - "grad_norm": 0.794500470161438, - "learning_rate": 2.3007143906355767e-05, - "loss": 0.5767, - "step": 677 - }, - { - "epoch": 3.4948453608247423, - "grad_norm": 0.9387006759643555, - "learning_rate": 2.2863581097686925e-05, - "loss": 0.8179, - "step": 678 - }, - { - "epoch": 3.5, - "grad_norm": 0.6931821703910828, - "learning_rate": 2.2720334717963222e-05, - "loss": 0.5789, - "step": 679 - }, - { - "epoch": 3.5, - "eval_loss": 0.7876521944999695, - "eval_runtime": 23.1822, - "eval_samples_per_second": 7.074, - "eval_steps_per_second": 1.769, - "step": 679 - }, - { - "epoch": 3.5051546391752577, - "grad_norm": 0.6769412159919739, - "learning_rate": 2.2577406437534054e-05, - "loss": 0.5329, - "step": 680 - }, - { - "epoch": 3.5103092783505154, - "grad_norm": 0.683117151260376, - "learning_rate": 2.2434797923039598e-05, - "loss": 0.5284, - "step": 681 - }, - { - "epoch": 3.515463917525773, - "grad_norm": 0.790727972984314, - "learning_rate": 2.2292510837391267e-05, - "loss": 0.612, - "step": 682 - }, - { - "epoch": 3.520618556701031, - "grad_norm": 0.7108373641967773, - "learning_rate": 2.2150546839752438e-05, - "loss": 0.6705, - "step": 683 - }, - { - "epoch": 3.5257731958762886, - "grad_norm": 0.8616272807121277, - "learning_rate": 2.2008907585519095e-05, - "loss": 0.5653, - "step": 684 - }, - { - "epoch": 3.5309278350515463, - "grad_norm": 0.8035832047462463, - "learning_rate": 2.186759472630045e-05, - "loss": 0.8008, - "step": 685 - }, - { - "epoch": 3.536082474226804, - "grad_norm": 0.7284449338912964, - "learning_rate": 2.172660990989971e-05, - "loss": 0.56, - "step": 686 - }, - { - "epoch": 3.5412371134020617, - "grad_norm": 0.803911566734314, - "learning_rate": 2.1585954780294947e-05, - "loss": 0.6312, - "step": 687 - }, - { - "epoch": 3.5463917525773194, - "grad_norm": 0.8152459859848022, - "learning_rate": 2.144563097761984e-05, - "loss": 0.5366, - "step": 688 - }, - { - "epoch": 3.551546391752577, - "grad_norm": 0.7566794157028198, - "learning_rate": 2.130564013814453e-05, - "loss": 0.6201, - "step": 689 - }, - { - "epoch": 3.556701030927835, - "grad_norm": 0.6466950178146362, - "learning_rate": 2.1165983894256647e-05, - "loss": 0.5122, - "step": 690 - }, - { - "epoch": 3.5618556701030926, - "grad_norm": 0.8573073148727417, - "learning_rate": 2.102666387444215e-05, - "loss": 0.6787, - "step": 691 - }, - { - "epoch": 3.5670103092783503, - "grad_norm": 0.6495526432991028, - "learning_rate": 2.0887681703266453e-05, - "loss": 0.4294, - "step": 692 - }, - { - "epoch": 3.572164948453608, - "grad_norm": 0.6471603512763977, - "learning_rate": 2.0749039001355375e-05, - "loss": 0.5999, - "step": 693 - }, - { - "epoch": 3.5773195876288657, - "grad_norm": 0.7354085445404053, - "learning_rate": 2.061073738537635e-05, - "loss": 0.5685, - "step": 694 - }, - { - "epoch": 3.582474226804124, - "grad_norm": 0.7679943442344666, - "learning_rate": 2.0472778468019454e-05, - "loss": 0.5547, - "step": 695 - }, - { - "epoch": 3.5876288659793816, - "grad_norm": 0.6642536520957947, - "learning_rate": 2.0335163857978744e-05, - "loss": 0.4903, - "step": 696 - }, - { - "epoch": 3.5927835051546393, - "grad_norm": 0.927869439125061, - "learning_rate": 2.019789515993336e-05, - "loss": 0.6191, - "step": 697 - }, - { - "epoch": 3.597938144329897, - "grad_norm": 0.718420684337616, - "learning_rate": 2.0060973974528874e-05, - "loss": 0.4906, - "step": 698 - }, - { - "epoch": 3.6030927835051547, - "grad_norm": 0.7137137651443481, - "learning_rate": 1.992440189835869e-05, - "loss": 0.7192, - "step": 699 - }, - { - "epoch": 3.6082474226804124, - "grad_norm": 0.8115518093109131, - "learning_rate": 1.9788180523945277e-05, - "loss": 0.5906, - "step": 700 - }, - { - "epoch": 3.61340206185567, - "grad_norm": 0.8083131313323975, - "learning_rate": 1.9652311439721764e-05, - "loss": 0.6711, - "step": 701 - }, - { - "epoch": 3.618556701030928, - "grad_norm": 0.6981199383735657, - "learning_rate": 1.9516796230013272e-05, - "loss": 0.598, - "step": 702 - }, - { - "epoch": 3.6237113402061856, - "grad_norm": 0.6912441253662109, - "learning_rate": 1.9381636475018577e-05, - "loss": 0.5254, - "step": 703 - }, - { - "epoch": 3.6288659793814433, - "grad_norm": 0.6640885472297668, - "learning_rate": 1.9246833750791526e-05, - "loss": 0.4806, - "step": 704 - }, - { - "epoch": 3.634020618556701, - "grad_norm": 0.6553522944450378, - "learning_rate": 1.9112389629222823e-05, - "loss": 0.6355, - "step": 705 - }, - { - "epoch": 3.6391752577319587, - "grad_norm": 0.8154643177986145, - "learning_rate": 1.8978305678021595e-05, - "loss": 0.6503, - "step": 706 - }, - { - "epoch": 3.6443298969072164, - "grad_norm": 0.6225295066833496, - "learning_rate": 1.884458346069713e-05, - "loss": 0.5938, - "step": 707 - }, - { - "epoch": 3.649484536082474, - "grad_norm": 0.8098758459091187, - "learning_rate": 1.8711224536540678e-05, - "loss": 0.5258, - "step": 708 - }, - { - "epoch": 3.654639175257732, - "grad_norm": 0.7377306818962097, - "learning_rate": 1.857823046060722e-05, - "loss": 0.6682, - "step": 709 - }, - { - "epoch": 3.6597938144329896, - "grad_norm": 0.7113179564476013, - "learning_rate": 1.8445602783697374e-05, - "loss": 0.5529, - "step": 710 - }, - { - "epoch": 3.6649484536082473, - "grad_norm": 0.7635980248451233, - "learning_rate": 1.831334305233928e-05, - "loss": 0.5538, - "step": 711 - }, - { - "epoch": 3.670103092783505, - "grad_norm": 0.6800134181976318, - "learning_rate": 1.8181452808770637e-05, - "loss": 0.5837, - "step": 712 - }, - { - "epoch": 3.675257731958763, - "grad_norm": 0.6108891367912292, - "learning_rate": 1.804993359092059e-05, - "loss": 0.5765, - "step": 713 - }, - { - "epoch": 3.680412371134021, - "grad_norm": 0.6556738018989563, - "learning_rate": 1.7918786932391944e-05, - "loss": 0.5426, - "step": 714 - }, - { - "epoch": 3.6855670103092786, - "grad_norm": 0.7539121508598328, - "learning_rate": 1.778801436244319e-05, - "loss": 0.6723, - "step": 715 - }, - { - "epoch": 3.6907216494845363, - "grad_norm": 0.7283245325088501, - "learning_rate": 1.765761740597065e-05, - "loss": 0.6965, - "step": 716 - }, - { - "epoch": 3.695876288659794, - "grad_norm": 0.7831310033798218, - "learning_rate": 1.7527597583490822e-05, - "loss": 0.6255, - "step": 717 - }, - { - "epoch": 3.7010309278350517, - "grad_norm": 0.5926414132118225, - "learning_rate": 1.739795641112248e-05, - "loss": 0.4947, - "step": 718 - }, - { - "epoch": 3.7061855670103094, - "grad_norm": 0.8325262069702148, - "learning_rate": 1.726869540056915e-05, - "loss": 0.6241, - "step": 719 - }, - { - "epoch": 3.711340206185567, - "grad_norm": 0.69277423620224, - "learning_rate": 1.713981605910137e-05, - "loss": 0.6247, - "step": 720 - }, - { - "epoch": 3.716494845360825, - "grad_norm": 0.7472510933876038, - "learning_rate": 1.70113198895392e-05, - "loss": 0.525, - "step": 721 - }, - { - "epoch": 3.7216494845360826, - "grad_norm": 0.8484485149383545, - "learning_rate": 1.6883208390234628e-05, - "loss": 0.5437, - "step": 722 - }, - { - "epoch": 3.7268041237113403, - "grad_norm": 0.6205007433891296, - "learning_rate": 1.6755483055054105e-05, - "loss": 0.4387, - "step": 723 - }, - { - "epoch": 3.731958762886598, - "grad_norm": 0.7974679470062256, - "learning_rate": 1.662814537336122e-05, - "loss": 0.5756, - "step": 724 - }, - { - "epoch": 3.7371134020618557, - "grad_norm": 0.9275572896003723, - "learning_rate": 1.650119682999918e-05, - "loss": 0.7086, - "step": 725 - }, - { - "epoch": 3.7422680412371134, - "grad_norm": 0.5912320017814636, - "learning_rate": 1.6374638905273643e-05, - "loss": 0.4894, - "step": 726 - }, - { - "epoch": 3.747422680412371, - "grad_norm": 0.8025346994400024, - "learning_rate": 1.624847307493534e-05, - "loss": 0.5677, - "step": 727 - }, - { - "epoch": 3.752577319587629, - "grad_norm": 0.8356587886810303, - "learning_rate": 1.6122700810162966e-05, - "loss": 0.6478, - "step": 728 - }, - { - "epoch": 3.7577319587628866, - "grad_norm": 0.7288328409194946, - "learning_rate": 1.5997323577545915e-05, - "loss": 0.6042, - "step": 729 - }, - { - "epoch": 3.7628865979381443, - "grad_norm": 0.7492033839225769, - "learning_rate": 1.5872342839067306e-05, - "loss": 0.5439, - "step": 730 - }, - { - "epoch": 3.768041237113402, - "grad_norm": 0.7219738364219666, - "learning_rate": 1.5747760052086803e-05, - "loss": 0.7247, - "step": 731 - }, - { - "epoch": 3.7731958762886597, - "grad_norm": 0.7529350519180298, - "learning_rate": 1.5623576669323743e-05, - "loss": 0.595, - "step": 732 - }, - { - "epoch": 3.7783505154639174, - "grad_norm": 0.613925576210022, - "learning_rate": 1.5499794138840122e-05, - "loss": 0.5348, - "step": 733 - }, - { - "epoch": 3.783505154639175, - "grad_norm": 0.7281404733657837, - "learning_rate": 1.5376413904023722e-05, - "loss": 0.431, - "step": 734 - }, - { - "epoch": 3.788659793814433, - "grad_norm": 0.6786961555480957, - "learning_rate": 1.525343740357128e-05, - "loss": 0.5334, - "step": 735 - }, - { - "epoch": 3.7938144329896906, - "grad_norm": 0.7997889518737793, - "learning_rate": 1.5130866071471717e-05, - "loss": 0.5712, - "step": 736 - }, - { - "epoch": 3.7989690721649483, - "grad_norm": 0.8244373798370361, - "learning_rate": 1.500870133698945e-05, - "loss": 0.6237, - "step": 737 - }, - { - "epoch": 3.804123711340206, - "grad_norm": 0.7046230435371399, - "learning_rate": 1.4886944624647647e-05, - "loss": 0.5052, - "step": 738 - }, - { - "epoch": 3.8092783505154637, - "grad_norm": 0.6828339695930481, - "learning_rate": 1.4765597354211713e-05, - "loss": 0.4986, - "step": 739 - }, - { - "epoch": 3.8144329896907214, - "grad_norm": 0.7806636691093445, - "learning_rate": 1.4644660940672627e-05, - "loss": 0.5256, - "step": 740 - }, - { - "epoch": 3.819587628865979, - "grad_norm": 0.7063342928886414, - "learning_rate": 1.4524136794230547e-05, - "loss": 0.6, - "step": 741 - }, - { - "epoch": 3.824742268041237, - "grad_norm": 0.6564049124717712, - "learning_rate": 1.4404026320278318e-05, - "loss": 0.4549, - "step": 742 - }, - { - "epoch": 3.829896907216495, - "grad_norm": 0.737053394317627, - "learning_rate": 1.4284330919385036e-05, - "loss": 0.5977, - "step": 743 - }, - { - "epoch": 3.8350515463917527, - "grad_norm": 0.8713696002960205, - "learning_rate": 1.4165051987279831e-05, - "loss": 0.7163, - "step": 744 - }, - { - "epoch": 3.8402061855670104, - "grad_norm": 0.8080871105194092, - "learning_rate": 1.404619091483546e-05, - "loss": 0.728, - "step": 745 - }, - { - "epoch": 3.845360824742268, - "grad_norm": 0.7074190378189087, - "learning_rate": 1.3927749088052217e-05, - "loss": 0.5355, - "step": 746 - }, - { - "epoch": 3.850515463917526, - "grad_norm": 0.887178897857666, - "learning_rate": 1.3809727888041668e-05, - "loss": 0.6736, - "step": 747 - }, - { - "epoch": 3.8556701030927836, - "grad_norm": 0.7117917537689209, - "learning_rate": 1.3692128691010592e-05, - "loss": 0.5781, - "step": 748 - }, - { - "epoch": 3.8608247422680413, - "grad_norm": 1.025062918663025, - "learning_rate": 1.3574952868244922e-05, - "loss": 0.6942, - "step": 749 - }, - { - "epoch": 3.865979381443299, - "grad_norm": 0.8618969917297363, - "learning_rate": 1.3458201786093794e-05, - "loss": 0.6821, - "step": 750 - }, - { - "epoch": 3.8711340206185567, - "grad_norm": 0.7979894876480103, - "learning_rate": 1.334187680595358e-05, - "loss": 0.5799, - "step": 751 - }, - { - "epoch": 3.8762886597938144, - "grad_norm": 0.6518426537513733, - "learning_rate": 1.3225979284251954e-05, - "loss": 0.5948, - "step": 752 - }, - { - "epoch": 3.881443298969072, - "grad_norm": 0.8365257382392883, - "learning_rate": 1.3110510572432221e-05, - "loss": 0.7303, - "step": 753 - }, - { - "epoch": 3.88659793814433, - "grad_norm": 0.7343977093696594, - "learning_rate": 1.2995472016937404e-05, - "loss": 0.5519, - "step": 754 - }, - { - "epoch": 3.8917525773195876, - "grad_norm": 0.7648212909698486, - "learning_rate": 1.2880864959194665e-05, - "loss": 0.6492, - "step": 755 - }, - { - "epoch": 3.8969072164948453, - "grad_norm": 0.7290052771568298, - "learning_rate": 1.2766690735599568e-05, - "loss": 0.5893, - "step": 756 - }, - { - "epoch": 3.902061855670103, - "grad_norm": 0.5812563300132751, - "learning_rate": 1.2652950677500574e-05, - "loss": 0.5686, - "step": 757 - }, - { - "epoch": 3.9072164948453607, - "grad_norm": 0.6635538339614868, - "learning_rate": 1.253964611118345e-05, - "loss": 0.4879, - "step": 758 - }, - { - "epoch": 3.9123711340206184, - "grad_norm": 0.6084232926368713, - "learning_rate": 1.2426778357855873e-05, - "loss": 0.5395, - "step": 759 - }, - { - "epoch": 3.917525773195876, - "grad_norm": 0.7855112552642822, - "learning_rate": 1.2314348733631959e-05, - "loss": 0.6793, - "step": 760 - }, - { - "epoch": 3.9226804123711343, - "grad_norm": 0.7807559370994568, - "learning_rate": 1.2202358549516923e-05, - "loss": 0.6824, - "step": 761 - }, - { - "epoch": 3.927835051546392, - "grad_norm": 0.7157235741615295, - "learning_rate": 1.209080911139187e-05, - "loss": 0.5723, - "step": 762 - }, - { - "epoch": 3.9329896907216497, - "grad_norm": 0.7934853434562683, - "learning_rate": 1.1979701719998453e-05, - "loss": 0.6191, - "step": 763 - }, - { - "epoch": 3.9381443298969074, - "grad_norm": 0.7907642722129822, - "learning_rate": 1.1869037670923815e-05, - "loss": 0.7329, - "step": 764 - }, - { - "epoch": 3.943298969072165, - "grad_norm": 0.7775223851203918, - "learning_rate": 1.1758818254585369e-05, - "loss": 0.6165, - "step": 765 - }, - { - "epoch": 3.948453608247423, - "grad_norm": 0.7809783220291138, - "learning_rate": 1.164904475621587e-05, - "loss": 0.681, - "step": 766 - }, - { - "epoch": 3.9536082474226806, - "grad_norm": 0.8733285069465637, - "learning_rate": 1.1539718455848309e-05, - "loss": 0.4686, - "step": 767 - }, - { - "epoch": 3.9587628865979383, - "grad_norm": 0.7542945146560669, - "learning_rate": 1.1430840628301093e-05, - "loss": 0.5553, - "step": 768 - }, - { - "epoch": 3.963917525773196, - "grad_norm": 0.790187656879425, - "learning_rate": 1.1322412543163135e-05, - "loss": 0.6635, - "step": 769 - }, - { - "epoch": 3.9690721649484537, - "grad_norm": 0.721284806728363, - "learning_rate": 1.1214435464779006e-05, - "loss": 0.6948, - "step": 770 - }, - { - "epoch": 3.9742268041237114, - "grad_norm": 0.6923427581787109, - "learning_rate": 1.1106910652234276e-05, - "loss": 0.5599, - "step": 771 - }, - { - "epoch": 3.979381443298969, - "grad_norm": 0.7891685962677002, - "learning_rate": 1.099983935934077e-05, - "loss": 0.6763, - "step": 772 - }, - { - "epoch": 3.984536082474227, - "grad_norm": 0.8703778386116028, - "learning_rate": 1.089322283462197e-05, - "loss": 0.6884, - "step": 773 - }, - { - "epoch": 3.9896907216494846, - "grad_norm": 0.629092812538147, - "learning_rate": 1.0787062321298442e-05, - "loss": 0.5368, - "step": 774 - }, - { - "epoch": 3.9948453608247423, - "grad_norm": 0.7654589414596558, - "learning_rate": 1.0681359057273388e-05, - "loss": 0.6477, - "step": 775 - }, - { - "epoch": 4.0, - "grad_norm": 1.1668832302093506, - "learning_rate": 1.0576114275118131e-05, - "loss": 0.6873, - "step": 776 - }, - { - "epoch": 4.0, - "eval_loss": 0.7821245193481445, - "eval_runtime": 23.21, - "eval_samples_per_second": 7.066, - "eval_steps_per_second": 1.766, - "step": 776 - }, - { - "epoch": 4.005154639175258, - "grad_norm": 0.6904922127723694, - "learning_rate": 1.0471329202057823e-05, - "loss": 0.4518, - "step": 777 - }, - { - "epoch": 4.010309278350515, - "grad_norm": 0.5978090763092041, - "learning_rate": 1.0367005059957096e-05, - "loss": 0.4717, - "step": 778 - }, - { - "epoch": 4.015463917525773, - "grad_norm": 0.800383985042572, - "learning_rate": 1.0263143065305769e-05, - "loss": 0.4304, - "step": 779 - }, - { - "epoch": 4.020618556701031, - "grad_norm": 0.693153440952301, - "learning_rate": 1.0159744429204777e-05, - "loss": 0.5344, - "step": 780 - }, - { - "epoch": 4.025773195876289, - "grad_norm": 0.6497305035591125, - "learning_rate": 1.005681035735192e-05, - "loss": 0.5306, - "step": 781 - }, - { - "epoch": 4.030927835051546, - "grad_norm": 0.7015767097473145, - "learning_rate": 9.954342050027921e-06, - "loss": 0.5403, - "step": 782 - }, - { - "epoch": 4.036082474226804, - "grad_norm": 0.6772336363792419, - "learning_rate": 9.852340702082318e-06, - "loss": 0.4334, - "step": 783 - }, - { - "epoch": 4.041237113402062, - "grad_norm": 0.6541298627853394, - "learning_rate": 9.750807502919652e-06, - "loss": 0.5742, - "step": 784 - }, - { - "epoch": 4.046391752577319, - "grad_norm": 0.7539450526237488, - "learning_rate": 9.64974363648548e-06, - "loss": 0.7293, - "step": 785 - }, - { - "epoch": 4.051546391752577, - "grad_norm": 0.783802330493927, - "learning_rate": 9.549150281252633e-06, - "loss": 0.5296, - "step": 786 - }, - { - "epoch": 4.056701030927835, - "grad_norm": 0.6802383661270142, - "learning_rate": 9.449028610207494e-06, - "loss": 0.5059, - "step": 787 - }, - { - "epoch": 4.061855670103093, - "grad_norm": 0.8454770445823669, - "learning_rate": 9.349379790836243e-06, - "loss": 0.6427, - "step": 788 - }, - { - "epoch": 4.06701030927835, - "grad_norm": 0.7464700937271118, - "learning_rate": 9.25020498511135e-06, - "loss": 0.6154, - "step": 789 - }, - { - "epoch": 4.072164948453608, - "grad_norm": 0.7701639533042908, - "learning_rate": 9.151505349477902e-06, - "loss": 0.5741, - "step": 790 - }, - { - "epoch": 4.077319587628866, - "grad_norm": 0.6774284243583679, - "learning_rate": 9.053282034840238e-06, - "loss": 0.5858, - "step": 791 - }, - { - "epoch": 4.082474226804123, - "grad_norm": 0.637546718120575, - "learning_rate": 8.955536186548425e-06, - "loss": 0.5001, - "step": 792 - }, - { - "epoch": 4.087628865979381, - "grad_norm": 0.7409276366233826, - "learning_rate": 8.858268944384995e-06, - "loss": 0.5082, - "step": 793 - }, - { - "epoch": 4.092783505154639, - "grad_norm": 0.8038662075996399, - "learning_rate": 8.761481442551573e-06, - "loss": 0.6967, - "step": 794 - }, - { - "epoch": 4.097938144329897, - "grad_norm": 0.7973031997680664, - "learning_rate": 8.665174809655708e-06, - "loss": 0.6006, - "step": 795 - }, - { - "epoch": 4.103092783505154, - "grad_norm": 0.7798672318458557, - "learning_rate": 8.569350168697704e-06, - "loss": 0.6076, - "step": 796 - }, - { - "epoch": 4.108247422680412, - "grad_norm": 0.7757202982902527, - "learning_rate": 8.474008637057478e-06, - "loss": 0.5881, - "step": 797 - }, - { - "epoch": 4.11340206185567, - "grad_norm": 0.8978819251060486, - "learning_rate": 8.379151326481587e-06, - "loss": 0.5522, - "step": 798 - }, - { - "epoch": 4.118556701030927, - "grad_norm": 0.8775174617767334, - "learning_rate": 8.284779343070265e-06, - "loss": 0.5599, - "step": 799 - }, - { - "epoch": 4.123711340206185, - "grad_norm": 0.6384215950965881, - "learning_rate": 8.19089378726447e-06, - "loss": 0.466, - "step": 800 - }, - { - "epoch": 4.128865979381443, - "grad_norm": 0.7570362091064453, - "learning_rate": 8.097495753833078e-06, - "loss": 0.6081, - "step": 801 - }, - { - "epoch": 4.134020618556701, - "grad_norm": 0.8209741711616516, - "learning_rate": 8.004586331860175e-06, - "loss": 0.6016, - "step": 802 - }, - { - "epoch": 4.139175257731959, - "grad_norm": 0.7955187559127808, - "learning_rate": 7.91216660473228e-06, - "loss": 0.5968, - "step": 803 - }, - { - "epoch": 4.144329896907217, - "grad_norm": 0.7099952101707458, - "learning_rate": 7.820237650125712e-06, - "loss": 0.6445, - "step": 804 - }, - { - "epoch": 4.149484536082475, - "grad_norm": 0.923795759677887, - "learning_rate": 7.728800539994113e-06, - "loss": 0.7184, - "step": 805 - }, - { - "epoch": 4.154639175257732, - "grad_norm": 0.7572796940803528, - "learning_rate": 7.637856340555822e-06, - "loss": 0.5521, - "step": 806 - }, - { - "epoch": 4.15979381443299, - "grad_norm": 0.823486864566803, - "learning_rate": 7.547406112281557e-06, - "loss": 0.5439, - "step": 807 - }, - { - "epoch": 4.164948453608248, - "grad_norm": 0.7394459247589111, - "learning_rate": 7.457450909881969e-06, - "loss": 0.499, - "step": 808 - }, - { - "epoch": 4.170103092783505, - "grad_norm": 0.8787947297096252, - "learning_rate": 7.367991782295391e-06, - "loss": 0.4774, - "step": 809 - }, - { - "epoch": 4.175257731958763, - "grad_norm": 0.931903600692749, - "learning_rate": 7.2790297726755716e-06, - "loss": 0.4642, - "step": 810 - }, - { - "epoch": 4.180412371134021, - "grad_norm": 0.8398101329803467, - "learning_rate": 7.190565918379549e-06, - "loss": 0.636, - "step": 811 - }, - { - "epoch": 4.185567010309279, - "grad_norm": 0.8237578868865967, - "learning_rate": 7.1026012509555265e-06, - "loss": 0.5771, - "step": 812 - }, - { - "epoch": 4.190721649484536, - "grad_norm": 0.8667364716529846, - "learning_rate": 7.015136796130828e-06, - "loss": 0.6154, - "step": 813 - }, - { - "epoch": 4.195876288659794, - "grad_norm": 0.803131103515625, - "learning_rate": 6.928173573800006e-06, - "loss": 0.5299, - "step": 814 - }, - { - "epoch": 4.201030927835052, - "grad_norm": 0.8482971787452698, - "learning_rate": 6.8417125980128675e-06, - "loss": 0.6238, - "step": 815 - }, - { - "epoch": 4.206185567010309, - "grad_norm": 0.7783701419830322, - "learning_rate": 6.755754876962711e-06, - "loss": 0.5216, - "step": 816 - }, - { - "epoch": 4.211340206185567, - "grad_norm": 0.6552199125289917, - "learning_rate": 6.670301412974511e-06, - "loss": 0.4832, - "step": 817 - }, - { - "epoch": 4.216494845360825, - "grad_norm": 0.8719028830528259, - "learning_rate": 6.585353202493322e-06, - "loss": 0.575, - "step": 818 - }, - { - "epoch": 4.221649484536083, - "grad_norm": 0.7946240305900574, - "learning_rate": 6.500911236072532e-06, - "loss": 0.648, - "step": 819 - }, - { - "epoch": 4.22680412371134, - "grad_norm": 0.7797208428382874, - "learning_rate": 6.416976498362432e-06, - "loss": 0.4849, - "step": 820 } ], "logging_steps": 1, @@ -5838,7 +69,7 @@ "attributes": {} } }, - "total_flos": 3.171226860461752e+17, + "total_flos": 1935445139128320.0, "train_batch_size": 4, "trial_name": null, "trial_params": null