diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3854739652870494, + "eval_steps": 71, + "global_step": 3276, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 11.078738059471418, + "learning_rate": 1e-08, + "loss": 0.9787, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 0.9907150864601135, + "eval_runtime": 6914.8687, + "eval_samples_per_second": 41.995, + "eval_steps_per_second": 2.1, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 11.611292476924348, + "learning_rate": 2e-08, + "loss": 1.0125, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 11.680888573298947, + "learning_rate": 3.0000000000000004e-08, + "loss": 0.9921, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 11.223600490850279, + "learning_rate": 4e-08, + "loss": 0.9905, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 13.16229268115578, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.0453, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 11.866725221480236, + "learning_rate": 6.000000000000001e-08, + "loss": 1.0109, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 11.384069184647979, + "learning_rate": 7e-08, + "loss": 0.9907, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 11.950365392573932, + "learning_rate": 8e-08, + "loss": 1.0039, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 11.269911995803245, + "learning_rate": 9e-08, + "loss": 0.992, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 11.784110104038653, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9909, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 11.530150291016858, + "learning_rate": 1.1e-07, + "loss": 0.9717, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 11.46072146393034, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.9813, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 10.983926810290162, + "learning_rate": 1.3e-07, + "loss": 1.0025, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 11.258765973186346, + "learning_rate": 1.4e-07, + "loss": 0.9927, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 11.213060806605753, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.9729, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 11.271084778147063, + "learning_rate": 1.6e-07, + "loss": 0.9861, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 11.40785228773617, + "learning_rate": 1.7000000000000001e-07, + "loss": 1.004, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 11.071262500540902, + "learning_rate": 1.8e-07, + "loss": 0.999, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 10.939577936304328, + "learning_rate": 1.9e-07, + "loss": 0.9951, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 10.543379116586797, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9573, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 10.648729634073065, + "learning_rate": 2.1000000000000003e-07, + "loss": 0.95, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 9.442791775540536, + "learning_rate": 2.2e-07, + "loss": 0.9315, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 9.458344300581993, + "learning_rate": 2.3000000000000002e-07, + "loss": 0.942, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 9.249373081558684, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.9065, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 9.104568818785538, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9251, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 8.776787693093286, + "learning_rate": 2.6e-07, + "loss": 0.9381, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 8.809034639899487, + "learning_rate": 2.7e-07, + "loss": 0.8958, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 8.588186330852137, + "learning_rate": 2.8e-07, + "loss": 0.9345, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 7.054903161127684, + "learning_rate": 2.9000000000000003e-07, + "loss": 0.926, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 5.409541396614469, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.8518, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 5.079371789402381, + "learning_rate": 3.1000000000000005e-07, + "loss": 0.8247, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 5.285936528164377, + "learning_rate": 3.2e-07, + "loss": 0.8839, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 5.0378580616482385, + "learning_rate": 3.3e-07, + "loss": 0.8482, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 4.8906753179918745, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.8848, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 4.673664611548944, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.8514, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 4.672079486287008, + "learning_rate": 3.6e-07, + "loss": 0.8531, + "step": 36 + }, + { + "epoch": 0.02, + "grad_norm": 4.534392734183333, + "learning_rate": 3.7e-07, + "loss": 0.8408, + "step": 37 + }, + { + "epoch": 0.02, + "grad_norm": 4.426822133821268, + "learning_rate": 3.8e-07, + "loss": 0.8421, + "step": 38 + }, + { + "epoch": 0.02, + "grad_norm": 4.050598149897309, + "learning_rate": 3.9e-07, + "loss": 0.8143, + "step": 39 + }, + { + "epoch": 0.02, + "grad_norm": 3.3681521726267727, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.7697, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 3.168148018142974, + "learning_rate": 4.1000000000000004e-07, + "loss": 0.7522, + "step": 41 + }, + { + "epoch": 0.02, + "grad_norm": 2.988199405686652, + "learning_rate": 4.2000000000000006e-07, + "loss": 0.7721, + "step": 42 + }, + { + "epoch": 0.02, + "grad_norm": 2.7711442613950577, + "learning_rate": 4.3e-07, + "loss": 0.7777, + "step": 43 + }, + { + "epoch": 0.02, + "grad_norm": 2.3134967643199253, + "learning_rate": 4.4e-07, + "loss": 0.7478, + "step": 44 + }, + { + "epoch": 0.02, + "grad_norm": 2.102821646941925, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.7614, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 2.806705011993839, + "learning_rate": 4.6000000000000004e-07, + "loss": 0.7344, + "step": 46 + }, + { + "epoch": 0.02, + "grad_norm": 2.07648392275885, + "learning_rate": 4.7000000000000005e-07, + "loss": 0.7469, + "step": 47 + }, + { + "epoch": 0.02, + "grad_norm": 1.9203144322745775, + "learning_rate": 4.800000000000001e-07, + "loss": 0.7611, + "step": 48 + }, + { + "epoch": 0.02, + "grad_norm": 1.8850095224554737, + "learning_rate": 4.900000000000001e-07, + "loss": 0.7892, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 1.8950995227228287, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7249, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 2.220629645536677, + "learning_rate": 5.1e-07, + "loss": 0.7286, + "step": 51 + }, + { + "epoch": 0.02, + "grad_norm": 1.6217121963418404, + "learning_rate": 5.2e-07, + "loss": 0.72, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 1.7536100967521506, + "learning_rate": 5.3e-07, + "loss": 0.7303, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 1.8632645115840507, + "learning_rate": 5.4e-07, + "loss": 0.7041, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 1.6438907360560255, + "learning_rate": 5.5e-07, + "loss": 0.6778, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.4220708453129016, + "learning_rate": 5.6e-07, + "loss": 0.6928, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 1.3065263284110715, + "learning_rate": 5.7e-07, + "loss": 0.6705, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 1.160277550171688, + "learning_rate": 5.800000000000001e-07, + "loss": 0.714, + "step": 58 + }, + { + "epoch": 0.03, + "grad_norm": 1.0619267911744315, + "learning_rate": 5.900000000000001e-07, + "loss": 0.6578, + "step": 59 + }, + { + "epoch": 0.03, + "grad_norm": 1.0978405000752092, + "learning_rate": 6.000000000000001e-07, + "loss": 0.6827, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 1.0203948283683908, + "learning_rate": 6.100000000000001e-07, + "loss": 0.6945, + "step": 61 + }, + { + "epoch": 0.03, + "grad_norm": 1.0376432923241783, + "learning_rate": 6.200000000000001e-07, + "loss": 0.6785, + "step": 62 + }, + { + "epoch": 0.03, + "grad_norm": 1.1199898784937596, + "learning_rate": 6.3e-07, + "loss": 0.6879, + "step": 63 + }, + { + "epoch": 0.03, + "grad_norm": 0.9323677710694938, + "learning_rate": 6.4e-07, + "loss": 0.6958, + "step": 64 + }, + { + "epoch": 0.03, + "grad_norm": 0.9242075211877638, + "learning_rate": 6.5e-07, + "loss": 0.6632, + "step": 65 + }, + { + "epoch": 0.03, + "grad_norm": 0.9436047323924038, + "learning_rate": 6.6e-07, + "loss": 0.679, + "step": 66 + }, + { + "epoch": 0.03, + "grad_norm": 0.8822704045541839, + "learning_rate": 6.7e-07, + "loss": 0.6936, + "step": 67 + }, + { + "epoch": 0.03, + "grad_norm": 0.8869148114540986, + "learning_rate": 6.800000000000001e-07, + "loss": 0.6598, + "step": 68 + }, + { + "epoch": 0.03, + "grad_norm": 0.9122382330757471, + "learning_rate": 6.900000000000001e-07, + "loss": 0.6733, + "step": 69 + }, + { + "epoch": 0.03, + "grad_norm": 0.9213292233226152, + "learning_rate": 7.000000000000001e-07, + "loss": 0.6868, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.7942384246431622, + "learning_rate": 7.1e-07, + "loss": 0.6438, + "step": 71 + }, + { + "epoch": 0.03, + "eval_loss": 0.6540535092353821, + "eval_runtime": 6906.1355, + "eval_samples_per_second": 42.048, + "eval_steps_per_second": 2.102, + "step": 71 + }, + { + "epoch": 0.03, + "grad_norm": 0.8205470446647939, + "learning_rate": 7.2e-07, + "loss": 0.6462, + "step": 72 + }, + { + "epoch": 0.03, + "grad_norm": 0.7946802028186414, + "learning_rate": 7.3e-07, + "loss": 0.6806, + "step": 73 + }, + { + "epoch": 0.03, + "grad_norm": 0.8828845496654094, + "learning_rate": 7.4e-07, + "loss": 0.6664, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 0.9383931456659222, + "learning_rate": 7.5e-07, + "loss": 0.6137, + "step": 75 + }, + { + "epoch": 0.03, + "grad_norm": 0.8776340800086451, + "learning_rate": 7.6e-07, + "loss": 0.6453, + "step": 76 + }, + { + "epoch": 0.03, + "grad_norm": 0.7928098345984148, + "learning_rate": 7.7e-07, + "loss": 0.6245, + "step": 77 + }, + { + "epoch": 0.03, + "grad_norm": 0.8548051569752384, + "learning_rate": 7.8e-07, + "loss": 0.6588, + "step": 78 + }, + { + "epoch": 0.03, + "grad_norm": 0.7989762163409693, + "learning_rate": 7.900000000000001e-07, + "loss": 0.6225, + "step": 79 + }, + { + "epoch": 0.03, + "grad_norm": 0.7487196972831345, + "learning_rate": 8.000000000000001e-07, + "loss": 0.6306, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.7503963380748643, + "learning_rate": 8.100000000000001e-07, + "loss": 0.642, + "step": 81 + }, + { + "epoch": 0.04, + "grad_norm": 0.7715020340302331, + "learning_rate": 8.200000000000001e-07, + "loss": 0.637, + "step": 82 + }, + { + "epoch": 0.04, + "grad_norm": 0.7461624365856538, + "learning_rate": 8.300000000000001e-07, + "loss": 0.6456, + "step": 83 + }, + { + "epoch": 0.04, + "grad_norm": 0.7873038726512404, + "learning_rate": 8.400000000000001e-07, + "loss": 0.6301, + "step": 84 + }, + { + "epoch": 0.04, + "grad_norm": 0.8371533025271144, + "learning_rate": 8.500000000000001e-07, + "loss": 0.6294, + "step": 85 + }, + { + "epoch": 0.04, + "grad_norm": 0.7722902390890481, + "learning_rate": 8.6e-07, + "loss": 0.6285, + "step": 86 + }, + { + "epoch": 0.04, + "grad_norm": 0.7349010275266745, + "learning_rate": 8.7e-07, + "loss": 0.656, + "step": 87 + }, + { + "epoch": 0.04, + "grad_norm": 0.8710625748918476, + "learning_rate": 8.8e-07, + "loss": 0.6156, + "step": 88 + }, + { + "epoch": 0.04, + "grad_norm": 0.7833176365407506, + "learning_rate": 8.900000000000001e-07, + "loss": 0.6329, + "step": 89 + }, + { + "epoch": 0.04, + "grad_norm": 0.7956237199904019, + "learning_rate": 9.000000000000001e-07, + "loss": 0.604, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.7663333987942103, + "learning_rate": 9.100000000000001e-07, + "loss": 0.6132, + "step": 91 + }, + { + "epoch": 0.04, + "grad_norm": 0.7350037692506446, + "learning_rate": 9.200000000000001e-07, + "loss": 0.5959, + "step": 92 + }, + { + "epoch": 0.04, + "grad_norm": 0.7095481904743137, + "learning_rate": 9.300000000000001e-07, + "loss": 0.581, + "step": 93 + }, + { + "epoch": 0.04, + "grad_norm": 0.7669373034765433, + "learning_rate": 9.400000000000001e-07, + "loss": 0.6059, + "step": 94 + }, + { + "epoch": 0.04, + "grad_norm": 0.7042131948418571, + "learning_rate": 9.500000000000001e-07, + "loss": 0.6159, + "step": 95 + }, + { + "epoch": 0.04, + "grad_norm": 0.7415353323840956, + "learning_rate": 9.600000000000001e-07, + "loss": 0.6046, + "step": 96 + }, + { + "epoch": 0.04, + "grad_norm": 0.7038043696743754, + "learning_rate": 9.7e-07, + "loss": 0.6313, + "step": 97 + }, + { + "epoch": 0.04, + "grad_norm": 0.7144659980129581, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6091, + "step": 98 + }, + { + "epoch": 0.04, + "grad_norm": 0.7198651744206456, + "learning_rate": 9.9e-07, + "loss": 0.6296, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 0.7518090561897931, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6172, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 0.7176227042870604, + "learning_rate": 1.01e-06, + "loss": 0.5913, + "step": 101 + }, + { + "epoch": 0.04, + "grad_norm": 0.7227768155981023, + "learning_rate": 1.02e-06, + "loss": 0.592, + "step": 102 + }, + { + "epoch": 0.04, + "grad_norm": 0.7111683414742458, + "learning_rate": 1.03e-06, + "loss": 0.625, + "step": 103 + }, + { + "epoch": 0.04, + "grad_norm": 0.6992806259847806, + "learning_rate": 1.04e-06, + "loss": 0.5955, + "step": 104 + }, + { + "epoch": 0.04, + "grad_norm": 0.673513907571938, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.6244, + "step": 105 + }, + { + "epoch": 0.05, + "grad_norm": 0.6626454057249019, + "learning_rate": 1.06e-06, + "loss": 0.6246, + "step": 106 + }, + { + "epoch": 0.05, + "grad_norm": 0.6807547559648992, + "learning_rate": 1.0700000000000001e-06, + "loss": 0.6302, + "step": 107 + }, + { + "epoch": 0.05, + "grad_norm": 0.7118070037431555, + "learning_rate": 1.08e-06, + "loss": 0.612, + "step": 108 + }, + { + "epoch": 0.05, + "grad_norm": 0.7101111808553712, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.6299, + "step": 109 + }, + { + "epoch": 0.05, + "grad_norm": 0.7361716975541328, + "learning_rate": 1.1e-06, + "loss": 0.5889, + "step": 110 + }, + { + "epoch": 0.05, + "grad_norm": 0.7507623271913958, + "learning_rate": 1.1100000000000002e-06, + "loss": 0.5939, + "step": 111 + }, + { + "epoch": 0.05, + "grad_norm": 0.69232482698427, + "learning_rate": 1.12e-06, + "loss": 0.5857, + "step": 112 + }, + { + "epoch": 0.05, + "grad_norm": 0.7560268686291741, + "learning_rate": 1.1300000000000002e-06, + "loss": 0.5926, + "step": 113 + }, + { + "epoch": 0.05, + "grad_norm": 0.6909950381406327, + "learning_rate": 1.14e-06, + "loss": 0.6259, + "step": 114 + }, + { + "epoch": 0.05, + "grad_norm": 0.705457142068379, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.594, + "step": 115 + }, + { + "epoch": 0.05, + "grad_norm": 0.7479733597296523, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.5941, + "step": 116 + }, + { + "epoch": 0.05, + "grad_norm": 0.6703707169078938, + "learning_rate": 1.1700000000000002e-06, + "loss": 0.6002, + "step": 117 + }, + { + "epoch": 0.05, + "grad_norm": 0.7142454896237415, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5721, + "step": 118 + }, + { + "epoch": 0.05, + "grad_norm": 0.6410921583173411, + "learning_rate": 1.19e-06, + "loss": 0.5549, + "step": 119 + }, + { + "epoch": 0.05, + "grad_norm": 0.7371839254744956, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.6076, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 0.6642983393421674, + "learning_rate": 1.21e-06, + "loss": 0.6164, + "step": 121 + }, + { + "epoch": 0.05, + "grad_norm": 0.6622892175821214, + "learning_rate": 1.2200000000000002e-06, + "loss": 0.5916, + "step": 122 + }, + { + "epoch": 0.05, + "grad_norm": 0.6836278867814578, + "learning_rate": 1.23e-06, + "loss": 0.6055, + "step": 123 + }, + { + "epoch": 0.05, + "grad_norm": 0.7449640043860738, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.5706, + "step": 124 + }, + { + "epoch": 0.05, + "grad_norm": 0.7030914815199217, + "learning_rate": 1.25e-06, + "loss": 0.5962, + "step": 125 + }, + { + "epoch": 0.05, + "grad_norm": 0.6636464764371871, + "learning_rate": 1.26e-06, + "loss": 0.6004, + "step": 126 + }, + { + "epoch": 0.05, + "grad_norm": 0.7292101825675744, + "learning_rate": 1.2700000000000001e-06, + "loss": 0.6002, + "step": 127 + }, + { + "epoch": 0.05, + "grad_norm": 0.7002155035767813, + "learning_rate": 1.28e-06, + "loss": 0.597, + "step": 128 + }, + { + "epoch": 0.06, + "grad_norm": 0.697947240938124, + "learning_rate": 1.2900000000000001e-06, + "loss": 0.5788, + "step": 129 + }, + { + "epoch": 0.06, + "grad_norm": 0.7106171095112853, + "learning_rate": 1.3e-06, + "loss": 0.5878, + "step": 130 + }, + { + "epoch": 0.06, + "grad_norm": 0.7712154049962048, + "learning_rate": 1.3100000000000002e-06, + "loss": 0.6198, + "step": 131 + }, + { + "epoch": 0.06, + "grad_norm": 0.9336735438619129, + "learning_rate": 1.32e-06, + "loss": 0.5883, + "step": 132 + }, + { + "epoch": 0.06, + "grad_norm": 0.6689366967241089, + "learning_rate": 1.3300000000000002e-06, + "loss": 0.5881, + "step": 133 + }, + { + "epoch": 0.06, + "grad_norm": 0.6553033178045365, + "learning_rate": 1.34e-06, + "loss": 0.564, + "step": 134 + }, + { + "epoch": 0.06, + "grad_norm": 0.7780421140633135, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5939, + "step": 135 + }, + { + "epoch": 0.06, + "grad_norm": 0.7512017485981449, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.5967, + "step": 136 + }, + { + "epoch": 0.06, + "grad_norm": 0.7273946733225628, + "learning_rate": 1.3700000000000002e-06, + "loss": 0.593, + "step": 137 + }, + { + "epoch": 0.06, + "grad_norm": 0.7070418593498196, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5848, + "step": 138 + }, + { + "epoch": 0.06, + "grad_norm": 0.7716412837183378, + "learning_rate": 1.3900000000000002e-06, + "loss": 0.6087, + "step": 139 + }, + { + "epoch": 0.06, + "grad_norm": 0.7211339598744784, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.5711, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.6962486858771757, + "learning_rate": 1.41e-06, + "loss": 0.5791, + "step": 141 + }, + { + "epoch": 0.06, + "grad_norm": 0.7001118725454738, + "learning_rate": 1.42e-06, + "loss": 0.5772, + "step": 142 + }, + { + "epoch": 0.06, + "eval_loss": 0.5818271040916443, + "eval_runtime": 6911.9695, + "eval_samples_per_second": 42.012, + "eval_steps_per_second": 2.101, + "step": 142 + }, + { + "epoch": 0.06, + "grad_norm": 0.764854682764742, + "learning_rate": 1.43e-06, + "loss": 0.6009, + "step": 143 + }, + { + "epoch": 0.06, + "grad_norm": 0.730261637210628, + "learning_rate": 1.44e-06, + "loss": 0.5785, + "step": 144 + }, + { + "epoch": 0.06, + "grad_norm": 0.8038474557194998, + "learning_rate": 1.45e-06, + "loss": 0.607, + "step": 145 + }, + { + "epoch": 0.06, + "grad_norm": 0.6835086511414872, + "learning_rate": 1.46e-06, + "loss": 0.5825, + "step": 146 + }, + { + "epoch": 0.06, + "grad_norm": 0.7505101234005623, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.6049, + "step": 147 + }, + { + "epoch": 0.06, + "grad_norm": 0.7840302962113346, + "learning_rate": 1.48e-06, + "loss": 0.5569, + "step": 148 + }, + { + "epoch": 0.06, + "grad_norm": 0.6936446529890948, + "learning_rate": 1.4900000000000001e-06, + "loss": 0.5713, + "step": 149 + }, + { + "epoch": 0.06, + "grad_norm": 0.7331878475223359, + "learning_rate": 1.5e-06, + "loss": 0.588, + "step": 150 + }, + { + "epoch": 0.06, + "grad_norm": 0.6745518982361585, + "learning_rate": 1.5100000000000002e-06, + "loss": 0.5576, + "step": 151 + }, + { + "epoch": 0.06, + "grad_norm": 0.6950585712176994, + "learning_rate": 1.52e-06, + "loss": 0.5853, + "step": 152 + }, + { + "epoch": 0.07, + "grad_norm": 0.7339007955624004, + "learning_rate": 1.5300000000000002e-06, + "loss": 0.5887, + "step": 153 + }, + { + "epoch": 0.07, + "grad_norm": 0.7086482597013368, + "learning_rate": 1.54e-06, + "loss": 0.5857, + "step": 154 + }, + { + "epoch": 0.07, + "grad_norm": 0.6929345752180732, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.5937, + "step": 155 + }, + { + "epoch": 0.07, + "grad_norm": 0.716359352444948, + "learning_rate": 1.56e-06, + "loss": 0.5459, + "step": 156 + }, + { + "epoch": 0.07, + "grad_norm": 0.7432089483211889, + "learning_rate": 1.5700000000000002e-06, + "loss": 0.5766, + "step": 157 + }, + { + "epoch": 0.07, + "grad_norm": 0.6876861316239075, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.5729, + "step": 158 + }, + { + "epoch": 0.07, + "grad_norm": 0.7652117324379928, + "learning_rate": 1.5900000000000002e-06, + "loss": 0.5714, + "step": 159 + }, + { + "epoch": 0.07, + "grad_norm": 0.6874557208270947, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.5786, + "step": 160 + }, + { + "epoch": 0.07, + "grad_norm": 0.6873913339602188, + "learning_rate": 1.6100000000000003e-06, + "loss": 0.588, + "step": 161 + }, + { + "epoch": 0.07, + "grad_norm": 0.683965861475918, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.5772, + "step": 162 + }, + { + "epoch": 0.07, + "grad_norm": 0.774874549937846, + "learning_rate": 1.6300000000000003e-06, + "loss": 0.6, + "step": 163 + }, + { + "epoch": 0.07, + "grad_norm": 0.7021871067570723, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.5689, + "step": 164 + }, + { + "epoch": 0.07, + "grad_norm": 0.7209423777996005, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.5607, + "step": 165 + }, + { + "epoch": 0.07, + "grad_norm": 0.6803679044895155, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.609, + "step": 166 + }, + { + "epoch": 0.07, + "grad_norm": 0.7712655814872096, + "learning_rate": 1.6700000000000003e-06, + "loss": 0.5764, + "step": 167 + }, + { + "epoch": 0.07, + "grad_norm": 0.7619624904596949, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.5599, + "step": 168 + }, + { + "epoch": 0.07, + "grad_norm": 0.669785223369481, + "learning_rate": 1.6900000000000003e-06, + "loss": 0.5518, + "step": 169 + }, + { + "epoch": 0.07, + "grad_norm": 0.7392059624798044, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.595, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 0.6847168836492608, + "learning_rate": 1.7100000000000004e-06, + "loss": 0.5554, + "step": 171 + }, + { + "epoch": 0.07, + "grad_norm": 0.7793064159857329, + "learning_rate": 1.72e-06, + "loss": 0.5905, + "step": 172 + }, + { + "epoch": 0.07, + "grad_norm": 0.6760936921129691, + "learning_rate": 1.73e-06, + "loss": 0.5278, + "step": 173 + }, + { + "epoch": 0.07, + "grad_norm": 0.7120881185318579, + "learning_rate": 1.74e-06, + "loss": 0.562, + "step": 174 + }, + { + "epoch": 0.07, + "grad_norm": 0.6777641027946281, + "learning_rate": 1.75e-06, + "loss": 0.5967, + "step": 175 + }, + { + "epoch": 0.08, + "grad_norm": 0.6644124127387283, + "learning_rate": 1.76e-06, + "loss": 0.5685, + "step": 176 + }, + { + "epoch": 0.08, + "grad_norm": 0.6633049014407767, + "learning_rate": 1.77e-06, + "loss": 0.5694, + "step": 177 + }, + { + "epoch": 0.08, + "grad_norm": 0.6598336805084065, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.5556, + "step": 178 + }, + { + "epoch": 0.08, + "grad_norm": 0.7006058904587646, + "learning_rate": 1.79e-06, + "loss": 0.5769, + "step": 179 + }, + { + "epoch": 0.08, + "grad_norm": 0.7269153745138233, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.5725, + "step": 180 + }, + { + "epoch": 0.08, + "grad_norm": 0.7282646554724388, + "learning_rate": 1.81e-06, + "loss": 0.5593, + "step": 181 + }, + { + "epoch": 0.08, + "grad_norm": 0.857099360195062, + "learning_rate": 1.8200000000000002e-06, + "loss": 0.5669, + "step": 182 + }, + { + "epoch": 0.08, + "grad_norm": 0.6950980962926347, + "learning_rate": 1.83e-06, + "loss": 0.5777, + "step": 183 + }, + { + "epoch": 0.08, + "grad_norm": 0.6604073281679678, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.5918, + "step": 184 + }, + { + "epoch": 0.08, + "grad_norm": 0.6690212574223983, + "learning_rate": 1.85e-06, + "loss": 0.561, + "step": 185 + }, + { + "epoch": 0.08, + "grad_norm": 0.6959851809345552, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.5523, + "step": 186 + }, + { + "epoch": 0.08, + "grad_norm": 3.375144332117638, + "learning_rate": 1.87e-06, + "loss": 0.5612, + "step": 187 + }, + { + "epoch": 0.08, + "grad_norm": 0.7710335154558103, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.5515, + "step": 188 + }, + { + "epoch": 0.08, + "grad_norm": 0.7387587574522239, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.548, + "step": 189 + }, + { + "epoch": 0.08, + "grad_norm": 0.6895693730643745, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.5525, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.6809790348929371, + "learning_rate": 1.9100000000000003e-06, + "loss": 0.5539, + "step": 191 + }, + { + "epoch": 0.08, + "grad_norm": 0.6975774380503847, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.5576, + "step": 192 + }, + { + "epoch": 0.08, + "grad_norm": 0.65978668039197, + "learning_rate": 1.93e-06, + "loss": 0.5546, + "step": 193 + }, + { + "epoch": 0.08, + "grad_norm": 0.7381481487351611, + "learning_rate": 1.94e-06, + "loss": 0.5362, + "step": 194 + }, + { + "epoch": 0.08, + "grad_norm": 0.7590514250142023, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.5496, + "step": 195 + }, + { + "epoch": 0.08, + "grad_norm": 0.6814269181772578, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.595, + "step": 196 + }, + { + "epoch": 0.08, + "grad_norm": 0.7290336017009336, + "learning_rate": 1.97e-06, + "loss": 0.5845, + "step": 197 + }, + { + "epoch": 0.08, + "grad_norm": 0.7087771732688267, + "learning_rate": 1.98e-06, + "loss": 0.5425, + "step": 198 + }, + { + "epoch": 0.09, + "grad_norm": 0.6740991887260622, + "learning_rate": 1.9900000000000004e-06, + "loss": 0.5697, + "step": 199 + }, + { + "epoch": 0.09, + "grad_norm": 0.7692213485546533, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5832, + "step": 200 + }, + { + "epoch": 0.09, + "grad_norm": 0.7310442342385465, + "learning_rate": 2.0100000000000002e-06, + "loss": 0.5484, + "step": 201 + }, + { + "epoch": 0.09, + "grad_norm": 0.6807932721845763, + "learning_rate": 2.02e-06, + "loss": 0.5476, + "step": 202 + }, + { + "epoch": 0.09, + "grad_norm": 0.703371626783706, + "learning_rate": 2.0300000000000005e-06, + "loss": 0.6031, + "step": 203 + }, + { + "epoch": 0.09, + "grad_norm": 0.6802894338699406, + "learning_rate": 2.04e-06, + "loss": 0.5828, + "step": 204 + }, + { + "epoch": 0.09, + "grad_norm": 0.6652596394872644, + "learning_rate": 2.05e-06, + "loss": 0.5614, + "step": 205 + }, + { + "epoch": 0.09, + "grad_norm": 0.737938303801843, + "learning_rate": 2.06e-06, + "loss": 0.5649, + "step": 206 + }, + { + "epoch": 0.09, + "grad_norm": 0.6851616685774734, + "learning_rate": 2.07e-06, + "loss": 0.5923, + "step": 207 + }, + { + "epoch": 0.09, + "grad_norm": 0.7193070222820541, + "learning_rate": 2.08e-06, + "loss": 0.5344, + "step": 208 + }, + { + "epoch": 0.09, + "grad_norm": 0.6982256320037129, + "learning_rate": 2.09e-06, + "loss": 0.524, + "step": 209 + }, + { + "epoch": 0.09, + "grad_norm": 0.6731186167503178, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.524, + "step": 210 + }, + { + "epoch": 0.09, + "grad_norm": 1.1798176224305597, + "learning_rate": 2.11e-06, + "loss": 0.5603, + "step": 211 + }, + { + "epoch": 0.09, + "grad_norm": 0.7003894417531635, + "learning_rate": 2.12e-06, + "loss": 0.5547, + "step": 212 + }, + { + "epoch": 0.09, + "grad_norm": 0.6678276891220589, + "learning_rate": 2.13e-06, + "loss": 0.543, + "step": 213 + }, + { + "epoch": 0.09, + "eval_loss": 0.5565809607505798, + "eval_runtime": 6927.1816, + "eval_samples_per_second": 41.92, + "eval_steps_per_second": 2.096, + "step": 213 + }, + { + "epoch": 0.09, + "grad_norm": 0.6757438454080638, + "learning_rate": 2.1400000000000003e-06, + "loss": 0.5468, + "step": 214 + }, + { + "epoch": 0.09, + "grad_norm": 0.770869092645212, + "learning_rate": 2.15e-06, + "loss": 0.56, + "step": 215 + }, + { + "epoch": 0.09, + "grad_norm": 0.7073954416641812, + "learning_rate": 2.16e-06, + "loss": 0.5481, + "step": 216 + }, + { + "epoch": 0.09, + "grad_norm": 0.7155585643449607, + "learning_rate": 2.17e-06, + "loss": 0.5616, + "step": 217 + }, + { + "epoch": 0.09, + "grad_norm": 0.7290481543044689, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.5418, + "step": 218 + }, + { + "epoch": 0.09, + "grad_norm": 0.8118091409501089, + "learning_rate": 2.19e-06, + "loss": 0.5438, + "step": 219 + }, + { + "epoch": 0.09, + "grad_norm": 1.8302800073178411, + "learning_rate": 2.2e-06, + "loss": 0.5487, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 0.7364426707486922, + "learning_rate": 2.21e-06, + "loss": 0.5855, + "step": 221 + }, + { + "epoch": 0.09, + "grad_norm": 1.456340142180152, + "learning_rate": 2.2200000000000003e-06, + "loss": 0.5836, + "step": 222 + }, + { + "epoch": 0.1, + "grad_norm": 0.7385740541544212, + "learning_rate": 2.2300000000000002e-06, + "loss": 0.5697, + "step": 223 + }, + { + "epoch": 0.1, + "grad_norm": 0.84040439389078, + "learning_rate": 2.24e-06, + "loss": 0.5688, + "step": 224 + }, + { + "epoch": 0.1, + "grad_norm": 0.6791205410588949, + "learning_rate": 2.25e-06, + "loss": 0.5504, + "step": 225 + }, + { + "epoch": 0.1, + "grad_norm": 0.7249144826404104, + "learning_rate": 2.2600000000000004e-06, + "loss": 0.5573, + "step": 226 + }, + { + "epoch": 0.1, + "grad_norm": 0.7512914278711404, + "learning_rate": 2.2700000000000003e-06, + "loss": 0.5443, + "step": 227 + }, + { + "epoch": 0.1, + "grad_norm": 0.7283685135498201, + "learning_rate": 2.28e-06, + "loss": 0.5423, + "step": 228 + }, + { + "epoch": 0.1, + "grad_norm": 0.745183565987288, + "learning_rate": 2.29e-06, + "loss": 0.5304, + "step": 229 + }, + { + "epoch": 0.1, + "grad_norm": 0.738561325322298, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.5406, + "step": 230 + }, + { + "epoch": 0.1, + "grad_norm": 0.8285144394816474, + "learning_rate": 2.3100000000000003e-06, + "loss": 0.5299, + "step": 231 + }, + { + "epoch": 0.1, + "grad_norm": 0.6778837717707735, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.5195, + "step": 232 + }, + { + "epoch": 0.1, + "grad_norm": 0.7719966350372699, + "learning_rate": 2.33e-06, + "loss": 0.5571, + "step": 233 + }, + { + "epoch": 0.1, + "grad_norm": 0.76143480039083, + "learning_rate": 2.3400000000000005e-06, + "loss": 0.5392, + "step": 234 + }, + { + "epoch": 0.1, + "grad_norm": 0.7799555692877767, + "learning_rate": 2.35e-06, + "loss": 0.5675, + "step": 235 + }, + { + "epoch": 0.1, + "grad_norm": 2.172177763095597, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.5751, + "step": 236 + }, + { + "epoch": 0.1, + "grad_norm": 0.7382398234329495, + "learning_rate": 2.37e-06, + "loss": 0.541, + "step": 237 + }, + { + "epoch": 0.1, + "grad_norm": 0.7869817068341356, + "learning_rate": 2.38e-06, + "loss": 0.5616, + "step": 238 + }, + { + "epoch": 0.1, + "grad_norm": 0.747149871498409, + "learning_rate": 2.39e-06, + "loss": 0.5368, + "step": 239 + }, + { + "epoch": 0.1, + "grad_norm": 0.709533915967845, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.5401, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.8009167743191223, + "learning_rate": 2.4100000000000002e-06, + "loss": 0.5553, + "step": 241 + }, + { + "epoch": 0.1, + "grad_norm": 0.7824397361977652, + "learning_rate": 2.42e-06, + "loss": 0.5582, + "step": 242 + }, + { + "epoch": 0.1, + "grad_norm": 0.7713371690459996, + "learning_rate": 2.43e-06, + "loss": 0.5567, + "step": 243 + }, + { + "epoch": 0.1, + "grad_norm": 0.6841472190627085, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.5255, + "step": 244 + }, + { + "epoch": 0.1, + "grad_norm": 0.7395314286170461, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.5474, + "step": 245 + }, + { + "epoch": 0.11, + "grad_norm": 0.9037560995126637, + "learning_rate": 2.46e-06, + "loss": 0.5443, + "step": 246 + }, + { + "epoch": 0.11, + "grad_norm": 0.7394243896953805, + "learning_rate": 2.47e-06, + "loss": 0.5654, + "step": 247 + }, + { + "epoch": 0.11, + "grad_norm": 0.7368266522326133, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.5588, + "step": 248 + }, + { + "epoch": 0.11, + "grad_norm": 0.7216141071484306, + "learning_rate": 2.4900000000000003e-06, + "loss": 0.546, + "step": 249 + }, + { + "epoch": 0.11, + "grad_norm": 0.7381090709238742, + "learning_rate": 2.5e-06, + "loss": 0.5359, + "step": 250 + }, + { + "epoch": 0.11, + "grad_norm": 0.81508667979967, + "learning_rate": 2.51e-06, + "loss": 0.5831, + "step": 251 + }, + { + "epoch": 0.11, + "grad_norm": 0.7507679622871583, + "learning_rate": 2.52e-06, + "loss": 0.5536, + "step": 252 + }, + { + "epoch": 0.11, + "grad_norm": 0.725818735593307, + "learning_rate": 2.5300000000000003e-06, + "loss": 0.5301, + "step": 253 + }, + { + "epoch": 0.11, + "grad_norm": 0.7171172879542945, + "learning_rate": 2.5400000000000002e-06, + "loss": 0.562, + "step": 254 + }, + { + "epoch": 0.11, + "grad_norm": 0.8845290493858748, + "learning_rate": 2.55e-06, + "loss": 0.5641, + "step": 255 + }, + { + "epoch": 0.11, + "grad_norm": 0.7748353959313085, + "learning_rate": 2.56e-06, + "loss": 0.5617, + "step": 256 + }, + { + "epoch": 0.11, + "grad_norm": 1.4792826780398232, + "learning_rate": 2.5700000000000004e-06, + "loss": 0.5218, + "step": 257 + }, + { + "epoch": 0.11, + "grad_norm": 0.7755584686345505, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.5549, + "step": 258 + }, + { + "epoch": 0.11, + "grad_norm": 0.7144098990925435, + "learning_rate": 2.59e-06, + "loss": 0.5452, + "step": 259 + }, + { + "epoch": 0.11, + "grad_norm": 0.7016327562812078, + "learning_rate": 2.6e-06, + "loss": 0.5359, + "step": 260 + }, + { + "epoch": 0.11, + "grad_norm": 0.7674707640501007, + "learning_rate": 2.6100000000000004e-06, + "loss": 0.5492, + "step": 261 + }, + { + "epoch": 0.11, + "grad_norm": 0.6991460195923854, + "learning_rate": 2.6200000000000003e-06, + "loss": 0.5739, + "step": 262 + }, + { + "epoch": 0.11, + "grad_norm": 0.7481462435947455, + "learning_rate": 2.6300000000000002e-06, + "loss": 0.5382, + "step": 263 + }, + { + "epoch": 0.11, + "grad_norm": 0.6903104793011363, + "learning_rate": 2.64e-06, + "loss": 0.5774, + "step": 264 + }, + { + "epoch": 0.11, + "grad_norm": 0.7276434522232335, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.5395, + "step": 265 + }, + { + "epoch": 0.11, + "grad_norm": 0.7049348082872935, + "learning_rate": 2.6600000000000004e-06, + "loss": 0.5077, + "step": 266 + }, + { + "epoch": 0.11, + "grad_norm": 0.6860853746928447, + "learning_rate": 2.6700000000000003e-06, + "loss": 0.5365, + "step": 267 + }, + { + "epoch": 0.11, + "grad_norm": 0.7019598422590565, + "learning_rate": 2.68e-06, + "loss": 0.5405, + "step": 268 + }, + { + "epoch": 0.11, + "grad_norm": 0.7452321432305177, + "learning_rate": 2.6900000000000005e-06, + "loss": 0.5301, + "step": 269 + }, + { + "epoch": 0.12, + "grad_norm": 0.6869176763831081, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.5294, + "step": 270 + }, + { + "epoch": 0.12, + "grad_norm": 0.923474966878648, + "learning_rate": 2.7100000000000003e-06, + "loss": 0.5341, + "step": 271 + }, + { + "epoch": 0.12, + "grad_norm": 0.6685065451328585, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.5364, + "step": 272 + }, + { + "epoch": 0.12, + "grad_norm": 0.7024497350510225, + "learning_rate": 2.7300000000000005e-06, + "loss": 0.5411, + "step": 273 + }, + { + "epoch": 0.12, + "grad_norm": 0.6954294556378668, + "learning_rate": 2.7400000000000004e-06, + "loss": 0.5599, + "step": 274 + }, + { + "epoch": 0.12, + "grad_norm": 0.7307559316445182, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.5546, + "step": 275 + }, + { + "epoch": 0.12, + "grad_norm": 0.6817035611374646, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.5252, + "step": 276 + }, + { + "epoch": 0.12, + "grad_norm": 0.6749522949619997, + "learning_rate": 2.7700000000000006e-06, + "loss": 0.5329, + "step": 277 + }, + { + "epoch": 0.12, + "grad_norm": 0.6885720885016431, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.5437, + "step": 278 + }, + { + "epoch": 0.12, + "grad_norm": 0.6847069171457737, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.5298, + "step": 279 + }, + { + "epoch": 0.12, + "grad_norm": 0.6540194496966271, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.5129, + "step": 280 + }, + { + "epoch": 0.12, + "grad_norm": 0.7232645465800646, + "learning_rate": 2.8100000000000006e-06, + "loss": 0.5727, + "step": 281 + }, + { + "epoch": 0.12, + "grad_norm": 0.6686647596073149, + "learning_rate": 2.82e-06, + "loss": 0.5507, + "step": 282 + }, + { + "epoch": 0.12, + "grad_norm": 0.6600278121030401, + "learning_rate": 2.83e-06, + "loss": 0.5413, + "step": 283 + }, + { + "epoch": 0.12, + "grad_norm": 0.7295788704850122, + "learning_rate": 2.84e-06, + "loss": 0.5644, + "step": 284 + }, + { + "epoch": 0.12, + "eval_loss": 0.5427850484848022, + "eval_runtime": 6910.0669, + "eval_samples_per_second": 42.024, + "eval_steps_per_second": 2.101, + "step": 284 + }, + { + "epoch": 0.12, + "grad_norm": 0.7172749652762687, + "learning_rate": 2.85e-06, + "loss": 0.5586, + "step": 285 + }, + { + "epoch": 0.12, + "grad_norm": 0.7146040704416872, + "learning_rate": 2.86e-06, + "loss": 0.5468, + "step": 286 + }, + { + "epoch": 0.12, + "grad_norm": 0.6773322028612333, + "learning_rate": 2.87e-06, + "loss": 0.5439, + "step": 287 + }, + { + "epoch": 0.12, + "grad_norm": 0.6807935236750091, + "learning_rate": 2.88e-06, + "loss": 0.541, + "step": 288 + }, + { + "epoch": 0.12, + "grad_norm": 0.7137692746953846, + "learning_rate": 2.89e-06, + "loss": 0.5299, + "step": 289 + }, + { + "epoch": 0.12, + "grad_norm": 0.7079425743756034, + "learning_rate": 2.9e-06, + "loss": 0.5591, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.6971956190090158, + "learning_rate": 2.91e-06, + "loss": 0.5515, + "step": 291 + }, + { + "epoch": 0.12, + "grad_norm": 0.7203057497367688, + "learning_rate": 2.92e-06, + "loss": 0.5335, + "step": 292 + }, + { + "epoch": 0.13, + "grad_norm": 0.6508573931612972, + "learning_rate": 2.93e-06, + "loss": 0.5242, + "step": 293 + }, + { + "epoch": 0.13, + "grad_norm": 0.682319005640009, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.544, + "step": 294 + }, + { + "epoch": 0.13, + "grad_norm": 0.6467072142132468, + "learning_rate": 2.95e-06, + "loss": 0.5236, + "step": 295 + }, + { + "epoch": 0.13, + "grad_norm": 0.6776154655530738, + "learning_rate": 2.96e-06, + "loss": 0.5271, + "step": 296 + }, + { + "epoch": 0.13, + "grad_norm": 0.6938107980975571, + "learning_rate": 2.97e-06, + "loss": 0.5382, + "step": 297 + }, + { + "epoch": 0.13, + "grad_norm": 0.721981231896848, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.5911, + "step": 298 + }, + { + "epoch": 0.13, + "grad_norm": 0.712390399983667, + "learning_rate": 2.99e-06, + "loss": 0.5546, + "step": 299 + }, + { + "epoch": 0.13, + "grad_norm": 0.7104303392210323, + "learning_rate": 3e-06, + "loss": 0.5519, + "step": 300 + }, + { + "epoch": 0.13, + "grad_norm": 0.6771404308712767, + "learning_rate": 3.01e-06, + "loss": 0.535, + "step": 301 + }, + { + "epoch": 0.13, + "grad_norm": 0.6911636460013854, + "learning_rate": 3.0200000000000003e-06, + "loss": 0.5293, + "step": 302 + }, + { + "epoch": 0.13, + "grad_norm": 0.6692540043016396, + "learning_rate": 3.0300000000000002e-06, + "loss": 0.542, + "step": 303 + }, + { + "epoch": 0.13, + "grad_norm": 0.6744768013007634, + "learning_rate": 3.04e-06, + "loss": 0.515, + "step": 304 + }, + { + "epoch": 0.13, + "grad_norm": 0.670911158940749, + "learning_rate": 3.05e-06, + "loss": 0.5163, + "step": 305 + }, + { + "epoch": 0.13, + "grad_norm": 0.663474423719772, + "learning_rate": 3.0600000000000003e-06, + "loss": 0.5305, + "step": 306 + }, + { + "epoch": 0.13, + "grad_norm": 0.6925074950155179, + "learning_rate": 3.0700000000000003e-06, + "loss": 0.5415, + "step": 307 + }, + { + "epoch": 0.13, + "grad_norm": 0.6934197367843478, + "learning_rate": 3.08e-06, + "loss": 0.5525, + "step": 308 + }, + { + "epoch": 0.13, + "grad_norm": 0.6661951817270549, + "learning_rate": 3.09e-06, + "loss": 0.5608, + "step": 309 + }, + { + "epoch": 0.13, + "grad_norm": 0.6502761715692771, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.5597, + "step": 310 + }, + { + "epoch": 0.13, + "grad_norm": 0.644832829029986, + "learning_rate": 3.1100000000000003e-06, + "loss": 0.5438, + "step": 311 + }, + { + "epoch": 0.13, + "grad_norm": 0.6264165348789397, + "learning_rate": 3.12e-06, + "loss": 0.5292, + "step": 312 + }, + { + "epoch": 0.13, + "grad_norm": 0.7028440661397689, + "learning_rate": 3.13e-06, + "loss": 0.5475, + "step": 313 + }, + { + "epoch": 0.13, + "grad_norm": 0.6530445345568231, + "learning_rate": 3.1400000000000004e-06, + "loss": 0.5567, + "step": 314 + }, + { + "epoch": 0.13, + "grad_norm": 0.6365729580283129, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.5375, + "step": 315 + }, + { + "epoch": 0.14, + "grad_norm": 0.7033548602718471, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.5691, + "step": 316 + }, + { + "epoch": 0.14, + "grad_norm": 0.6757049797519596, + "learning_rate": 3.17e-06, + "loss": 0.5206, + "step": 317 + }, + { + "epoch": 0.14, + "grad_norm": 0.6685908127465012, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.5304, + "step": 318 + }, + { + "epoch": 0.14, + "grad_norm": 0.6793556517907352, + "learning_rate": 3.1900000000000004e-06, + "loss": 0.5217, + "step": 319 + }, + { + "epoch": 0.14, + "grad_norm": 0.6523324979691255, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.501, + "step": 320 + }, + { + "epoch": 0.14, + "grad_norm": 0.6443915969872007, + "learning_rate": 3.21e-06, + "loss": 0.5315, + "step": 321 + }, + { + "epoch": 0.14, + "grad_norm": 0.6681792930023069, + "learning_rate": 3.2200000000000005e-06, + "loss": 0.5411, + "step": 322 + }, + { + "epoch": 0.14, + "grad_norm": 0.6976917063523176, + "learning_rate": 3.2300000000000004e-06, + "loss": 0.5356, + "step": 323 + }, + { + "epoch": 0.14, + "grad_norm": 0.6455932268651956, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.5085, + "step": 324 + }, + { + "epoch": 0.14, + "grad_norm": 0.7324457281506692, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.5804, + "step": 325 + }, + { + "epoch": 0.14, + "grad_norm": 0.6887957564513499, + "learning_rate": 3.2600000000000006e-06, + "loss": 0.5315, + "step": 326 + }, + { + "epoch": 0.14, + "grad_norm": 0.6933538113778166, + "learning_rate": 3.2700000000000005e-06, + "loss": 0.529, + "step": 327 + }, + { + "epoch": 0.14, + "grad_norm": 0.6968756135404809, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.509, + "step": 328 + }, + { + "epoch": 0.14, + "grad_norm": 0.6547487769661117, + "learning_rate": 3.2900000000000003e-06, + "loss": 0.5319, + "step": 329 + }, + { + "epoch": 0.14, + "grad_norm": 0.6918958083567368, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.5369, + "step": 330 + }, + { + "epoch": 0.14, + "grad_norm": 0.7537331163722618, + "learning_rate": 3.3100000000000005e-06, + "loss": 0.5559, + "step": 331 + }, + { + "epoch": 0.14, + "grad_norm": 0.6622519534641422, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.5399, + "step": 332 + }, + { + "epoch": 0.14, + "grad_norm": 0.7164771428347083, + "learning_rate": 3.3300000000000003e-06, + "loss": 0.5565, + "step": 333 + }, + { + "epoch": 0.14, + "grad_norm": 0.7082376525584928, + "learning_rate": 3.3400000000000006e-06, + "loss": 0.5212, + "step": 334 + }, + { + "epoch": 0.14, + "grad_norm": 0.6726486174520817, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.5214, + "step": 335 + }, + { + "epoch": 0.14, + "grad_norm": 0.6698599823643026, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.5392, + "step": 336 + }, + { + "epoch": 0.14, + "grad_norm": 0.7412977141620687, + "learning_rate": 3.3700000000000003e-06, + "loss": 0.5786, + "step": 337 + }, + { + "epoch": 0.14, + "grad_norm": 0.7232716108771607, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.5315, + "step": 338 + }, + { + "epoch": 0.14, + "grad_norm": 0.6392581387125058, + "learning_rate": 3.3900000000000006e-06, + "loss": 0.5443, + "step": 339 + }, + { + "epoch": 0.15, + "grad_norm": 0.709924249993068, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.5518, + "step": 340 + }, + { + "epoch": 0.15, + "grad_norm": 0.6805700381054698, + "learning_rate": 3.4100000000000004e-06, + "loss": 0.51, + "step": 341 + }, + { + "epoch": 0.15, + "grad_norm": 0.6404690210430515, + "learning_rate": 3.4200000000000007e-06, + "loss": 0.5482, + "step": 342 + }, + { + "epoch": 0.15, + "grad_norm": 0.7350376016254183, + "learning_rate": 3.4300000000000006e-06, + "loss": 0.4998, + "step": 343 + }, + { + "epoch": 0.15, + "grad_norm": 0.692941753107107, + "learning_rate": 3.44e-06, + "loss": 0.5174, + "step": 344 + }, + { + "epoch": 0.15, + "grad_norm": 0.6917522453335027, + "learning_rate": 3.45e-06, + "loss": 0.5252, + "step": 345 + }, + { + "epoch": 0.15, + "grad_norm": 0.7698041778535101, + "learning_rate": 3.46e-06, + "loss": 0.5116, + "step": 346 + }, + { + "epoch": 0.15, + "grad_norm": 0.6534415447218267, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.5324, + "step": 347 + }, + { + "epoch": 0.15, + "grad_norm": 0.6553279473118478, + "learning_rate": 3.48e-06, + "loss": 0.4979, + "step": 348 + }, + { + "epoch": 0.15, + "grad_norm": 0.7260157286864319, + "learning_rate": 3.49e-06, + "loss": 0.5883, + "step": 349 + }, + { + "epoch": 0.15, + "grad_norm": 0.6509209097177096, + "learning_rate": 3.5e-06, + "loss": 0.5532, + "step": 350 + }, + { + "epoch": 0.15, + "grad_norm": 0.7431590152540184, + "learning_rate": 3.5100000000000003e-06, + "loss": 0.5651, + "step": 351 + }, + { + "epoch": 0.15, + "grad_norm": 0.7154996648263839, + "learning_rate": 3.52e-06, + "loss": 0.5522, + "step": 352 + }, + { + "epoch": 0.15, + "grad_norm": 0.6949365648624505, + "learning_rate": 3.53e-06, + "loss": 0.5648, + "step": 353 + }, + { + "epoch": 0.15, + "grad_norm": 0.6507409922513757, + "learning_rate": 3.54e-06, + "loss": 0.5278, + "step": 354 + }, + { + "epoch": 0.15, + "grad_norm": 0.7005428814595557, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.5555, + "step": 355 + }, + { + "epoch": 0.15, + "eval_loss": 0.5336291193962097, + "eval_runtime": 6911.9871, + "eval_samples_per_second": 42.012, + "eval_steps_per_second": 2.101, + "step": 355 + }, + { + "epoch": 0.15, + "grad_norm": 0.6932495434274139, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.515, + "step": 356 + }, + { + "epoch": 0.15, + "grad_norm": 0.6592137259570553, + "learning_rate": 3.57e-06, + "loss": 0.5593, + "step": 357 + }, + { + "epoch": 0.15, + "grad_norm": 0.6621976521925507, + "learning_rate": 3.58e-06, + "loss": 0.5314, + "step": 358 + }, + { + "epoch": 0.15, + "grad_norm": 0.6743898928954292, + "learning_rate": 3.5900000000000004e-06, + "loss": 0.5458, + "step": 359 + }, + { + "epoch": 0.15, + "grad_norm": 0.6270336100237296, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.5281, + "step": 360 + }, + { + "epoch": 0.15, + "grad_norm": 0.718834892269395, + "learning_rate": 3.61e-06, + "loss": 0.54, + "step": 361 + }, + { + "epoch": 0.15, + "grad_norm": 0.6096717034209563, + "learning_rate": 3.62e-06, + "loss": 0.5023, + "step": 362 + }, + { + "epoch": 0.16, + "grad_norm": 0.6952055757879536, + "learning_rate": 3.6300000000000004e-06, + "loss": 0.5268, + "step": 363 + }, + { + "epoch": 0.16, + "grad_norm": 0.6571216773223735, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.5314, + "step": 364 + }, + { + "epoch": 0.16, + "grad_norm": 0.644276415756237, + "learning_rate": 3.65e-06, + "loss": 0.5194, + "step": 365 + }, + { + "epoch": 0.16, + "grad_norm": 1.298671233861733, + "learning_rate": 3.66e-06, + "loss": 0.5117, + "step": 366 + }, + { + "epoch": 0.16, + "grad_norm": 0.7103634954229385, + "learning_rate": 3.6700000000000004e-06, + "loss": 0.5385, + "step": 367 + }, + { + "epoch": 0.16, + "grad_norm": 0.6104474479946076, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.4998, + "step": 368 + }, + { + "epoch": 0.16, + "grad_norm": 0.6896768428677847, + "learning_rate": 3.6900000000000002e-06, + "loss": 0.533, + "step": 369 + }, + { + "epoch": 0.16, + "grad_norm": 0.6413918813463226, + "learning_rate": 3.7e-06, + "loss": 0.5312, + "step": 370 + }, + { + "epoch": 0.16, + "grad_norm": 0.7573960000478444, + "learning_rate": 3.7100000000000005e-06, + "loss": 0.5118, + "step": 371 + }, + { + "epoch": 0.16, + "grad_norm": 0.6754673328160921, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.531, + "step": 372 + }, + { + "epoch": 0.16, + "grad_norm": 0.7380227752448969, + "learning_rate": 3.7300000000000003e-06, + "loss": 0.553, + "step": 373 + }, + { + "epoch": 0.16, + "grad_norm": 0.7101835866153544, + "learning_rate": 3.74e-06, + "loss": 0.5356, + "step": 374 + }, + { + "epoch": 0.16, + "grad_norm": 0.6854482179866157, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.5241, + "step": 375 + }, + { + "epoch": 0.16, + "grad_norm": 0.6626980381085176, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.51, + "step": 376 + }, + { + "epoch": 0.16, + "grad_norm": 0.6510834607877704, + "learning_rate": 3.7700000000000003e-06, + "loss": 0.5231, + "step": 377 + }, + { + "epoch": 0.16, + "grad_norm": 0.6659115075290825, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.5164, + "step": 378 + }, + { + "epoch": 0.16, + "grad_norm": 0.6977709143095813, + "learning_rate": 3.79e-06, + "loss": 0.5529, + "step": 379 + }, + { + "epoch": 0.16, + "grad_norm": 0.6429659300311611, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.5177, + "step": 380 + }, + { + "epoch": 0.16, + "grad_norm": 0.6780609817770847, + "learning_rate": 3.8100000000000004e-06, + "loss": 0.5017, + "step": 381 + }, + { + "epoch": 0.16, + "grad_norm": 0.6312430732177394, + "learning_rate": 3.820000000000001e-06, + "loss": 0.5034, + "step": 382 + }, + { + "epoch": 0.16, + "grad_norm": 0.6557950564147815, + "learning_rate": 3.830000000000001e-06, + "loss": 0.5297, + "step": 383 + }, + { + "epoch": 0.16, + "grad_norm": 0.7189637600222241, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.5396, + "step": 384 + }, + { + "epoch": 0.16, + "grad_norm": 0.7535141909531238, + "learning_rate": 3.85e-06, + "loss": 0.5572, + "step": 385 + }, + { + "epoch": 0.16, + "grad_norm": 0.6848030312310904, + "learning_rate": 3.86e-06, + "loss": 0.5135, + "step": 386 + }, + { + "epoch": 0.17, + "grad_norm": 0.6605297036299905, + "learning_rate": 3.87e-06, + "loss": 0.5576, + "step": 387 + }, + { + "epoch": 0.17, + "grad_norm": 0.6988172021818264, + "learning_rate": 3.88e-06, + "loss": 0.5281, + "step": 388 + }, + { + "epoch": 0.17, + "grad_norm": 0.6511359929973235, + "learning_rate": 3.89e-06, + "loss": 0.5148, + "step": 389 + }, + { + "epoch": 0.17, + "grad_norm": 0.74475476203548, + "learning_rate": 3.900000000000001e-06, + "loss": 0.5313, + "step": 390 + }, + { + "epoch": 0.17, + "grad_norm": 0.6768085651878416, + "learning_rate": 3.910000000000001e-06, + "loss": 0.5289, + "step": 391 + }, + { + "epoch": 0.17, + "grad_norm": 0.6590289960576611, + "learning_rate": 3.920000000000001e-06, + "loss": 0.526, + "step": 392 + }, + { + "epoch": 0.17, + "grad_norm": 0.6294447240216576, + "learning_rate": 3.9300000000000005e-06, + "loss": 0.5066, + "step": 393 + }, + { + "epoch": 0.17, + "grad_norm": 0.6795503573063048, + "learning_rate": 3.94e-06, + "loss": 0.5265, + "step": 394 + }, + { + "epoch": 0.17, + "grad_norm": 0.6327421219484214, + "learning_rate": 3.95e-06, + "loss": 0.5262, + "step": 395 + }, + { + "epoch": 0.17, + "grad_norm": 0.6702321778478743, + "learning_rate": 3.96e-06, + "loss": 0.5521, + "step": 396 + }, + { + "epoch": 0.17, + "grad_norm": 0.6601601082950667, + "learning_rate": 3.97e-06, + "loss": 0.5511, + "step": 397 + }, + { + "epoch": 0.17, + "grad_norm": 0.672312807049255, + "learning_rate": 3.980000000000001e-06, + "loss": 0.581, + "step": 398 + }, + { + "epoch": 0.17, + "grad_norm": 0.70887535443515, + "learning_rate": 3.990000000000001e-06, + "loss": 0.5693, + "step": 399 + }, + { + "epoch": 0.17, + "grad_norm": 0.674197659542202, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5304, + "step": 400 + }, + { + "epoch": 0.17, + "grad_norm": 0.6367460409014842, + "learning_rate": 4.0100000000000006e-06, + "loss": 0.5081, + "step": 401 + }, + { + "epoch": 0.17, + "grad_norm": 0.6857813275083907, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.4973, + "step": 402 + }, + { + "epoch": 0.17, + "grad_norm": 0.6556472016511119, + "learning_rate": 4.03e-06, + "loss": 0.5389, + "step": 403 + }, + { + "epoch": 0.17, + "grad_norm": 0.6985422078815098, + "learning_rate": 4.04e-06, + "loss": 0.5289, + "step": 404 + }, + { + "epoch": 0.17, + "grad_norm": 0.6649519288148354, + "learning_rate": 4.05e-06, + "loss": 0.5379, + "step": 405 + }, + { + "epoch": 0.17, + "grad_norm": 0.6263707838794045, + "learning_rate": 4.060000000000001e-06, + "loss": 0.5421, + "step": 406 + }, + { + "epoch": 0.17, + "grad_norm": 0.7664906236471626, + "learning_rate": 4.07e-06, + "loss": 0.5535, + "step": 407 + }, + { + "epoch": 0.17, + "grad_norm": 0.6553639230258015, + "learning_rate": 4.08e-06, + "loss": 0.5349, + "step": 408 + }, + { + "epoch": 0.17, + "grad_norm": 0.6750627011135212, + "learning_rate": 4.09e-06, + "loss": 0.5453, + "step": 409 + }, + { + "epoch": 0.18, + "grad_norm": 0.6577154298564163, + "learning_rate": 4.1e-06, + "loss": 0.5082, + "step": 410 + }, + { + "epoch": 0.18, + "grad_norm": 0.6630263227345423, + "learning_rate": 4.1100000000000005e-06, + "loss": 0.5856, + "step": 411 + }, + { + "epoch": 0.18, + "grad_norm": 0.6574426137463745, + "learning_rate": 4.12e-06, + "loss": 0.5384, + "step": 412 + }, + { + "epoch": 0.18, + "grad_norm": 0.6988289040575354, + "learning_rate": 4.13e-06, + "loss": 0.5399, + "step": 413 + }, + { + "epoch": 0.18, + "grad_norm": 0.6306882029774344, + "learning_rate": 4.14e-06, + "loss": 0.5336, + "step": 414 + }, + { + "epoch": 0.18, + "grad_norm": 0.7179639205026864, + "learning_rate": 4.15e-06, + "loss": 0.518, + "step": 415 + }, + { + "epoch": 0.18, + "grad_norm": 0.6516513551525797, + "learning_rate": 4.16e-06, + "loss": 0.5257, + "step": 416 + }, + { + "epoch": 0.18, + "grad_norm": 0.6601218324564917, + "learning_rate": 4.17e-06, + "loss": 0.5289, + "step": 417 + }, + { + "epoch": 0.18, + "grad_norm": 0.7211814488287548, + "learning_rate": 4.18e-06, + "loss": 0.5276, + "step": 418 + }, + { + "epoch": 0.18, + "grad_norm": 0.6699099078689619, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.5244, + "step": 419 + }, + { + "epoch": 0.18, + "grad_norm": 0.6916838990628401, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5181, + "step": 420 + }, + { + "epoch": 0.18, + "grad_norm": 0.6545058846876347, + "learning_rate": 4.21e-06, + "loss": 0.5475, + "step": 421 + }, + { + "epoch": 0.18, + "grad_norm": 0.7236179142807477, + "learning_rate": 4.22e-06, + "loss": 0.5049, + "step": 422 + }, + { + "epoch": 0.18, + "grad_norm": 0.6610815551752731, + "learning_rate": 4.23e-06, + "loss": 0.5463, + "step": 423 + }, + { + "epoch": 0.18, + "grad_norm": 0.6821422605987203, + "learning_rate": 4.24e-06, + "loss": 0.5307, + "step": 424 + }, + { + "epoch": 0.18, + "grad_norm": 0.6431222169441803, + "learning_rate": 4.25e-06, + "loss": 0.521, + "step": 425 + }, + { + "epoch": 0.18, + "grad_norm": 0.6952795202176767, + "learning_rate": 4.26e-06, + "loss": 0.514, + "step": 426 + }, + { + "epoch": 0.18, + "eval_loss": 0.5265827775001526, + "eval_runtime": 6909.3691, + "eval_samples_per_second": 42.028, + "eval_steps_per_second": 2.101, + "step": 426 + }, + { + "epoch": 0.18, + "grad_norm": 0.6134347482653176, + "learning_rate": 4.270000000000001e-06, + "loss": 0.5251, + "step": 427 + }, + { + "epoch": 0.18, + "grad_norm": 0.7147159318510563, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.5309, + "step": 428 + }, + { + "epoch": 0.18, + "grad_norm": 0.7026933031500548, + "learning_rate": 4.2900000000000004e-06, + "loss": 0.5336, + "step": 429 + }, + { + "epoch": 0.18, + "grad_norm": 0.7005957676093472, + "learning_rate": 4.3e-06, + "loss": 0.5518, + "step": 430 + }, + { + "epoch": 0.18, + "grad_norm": 0.697347284426043, + "learning_rate": 4.31e-06, + "loss": 0.5222, + "step": 431 + }, + { + "epoch": 0.18, + "grad_norm": 0.6618459969675489, + "learning_rate": 4.32e-06, + "loss": 0.5077, + "step": 432 + }, + { + "epoch": 0.18, + "grad_norm": 0.6436931259589193, + "learning_rate": 4.33e-06, + "loss": 0.5155, + "step": 433 + }, + { + "epoch": 0.19, + "grad_norm": 0.6589831621019552, + "learning_rate": 4.34e-06, + "loss": 0.5293, + "step": 434 + }, + { + "epoch": 0.19, + "grad_norm": 0.6487932842171147, + "learning_rate": 4.350000000000001e-06, + "loss": 0.5234, + "step": 435 + }, + { + "epoch": 0.19, + "grad_norm": 0.6388950573406754, + "learning_rate": 4.360000000000001e-06, + "loss": 0.5247, + "step": 436 + }, + { + "epoch": 0.19, + "grad_norm": 0.6484480185597394, + "learning_rate": 4.3700000000000005e-06, + "loss": 0.5493, + "step": 437 + }, + { + "epoch": 0.19, + "grad_norm": 0.6656552397893157, + "learning_rate": 4.38e-06, + "loss": 0.5122, + "step": 438 + }, + { + "epoch": 0.19, + "grad_norm": 0.6100914831240443, + "learning_rate": 4.39e-06, + "loss": 0.5223, + "step": 439 + }, + { + "epoch": 0.19, + "grad_norm": 0.6806948048082133, + "learning_rate": 4.4e-06, + "loss": 0.5366, + "step": 440 + }, + { + "epoch": 0.19, + "grad_norm": 0.6792634934940536, + "learning_rate": 4.41e-06, + "loss": 0.5371, + "step": 441 + }, + { + "epoch": 0.19, + "grad_norm": 0.6163101784317894, + "learning_rate": 4.42e-06, + "loss": 0.5001, + "step": 442 + }, + { + "epoch": 0.19, + "grad_norm": 0.6711982512846896, + "learning_rate": 4.430000000000001e-06, + "loss": 0.5431, + "step": 443 + }, + { + "epoch": 0.19, + "grad_norm": 0.6656044261856343, + "learning_rate": 4.440000000000001e-06, + "loss": 0.4961, + "step": 444 + }, + { + "epoch": 0.19, + "grad_norm": 0.6090638329534794, + "learning_rate": 4.450000000000001e-06, + "loss": 0.5124, + "step": 445 + }, + { + "epoch": 0.19, + "grad_norm": 0.6642264166031999, + "learning_rate": 4.4600000000000005e-06, + "loss": 0.5401, + "step": 446 + }, + { + "epoch": 0.19, + "grad_norm": 0.6146557964679851, + "learning_rate": 4.47e-06, + "loss": 0.521, + "step": 447 + }, + { + "epoch": 0.19, + "grad_norm": 0.6540634116911361, + "learning_rate": 4.48e-06, + "loss": 0.5472, + "step": 448 + }, + { + "epoch": 0.19, + "grad_norm": 0.6394159843807653, + "learning_rate": 4.49e-06, + "loss": 0.4984, + "step": 449 + }, + { + "epoch": 0.19, + "grad_norm": 0.6320717472264333, + "learning_rate": 4.5e-06, + "loss": 0.539, + "step": 450 + }, + { + "epoch": 0.19, + "grad_norm": 0.6335986050650263, + "learning_rate": 4.510000000000001e-06, + "loss": 0.5279, + "step": 451 + }, + { + "epoch": 0.19, + "grad_norm": 0.6238922538054539, + "learning_rate": 4.520000000000001e-06, + "loss": 0.5632, + "step": 452 + }, + { + "epoch": 0.19, + "grad_norm": 0.6628688965370182, + "learning_rate": 4.530000000000001e-06, + "loss": 0.5417, + "step": 453 + }, + { + "epoch": 0.19, + "grad_norm": 0.6559295875868244, + "learning_rate": 4.540000000000001e-06, + "loss": 0.5127, + "step": 454 + }, + { + "epoch": 0.19, + "grad_norm": 0.6429887683654244, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5116, + "step": 455 + }, + { + "epoch": 0.19, + "grad_norm": 0.6925428894756206, + "learning_rate": 4.56e-06, + "loss": 0.5264, + "step": 456 + }, + { + "epoch": 0.2, + "grad_norm": 0.6604588640436396, + "learning_rate": 4.57e-06, + "loss": 0.5284, + "step": 457 + }, + { + "epoch": 0.2, + "grad_norm": 0.6392985738009899, + "learning_rate": 4.58e-06, + "loss": 0.5188, + "step": 458 + }, + { + "epoch": 0.2, + "grad_norm": 0.6765577462503485, + "learning_rate": 4.590000000000001e-06, + "loss": 0.514, + "step": 459 + }, + { + "epoch": 0.2, + "grad_norm": 0.8495240761386383, + "learning_rate": 4.600000000000001e-06, + "loss": 0.5268, + "step": 460 + }, + { + "epoch": 0.2, + "grad_norm": 0.6836958527754253, + "learning_rate": 4.610000000000001e-06, + "loss": 0.5305, + "step": 461 + }, + { + "epoch": 0.2, + "grad_norm": 0.6502080642686838, + "learning_rate": 4.620000000000001e-06, + "loss": 0.5178, + "step": 462 + }, + { + "epoch": 0.2, + "grad_norm": 0.6342249164223178, + "learning_rate": 4.6300000000000006e-06, + "loss": 0.5217, + "step": 463 + }, + { + "epoch": 0.2, + "grad_norm": 0.6777938665629211, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.534, + "step": 464 + }, + { + "epoch": 0.2, + "grad_norm": 0.7062865741492855, + "learning_rate": 4.65e-06, + "loss": 0.5077, + "step": 465 + }, + { + "epoch": 0.2, + "grad_norm": 0.6383144907337746, + "learning_rate": 4.66e-06, + "loss": 0.5134, + "step": 466 + }, + { + "epoch": 0.2, + "grad_norm": 0.6730064655141996, + "learning_rate": 4.670000000000001e-06, + "loss": 0.552, + "step": 467 + }, + { + "epoch": 0.2, + "grad_norm": 0.6394012481326041, + "learning_rate": 4.680000000000001e-06, + "loss": 0.4908, + "step": 468 + }, + { + "epoch": 0.2, + "grad_norm": 0.7061191203371422, + "learning_rate": 4.69e-06, + "loss": 0.5354, + "step": 469 + }, + { + "epoch": 0.2, + "grad_norm": 0.6289046687203929, + "learning_rate": 4.7e-06, + "loss": 0.524, + "step": 470 + }, + { + "epoch": 0.2, + "grad_norm": 0.6880276584214321, + "learning_rate": 4.71e-06, + "loss": 0.5317, + "step": 471 + }, + { + "epoch": 0.2, + "grad_norm": 0.6626736935804997, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.5293, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 0.6704211606772497, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.5135, + "step": 473 + }, + { + "epoch": 0.2, + "grad_norm": 0.6671310077095398, + "learning_rate": 4.74e-06, + "loss": 0.5256, + "step": 474 + }, + { + "epoch": 0.2, + "grad_norm": 0.637683155549069, + "learning_rate": 4.75e-06, + "loss": 0.5113, + "step": 475 + }, + { + "epoch": 0.2, + "grad_norm": 0.7243606567809928, + "learning_rate": 4.76e-06, + "loss": 0.5222, + "step": 476 + }, + { + "epoch": 0.2, + "grad_norm": 0.6898226976279818, + "learning_rate": 4.77e-06, + "loss": 0.5434, + "step": 477 + }, + { + "epoch": 0.2, + "grad_norm": 0.7102119430718769, + "learning_rate": 4.78e-06, + "loss": 0.5584, + "step": 478 + }, + { + "epoch": 0.2, + "grad_norm": 0.6357099843026002, + "learning_rate": 4.79e-06, + "loss": 0.5377, + "step": 479 + }, + { + "epoch": 0.21, + "grad_norm": 0.7078871919110719, + "learning_rate": 4.800000000000001e-06, + "loss": 0.51, + "step": 480 + }, + { + "epoch": 0.21, + "grad_norm": 0.7004243288260213, + "learning_rate": 4.8100000000000005e-06, + "loss": 0.5098, + "step": 481 + }, + { + "epoch": 0.21, + "grad_norm": 0.7218886636410997, + "learning_rate": 4.8200000000000004e-06, + "loss": 0.5473, + "step": 482 + }, + { + "epoch": 0.21, + "grad_norm": 0.6908387236569687, + "learning_rate": 4.83e-06, + "loss": 0.5155, + "step": 483 + }, + { + "epoch": 0.21, + "grad_norm": 0.7042724565499465, + "learning_rate": 4.84e-06, + "loss": 0.5152, + "step": 484 + }, + { + "epoch": 0.21, + "grad_norm": 0.7088947897764216, + "learning_rate": 4.85e-06, + "loss": 0.5417, + "step": 485 + }, + { + "epoch": 0.21, + "grad_norm": 0.6279155418925114, + "learning_rate": 4.86e-06, + "loss": 0.5349, + "step": 486 + }, + { + "epoch": 0.21, + "grad_norm": 0.7207841117522383, + "learning_rate": 4.87e-06, + "loss": 0.5261, + "step": 487 + }, + { + "epoch": 0.21, + "grad_norm": 0.6652618430741107, + "learning_rate": 4.880000000000001e-06, + "loss": 0.5355, + "step": 488 + }, + { + "epoch": 0.21, + "grad_norm": 0.6296330144736813, + "learning_rate": 4.890000000000001e-06, + "loss": 0.5302, + "step": 489 + }, + { + "epoch": 0.21, + "grad_norm": 0.6197754222224622, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.4987, + "step": 490 + }, + { + "epoch": 0.21, + "grad_norm": 0.642766778231823, + "learning_rate": 4.9100000000000004e-06, + "loss": 0.5247, + "step": 491 + }, + { + "epoch": 0.21, + "grad_norm": 0.643792980820932, + "learning_rate": 4.92e-06, + "loss": 0.5347, + "step": 492 + }, + { + "epoch": 0.21, + "grad_norm": 0.625452787303321, + "learning_rate": 4.93e-06, + "loss": 0.499, + "step": 493 + }, + { + "epoch": 0.21, + "grad_norm": 0.5964088928926613, + "learning_rate": 4.94e-06, + "loss": 0.5407, + "step": 494 + }, + { + "epoch": 0.21, + "grad_norm": 0.6500032443873983, + "learning_rate": 4.95e-06, + "loss": 0.5464, + "step": 495 + }, + { + "epoch": 0.21, + "grad_norm": 0.6460792843444659, + "learning_rate": 4.960000000000001e-06, + "loss": 0.4949, + "step": 496 + }, + { + "epoch": 0.21, + "grad_norm": 0.7395445647719444, + "learning_rate": 4.970000000000001e-06, + "loss": 0.5291, + "step": 497 + }, + { + "epoch": 0.21, + "eval_loss": 0.5214883685112, + "eval_runtime": 6920.669, + "eval_samples_per_second": 41.96, + "eval_steps_per_second": 2.098, + "step": 497 + }, + { + "epoch": 0.21, + "grad_norm": 0.6537595312676089, + "learning_rate": 4.980000000000001e-06, + "loss": 0.5176, + "step": 498 + }, + { + "epoch": 0.21, + "grad_norm": 0.6463304730768601, + "learning_rate": 4.9900000000000005e-06, + "loss": 0.5206, + "step": 499 + }, + { + "epoch": 0.21, + "grad_norm": 0.6973321215012371, + "learning_rate": 5e-06, + "loss": 0.5404, + "step": 500 + }, + { + "epoch": 0.21, + "grad_norm": 0.6779339517089825, + "learning_rate": 4.999999293914693e-06, + "loss": 0.5073, + "step": 501 + }, + { + "epoch": 0.21, + "grad_norm": 0.6916291409636109, + "learning_rate": 4.99999717565917e-06, + "loss": 0.5277, + "step": 502 + }, + { + "epoch": 0.21, + "grad_norm": 0.6019781233779008, + "learning_rate": 4.999993645234629e-06, + "loss": 0.488, + "step": 503 + }, + { + "epoch": 0.22, + "grad_norm": 0.6767553900584968, + "learning_rate": 4.999988702643063e-06, + "loss": 0.5517, + "step": 504 + }, + { + "epoch": 0.22, + "grad_norm": 0.6399171255962077, + "learning_rate": 4.999982347887264e-06, + "loss": 0.5363, + "step": 505 + }, + { + "epoch": 0.22, + "grad_norm": 0.6957517244632938, + "learning_rate": 4.999974580970822e-06, + "loss": 0.5425, + "step": 506 + }, + { + "epoch": 0.22, + "grad_norm": 0.6303242233074258, + "learning_rate": 4.999965401898124e-06, + "loss": 0.5359, + "step": 507 + }, + { + "epoch": 0.22, + "grad_norm": 0.6857079786662916, + "learning_rate": 4.999954810674355e-06, + "loss": 0.4941, + "step": 508 + }, + { + "epoch": 0.22, + "grad_norm": 0.6875532162149117, + "learning_rate": 4.999942807305497e-06, + "loss": 0.5345, + "step": 509 + }, + { + "epoch": 0.22, + "grad_norm": 0.6275865235654622, + "learning_rate": 4.9999293917983325e-06, + "loss": 0.5099, + "step": 510 + }, + { + "epoch": 0.22, + "grad_norm": 0.7718521188057177, + "learning_rate": 4.999914564160437e-06, + "loss": 0.5181, + "step": 511 + }, + { + "epoch": 0.22, + "grad_norm": 0.6534743581262111, + "learning_rate": 4.999898324400187e-06, + "loss": 0.5126, + "step": 512 + }, + { + "epoch": 0.22, + "grad_norm": 0.674911757416486, + "learning_rate": 4.999880672526757e-06, + "loss": 0.5368, + "step": 513 + }, + { + "epoch": 0.22, + "grad_norm": 0.764101502643344, + "learning_rate": 4.999861608550116e-06, + "loss": 0.5105, + "step": 514 + }, + { + "epoch": 0.22, + "grad_norm": 0.6607294039970933, + "learning_rate": 4.999841132481035e-06, + "loss": 0.5231, + "step": 515 + }, + { + "epoch": 0.22, + "grad_norm": 0.7724967744040852, + "learning_rate": 4.999819244331078e-06, + "loss": 0.5489, + "step": 516 + }, + { + "epoch": 0.22, + "grad_norm": 0.6730774325770473, + "learning_rate": 4.99979594411261e-06, + "loss": 0.5326, + "step": 517 + }, + { + "epoch": 0.22, + "grad_norm": 0.7136063574644014, + "learning_rate": 4.999771231838792e-06, + "loss": 0.5144, + "step": 518 + }, + { + "epoch": 0.22, + "grad_norm": 0.6477743762407223, + "learning_rate": 4.999745107523583e-06, + "loss": 0.5111, + "step": 519 + }, + { + "epoch": 0.22, + "grad_norm": 0.6916139988086453, + "learning_rate": 4.999717571181742e-06, + "loss": 0.529, + "step": 520 + }, + { + "epoch": 0.22, + "grad_norm": 0.6564109960699416, + "learning_rate": 4.999688622828821e-06, + "loss": 0.5101, + "step": 521 + }, + { + "epoch": 0.22, + "grad_norm": 0.6610425029900775, + "learning_rate": 4.999658262481173e-06, + "loss": 0.5095, + "step": 522 + }, + { + "epoch": 0.22, + "grad_norm": 0.6062090231312307, + "learning_rate": 4.999626490155947e-06, + "loss": 0.5072, + "step": 523 + }, + { + "epoch": 0.22, + "grad_norm": 0.629980273760487, + "learning_rate": 4.999593305871091e-06, + "loss": 0.5382, + "step": 524 + }, + { + "epoch": 0.22, + "grad_norm": 0.6597246057010635, + "learning_rate": 4.999558709645349e-06, + "loss": 0.5307, + "step": 525 + }, + { + "epoch": 0.22, + "grad_norm": 0.636745442438904, + "learning_rate": 4.999522701498263e-06, + "loss": 0.5302, + "step": 526 + }, + { + "epoch": 0.23, + "grad_norm": 0.6324462143762214, + "learning_rate": 4.999485281450174e-06, + "loss": 0.4996, + "step": 527 + }, + { + "epoch": 0.23, + "grad_norm": 0.6069558558098991, + "learning_rate": 4.99944644952222e-06, + "loss": 0.5058, + "step": 528 + }, + { + "epoch": 0.23, + "grad_norm": 0.6006305147544736, + "learning_rate": 4.999406205736334e-06, + "loss": 0.5218, + "step": 529 + }, + { + "epoch": 0.23, + "grad_norm": 0.6354602393667919, + "learning_rate": 4.9993645501152485e-06, + "loss": 0.498, + "step": 530 + }, + { + "epoch": 0.23, + "grad_norm": 0.6299647790639523, + "learning_rate": 4.999321482682495e-06, + "loss": 0.5435, + "step": 531 + }, + { + "epoch": 0.23, + "grad_norm": 0.5702190944475204, + "learning_rate": 4.9992770034624e-06, + "loss": 0.5259, + "step": 532 + }, + { + "epoch": 0.23, + "grad_norm": 0.6629005033715545, + "learning_rate": 4.999231112480088e-06, + "loss": 0.5247, + "step": 533 + }, + { + "epoch": 0.23, + "grad_norm": 0.6161756194165398, + "learning_rate": 4.999183809761481e-06, + "loss": 0.5233, + "step": 534 + }, + { + "epoch": 0.23, + "grad_norm": 0.6143755401649461, + "learning_rate": 4.999135095333301e-06, + "loss": 0.4983, + "step": 535 + }, + { + "epoch": 0.23, + "grad_norm": 0.6374604849318725, + "learning_rate": 4.999084969223064e-06, + "loss": 0.5407, + "step": 536 + }, + { + "epoch": 0.23, + "grad_norm": 0.6489680492749198, + "learning_rate": 4.999033431459084e-06, + "loss": 0.5216, + "step": 537 + }, + { + "epoch": 0.23, + "grad_norm": 0.6128330475540421, + "learning_rate": 4.998980482070473e-06, + "loss": 0.4956, + "step": 538 + }, + { + "epoch": 0.23, + "grad_norm": 0.6084501316124987, + "learning_rate": 4.998926121087142e-06, + "loss": 0.5105, + "step": 539 + }, + { + "epoch": 0.23, + "grad_norm": 0.6718312705515297, + "learning_rate": 4.998870348539797e-06, + "loss": 0.4978, + "step": 540 + }, + { + "epoch": 0.23, + "grad_norm": 0.5917495579968527, + "learning_rate": 4.998813164459942e-06, + "loss": 0.5005, + "step": 541 + }, + { + "epoch": 0.23, + "grad_norm": 0.7002888012401992, + "learning_rate": 4.9987545688798765e-06, + "loss": 0.5158, + "step": 542 + }, + { + "epoch": 0.23, + "grad_norm": 0.6215562316702827, + "learning_rate": 4.998694561832703e-06, + "loss": 0.5063, + "step": 543 + }, + { + "epoch": 0.23, + "grad_norm": 0.6463810547164883, + "learning_rate": 4.998633143352315e-06, + "loss": 0.5216, + "step": 544 + }, + { + "epoch": 0.23, + "grad_norm": 0.6607126999162819, + "learning_rate": 4.998570313473408e-06, + "loss": 0.5176, + "step": 545 + }, + { + "epoch": 0.23, + "grad_norm": 0.6896680501018091, + "learning_rate": 4.998506072231469e-06, + "loss": 0.5142, + "step": 546 + }, + { + "epoch": 0.23, + "grad_norm": 0.64482281681983, + "learning_rate": 4.99844041966279e-06, + "loss": 0.5223, + "step": 547 + }, + { + "epoch": 0.23, + "grad_norm": 0.6804316388777557, + "learning_rate": 4.998373355804454e-06, + "loss": 0.512, + "step": 548 + }, + { + "epoch": 0.23, + "grad_norm": 0.6799252023969709, + "learning_rate": 4.998304880694342e-06, + "loss": 0.4962, + "step": 549 + }, + { + "epoch": 0.23, + "grad_norm": 0.637365085524163, + "learning_rate": 4.998234994371135e-06, + "loss": 0.5234, + "step": 550 + }, + { + "epoch": 0.24, + "grad_norm": 0.7036436945946638, + "learning_rate": 4.99816369687431e-06, + "loss": 0.5141, + "step": 551 + }, + { + "epoch": 0.24, + "grad_norm": 0.6240264386156906, + "learning_rate": 4.99809098824414e-06, + "loss": 0.5365, + "step": 552 + }, + { + "epoch": 0.24, + "grad_norm": 0.7403699319338876, + "learning_rate": 4.998016868521695e-06, + "loss": 0.5326, + "step": 553 + }, + { + "epoch": 0.24, + "grad_norm": 0.6543848290779426, + "learning_rate": 4.997941337748845e-06, + "loss": 0.5146, + "step": 554 + }, + { + "epoch": 0.24, + "grad_norm": 0.7042287590734285, + "learning_rate": 4.997864395968252e-06, + "loss": 0.5259, + "step": 555 + }, + { + "epoch": 0.24, + "grad_norm": 0.6574049565188761, + "learning_rate": 4.997786043223381e-06, + "loss": 0.5137, + "step": 556 + }, + { + "epoch": 0.24, + "grad_norm": 0.6165676577046679, + "learning_rate": 4.9977062795584895e-06, + "loss": 0.5473, + "step": 557 + }, + { + "epoch": 0.24, + "grad_norm": 0.7044566948417562, + "learning_rate": 4.997625105018634e-06, + "loss": 0.5067, + "step": 558 + }, + { + "epoch": 0.24, + "grad_norm": 0.6447891576595195, + "learning_rate": 4.9975425196496656e-06, + "loss": 0.5117, + "step": 559 + }, + { + "epoch": 0.24, + "grad_norm": 0.6083915401749171, + "learning_rate": 4.997458523498236e-06, + "loss": 0.4752, + "step": 560 + }, + { + "epoch": 0.24, + "grad_norm": 0.6990442708117952, + "learning_rate": 4.997373116611792e-06, + "loss": 0.5145, + "step": 561 + }, + { + "epoch": 0.24, + "grad_norm": 0.6792107396650091, + "learning_rate": 4.997286299038576e-06, + "loss": 0.5143, + "step": 562 + }, + { + "epoch": 0.24, + "grad_norm": 0.6274185410270217, + "learning_rate": 4.997198070827629e-06, + "loss": 0.5079, + "step": 563 + }, + { + "epoch": 0.24, + "grad_norm": 0.7317503385718371, + "learning_rate": 4.99710843202879e-06, + "loss": 0.5058, + "step": 564 + }, + { + "epoch": 0.24, + "grad_norm": 0.6602507519356526, + "learning_rate": 4.99701738269269e-06, + "loss": 0.5269, + "step": 565 + }, + { + "epoch": 0.24, + "grad_norm": 0.7158785502037818, + "learning_rate": 4.9969249228707625e-06, + "loss": 0.5449, + "step": 566 + }, + { + "epoch": 0.24, + "grad_norm": 0.6678036728294128, + "learning_rate": 4.996831052615234e-06, + "loss": 0.4996, + "step": 567 + }, + { + "epoch": 0.24, + "grad_norm": 0.6635048447906537, + "learning_rate": 4.996735771979129e-06, + "loss": 0.5117, + "step": 568 + }, + { + "epoch": 0.24, + "eval_loss": 0.517206609249115, + "eval_runtime": 6917.186, + "eval_samples_per_second": 41.981, + "eval_steps_per_second": 2.099, + "step": 568 + }, + { + "epoch": 0.24, + "grad_norm": 0.6631113641261142, + "learning_rate": 4.996639081016268e-06, + "loss": 0.5103, + "step": 569 + }, + { + "epoch": 0.24, + "grad_norm": 0.6279444558937123, + "learning_rate": 4.996540979781269e-06, + "loss": 0.5397, + "step": 570 + }, + { + "epoch": 0.24, + "grad_norm": 0.6876493931054276, + "learning_rate": 4.996441468329547e-06, + "loss": 0.521, + "step": 571 + }, + { + "epoch": 0.24, + "grad_norm": 0.6664423383196236, + "learning_rate": 4.996340546717312e-06, + "loss": 0.5449, + "step": 572 + }, + { + "epoch": 0.24, + "grad_norm": 0.6101029466977869, + "learning_rate": 4.996238215001571e-06, + "loss": 0.5443, + "step": 573 + }, + { + "epoch": 0.25, + "grad_norm": 0.6622002597747987, + "learning_rate": 4.99613447324013e-06, + "loss": 0.5245, + "step": 574 + }, + { + "epoch": 0.25, + "grad_norm": 0.60547382309911, + "learning_rate": 4.996029321491587e-06, + "loss": 0.5101, + "step": 575 + }, + { + "epoch": 0.25, + "grad_norm": 0.6685376758007866, + "learning_rate": 4.9959227598153395e-06, + "loss": 0.5226, + "step": 576 + }, + { + "epoch": 0.25, + "grad_norm": 0.6325127114162676, + "learning_rate": 4.995814788271582e-06, + "loss": 0.525, + "step": 577 + }, + { + "epoch": 0.25, + "grad_norm": 0.6462009207619525, + "learning_rate": 4.995705406921303e-06, + "loss": 0.5181, + "step": 578 + }, + { + "epoch": 0.25, + "grad_norm": 0.6514032258227322, + "learning_rate": 4.995594615826289e-06, + "loss": 0.5225, + "step": 579 + }, + { + "epoch": 0.25, + "grad_norm": 0.6013786935845273, + "learning_rate": 4.995482415049123e-06, + "loss": 0.5105, + "step": 580 + }, + { + "epoch": 0.25, + "grad_norm": 0.6722327524821109, + "learning_rate": 4.995368804653182e-06, + "loss": 0.5088, + "step": 581 + }, + { + "epoch": 0.25, + "grad_norm": 0.6486288344146888, + "learning_rate": 4.995253784702643e-06, + "loss": 0.4924, + "step": 582 + }, + { + "epoch": 0.25, + "grad_norm": 0.609052561499619, + "learning_rate": 4.995137355262475e-06, + "loss": 0.5091, + "step": 583 + }, + { + "epoch": 0.25, + "grad_norm": 0.6149869407753081, + "learning_rate": 4.995019516398447e-06, + "loss": 0.522, + "step": 584 + }, + { + "epoch": 0.25, + "grad_norm": 0.649711066334699, + "learning_rate": 4.994900268177121e-06, + "loss": 0.5067, + "step": 585 + }, + { + "epoch": 0.25, + "grad_norm": 0.6492055338072015, + "learning_rate": 4.994779610665858e-06, + "loss": 0.5531, + "step": 586 + }, + { + "epoch": 0.25, + "grad_norm": 0.6055259963194052, + "learning_rate": 4.9946575439328124e-06, + "loss": 0.4979, + "step": 587 + }, + { + "epoch": 0.25, + "grad_norm": 0.5966649511641912, + "learning_rate": 4.994534068046936e-06, + "loss": 0.511, + "step": 588 + }, + { + "epoch": 0.25, + "grad_norm": 0.6406292315887588, + "learning_rate": 4.994409183077979e-06, + "loss": 0.5423, + "step": 589 + }, + { + "epoch": 0.25, + "grad_norm": 0.6273737605531128, + "learning_rate": 4.99428288909648e-06, + "loss": 0.5, + "step": 590 + }, + { + "epoch": 0.25, + "grad_norm": 0.6106470129106254, + "learning_rate": 4.994155186173782e-06, + "loss": 0.5107, + "step": 591 + }, + { + "epoch": 0.25, + "grad_norm": 0.6047596492090179, + "learning_rate": 4.994026074382019e-06, + "loss": 0.5267, + "step": 592 + }, + { + "epoch": 0.25, + "grad_norm": 0.5986927472135537, + "learning_rate": 4.993895553794123e-06, + "loss": 0.5025, + "step": 593 + }, + { + "epoch": 0.25, + "grad_norm": 0.6429583180993461, + "learning_rate": 4.993763624483821e-06, + "loss": 0.5158, + "step": 594 + }, + { + "epoch": 0.25, + "grad_norm": 0.5881062972343085, + "learning_rate": 4.993630286525634e-06, + "loss": 0.5061, + "step": 595 + }, + { + "epoch": 0.25, + "grad_norm": 0.6368950879373987, + "learning_rate": 4.993495539994882e-06, + "loss": 0.5203, + "step": 596 + }, + { + "epoch": 0.26, + "grad_norm": 0.6716881530561206, + "learning_rate": 4.99335938496768e-06, + "loss": 0.5228, + "step": 597 + }, + { + "epoch": 0.26, + "grad_norm": 0.6023179095017582, + "learning_rate": 4.993221821520935e-06, + "loss": 0.5349, + "step": 598 + }, + { + "epoch": 0.26, + "grad_norm": 0.6118755989606954, + "learning_rate": 4.993082849732353e-06, + "loss": 0.5071, + "step": 599 + }, + { + "epoch": 0.26, + "grad_norm": 0.6160351740855943, + "learning_rate": 4.992942469680437e-06, + "loss": 0.4796, + "step": 600 + }, + { + "epoch": 0.26, + "grad_norm": 0.6291124034547457, + "learning_rate": 4.99280068144448e-06, + "loss": 0.5545, + "step": 601 + }, + { + "epoch": 0.26, + "grad_norm": 0.6534616170585308, + "learning_rate": 4.992657485104575e-06, + "loss": 0.5358, + "step": 602 + }, + { + "epoch": 0.26, + "grad_norm": 0.6264541141447443, + "learning_rate": 4.99251288074161e-06, + "loss": 0.5292, + "step": 603 + }, + { + "epoch": 0.26, + "grad_norm": 0.6329545583821921, + "learning_rate": 4.992366868437266e-06, + "loss": 0.5032, + "step": 604 + }, + { + "epoch": 0.26, + "grad_norm": 0.6262380789197445, + "learning_rate": 4.992219448274022e-06, + "loss": 0.5178, + "step": 605 + }, + { + "epoch": 0.26, + "grad_norm": 0.6003543172945967, + "learning_rate": 4.9920706203351495e-06, + "loss": 0.5037, + "step": 606 + }, + { + "epoch": 0.26, + "grad_norm": 0.6614794995436638, + "learning_rate": 4.9919203847047185e-06, + "loss": 0.5391, + "step": 607 + }, + { + "epoch": 0.26, + "grad_norm": 0.6276911443763706, + "learning_rate": 4.99176874146759e-06, + "loss": 0.4837, + "step": 608 + }, + { + "epoch": 0.26, + "grad_norm": 0.6140831194461105, + "learning_rate": 4.9916156907094246e-06, + "loss": 0.4944, + "step": 609 + }, + { + "epoch": 0.26, + "grad_norm": 0.6520299553825838, + "learning_rate": 4.991461232516675e-06, + "loss": 0.4984, + "step": 610 + }, + { + "epoch": 0.26, + "grad_norm": 0.6785930087303712, + "learning_rate": 4.99130536697659e-06, + "loss": 0.5289, + "step": 611 + }, + { + "epoch": 0.26, + "grad_norm": 0.6751246335138484, + "learning_rate": 4.991148094177212e-06, + "loss": 0.517, + "step": 612 + }, + { + "epoch": 0.26, + "grad_norm": 0.7063631237307217, + "learning_rate": 4.990989414207381e-06, + "loss": 0.4747, + "step": 613 + }, + { + "epoch": 0.26, + "grad_norm": 0.633915838850146, + "learning_rate": 4.990829327156729e-06, + "loss": 0.5182, + "step": 614 + }, + { + "epoch": 0.26, + "grad_norm": 0.6346239919506149, + "learning_rate": 4.990667833115684e-06, + "loss": 0.4991, + "step": 615 + }, + { + "epoch": 0.26, + "grad_norm": 0.6922170062238953, + "learning_rate": 4.990504932175471e-06, + "loss": 0.4997, + "step": 616 + }, + { + "epoch": 0.26, + "grad_norm": 0.6497250780582481, + "learning_rate": 4.990340624428105e-06, + "loss": 0.5025, + "step": 617 + }, + { + "epoch": 0.26, + "grad_norm": 0.6480587404624075, + "learning_rate": 4.990174909966399e-06, + "loss": 0.5164, + "step": 618 + }, + { + "epoch": 0.26, + "grad_norm": 0.5647887869895122, + "learning_rate": 4.9900077888839606e-06, + "loss": 0.4974, + "step": 619 + }, + { + "epoch": 0.26, + "grad_norm": 0.6157405333654503, + "learning_rate": 4.989839261275191e-06, + "loss": 0.5119, + "step": 620 + }, + { + "epoch": 0.27, + "grad_norm": 0.6350389596733836, + "learning_rate": 4.989669327235285e-06, + "loss": 0.5252, + "step": 621 + }, + { + "epoch": 0.27, + "grad_norm": 0.584442178782563, + "learning_rate": 4.989497986860234e-06, + "loss": 0.5043, + "step": 622 + }, + { + "epoch": 0.27, + "grad_norm": 0.6196796070950197, + "learning_rate": 4.989325240246823e-06, + "loss": 0.5176, + "step": 623 + }, + { + "epoch": 0.27, + "grad_norm": 0.6015855990105409, + "learning_rate": 4.98915108749263e-06, + "loss": 0.5065, + "step": 624 + }, + { + "epoch": 0.27, + "grad_norm": 0.5944351660408316, + "learning_rate": 4.988975528696028e-06, + "loss": 0.5027, + "step": 625 + }, + { + "epoch": 0.27, + "grad_norm": 0.5799955435183154, + "learning_rate": 4.988798563956186e-06, + "loss": 0.4897, + "step": 626 + }, + { + "epoch": 0.27, + "grad_norm": 0.6252189457639663, + "learning_rate": 4.988620193373066e-06, + "loss": 0.5185, + "step": 627 + }, + { + "epoch": 0.27, + "grad_norm": 0.6091789072393431, + "learning_rate": 4.988440417047424e-06, + "loss": 0.5364, + "step": 628 + }, + { + "epoch": 0.27, + "grad_norm": 0.5991774516054336, + "learning_rate": 4.988259235080807e-06, + "loss": 0.5218, + "step": 629 + }, + { + "epoch": 0.27, + "grad_norm": 0.6004079133450607, + "learning_rate": 4.988076647575562e-06, + "loss": 0.5346, + "step": 630 + }, + { + "epoch": 0.27, + "grad_norm": 0.6055525535102732, + "learning_rate": 4.987892654634825e-06, + "loss": 0.4794, + "step": 631 + }, + { + "epoch": 0.27, + "grad_norm": 0.6231744592672184, + "learning_rate": 4.987707256362529e-06, + "loss": 0.5332, + "step": 632 + }, + { + "epoch": 0.27, + "grad_norm": 0.5943003024735096, + "learning_rate": 4.9875204528633995e-06, + "loss": 0.4958, + "step": 633 + }, + { + "epoch": 0.27, + "grad_norm": 0.6429217088606017, + "learning_rate": 4.987332244242955e-06, + "loss": 0.5049, + "step": 634 + }, + { + "epoch": 0.27, + "grad_norm": 0.6474023001802436, + "learning_rate": 4.98714263060751e-06, + "loss": 0.4769, + "step": 635 + }, + { + "epoch": 0.27, + "grad_norm": 0.6404945389649939, + "learning_rate": 4.9869516120641705e-06, + "loss": 0.4947, + "step": 636 + }, + { + "epoch": 0.27, + "grad_norm": 0.6640610934148168, + "learning_rate": 4.986759188720836e-06, + "loss": 0.5258, + "step": 637 + }, + { + "epoch": 0.27, + "grad_norm": 0.6232010288109409, + "learning_rate": 4.986565360686201e-06, + "loss": 0.5189, + "step": 638 + }, + { + "epoch": 0.27, + "grad_norm": 0.6143458681147979, + "learning_rate": 4.9863701280697535e-06, + "loss": 0.4986, + "step": 639 + }, + { + "epoch": 0.27, + "eval_loss": 0.5126649141311646, + "eval_runtime": 6910.7711, + "eval_samples_per_second": 42.02, + "eval_steps_per_second": 2.101, + "step": 639 + }, + { + "epoch": 0.27, + "grad_norm": 0.6231312288980349, + "learning_rate": 4.986173490981773e-06, + "loss": 0.5218, + "step": 640 + }, + { + "epoch": 0.27, + "grad_norm": 0.6521330675888304, + "learning_rate": 4.985975449533335e-06, + "loss": 0.4984, + "step": 641 + }, + { + "epoch": 0.27, + "grad_norm": 0.6192181654765082, + "learning_rate": 4.9857760038363045e-06, + "loss": 0.5257, + "step": 642 + }, + { + "epoch": 0.27, + "grad_norm": 0.6486948358491045, + "learning_rate": 4.9855751540033446e-06, + "loss": 0.5147, + "step": 643 + }, + { + "epoch": 0.28, + "grad_norm": 0.5969342371638264, + "learning_rate": 4.985372900147907e-06, + "loss": 0.5185, + "step": 644 + }, + { + "epoch": 0.28, + "grad_norm": 0.6320704370838177, + "learning_rate": 4.9851692423842406e-06, + "loss": 0.5383, + "step": 645 + }, + { + "epoch": 0.28, + "grad_norm": 0.5966997098167783, + "learning_rate": 4.984964180827383e-06, + "loss": 0.5044, + "step": 646 + }, + { + "epoch": 0.28, + "grad_norm": 0.6109011858617819, + "learning_rate": 4.984757715593168e-06, + "loss": 0.5091, + "step": 647 + }, + { + "epoch": 0.28, + "grad_norm": 0.653805713393856, + "learning_rate": 4.984549846798221e-06, + "loss": 0.5161, + "step": 648 + }, + { + "epoch": 0.28, + "grad_norm": 0.6199215842335395, + "learning_rate": 4.984340574559961e-06, + "loss": 0.5152, + "step": 649 + }, + { + "epoch": 0.28, + "grad_norm": 0.6068098393695217, + "learning_rate": 4.984129898996599e-06, + "loss": 0.5209, + "step": 650 + }, + { + "epoch": 0.28, + "grad_norm": 0.5796411094349677, + "learning_rate": 4.9839178202271375e-06, + "loss": 0.502, + "step": 651 + }, + { + "epoch": 0.28, + "grad_norm": 0.6053468178000233, + "learning_rate": 4.983704338371375e-06, + "loss": 0.5283, + "step": 652 + }, + { + "epoch": 0.28, + "grad_norm": 0.6156634669849913, + "learning_rate": 4.983489453549901e-06, + "loss": 0.4947, + "step": 653 + }, + { + "epoch": 0.28, + "grad_norm": 0.6241545798810763, + "learning_rate": 4.983273165884096e-06, + "loss": 0.5176, + "step": 654 + }, + { + "epoch": 0.28, + "grad_norm": 0.5895664409761231, + "learning_rate": 4.983055475496134e-06, + "loss": 0.5105, + "step": 655 + }, + { + "epoch": 0.28, + "grad_norm": 0.5935426249245771, + "learning_rate": 4.982836382508981e-06, + "loss": 0.5, + "step": 656 + }, + { + "epoch": 0.28, + "grad_norm": 0.6233645788446041, + "learning_rate": 4.9826158870463955e-06, + "loss": 0.5099, + "step": 657 + }, + { + "epoch": 0.28, + "grad_norm": 0.5991332439052809, + "learning_rate": 4.982393989232931e-06, + "loss": 0.4924, + "step": 658 + }, + { + "epoch": 0.28, + "grad_norm": 0.5880708311480832, + "learning_rate": 4.982170689193927e-06, + "loss": 0.4978, + "step": 659 + }, + { + "epoch": 0.28, + "grad_norm": 0.627672707017739, + "learning_rate": 4.981945987055521e-06, + "loss": 0.5077, + "step": 660 + }, + { + "epoch": 0.28, + "grad_norm": 0.6040524425886902, + "learning_rate": 4.981719882944639e-06, + "loss": 0.493, + "step": 661 + }, + { + "epoch": 0.28, + "grad_norm": 0.5875699138537736, + "learning_rate": 4.981492376989001e-06, + "loss": 0.506, + "step": 662 + }, + { + "epoch": 0.28, + "grad_norm": 0.6071888999132754, + "learning_rate": 4.981263469317116e-06, + "loss": 0.4954, + "step": 663 + }, + { + "epoch": 0.28, + "grad_norm": 0.6343036259474204, + "learning_rate": 4.981033160058289e-06, + "loss": 0.5302, + "step": 664 + }, + { + "epoch": 0.28, + "grad_norm": 0.6270477353053345, + "learning_rate": 4.9808014493426124e-06, + "loss": 0.4676, + "step": 665 + }, + { + "epoch": 0.28, + "grad_norm": 0.5957359905347063, + "learning_rate": 4.9805683373009746e-06, + "loss": 0.5108, + "step": 666 + }, + { + "epoch": 0.28, + "grad_norm": 0.6267707786737462, + "learning_rate": 4.980333824065051e-06, + "loss": 0.5293, + "step": 667 + }, + { + "epoch": 0.29, + "grad_norm": 0.577921156213165, + "learning_rate": 4.980097909767311e-06, + "loss": 0.5115, + "step": 668 + }, + { + "epoch": 0.29, + "grad_norm": 0.6306888663442499, + "learning_rate": 4.9798605945410156e-06, + "loss": 0.5123, + "step": 669 + }, + { + "epoch": 0.29, + "grad_norm": 0.5963346695333823, + "learning_rate": 4.979621878520217e-06, + "loss": 0.5026, + "step": 670 + }, + { + "epoch": 0.29, + "grad_norm": 0.590690163604036, + "learning_rate": 4.979381761839757e-06, + "loss": 0.503, + "step": 671 + }, + { + "epoch": 0.29, + "grad_norm": 0.6410334373270873, + "learning_rate": 4.979140244635271e-06, + "loss": 0.5116, + "step": 672 + }, + { + "epoch": 0.29, + "grad_norm": 0.6105251339526676, + "learning_rate": 4.978897327043185e-06, + "loss": 0.503, + "step": 673 + }, + { + "epoch": 0.29, + "grad_norm": 0.611118980970183, + "learning_rate": 4.978653009200713e-06, + "loss": 0.5005, + "step": 674 + }, + { + "epoch": 0.29, + "grad_norm": 0.61089260561556, + "learning_rate": 4.978407291245866e-06, + "loss": 0.4745, + "step": 675 + }, + { + "epoch": 0.29, + "grad_norm": 0.6296501736035641, + "learning_rate": 4.978160173317439e-06, + "loss": 0.5413, + "step": 676 + }, + { + "epoch": 0.29, + "grad_norm": 0.5754436830327142, + "learning_rate": 4.977911655555022e-06, + "loss": 0.5258, + "step": 677 + }, + { + "epoch": 0.29, + "grad_norm": 0.6080486185142544, + "learning_rate": 4.977661738098996e-06, + "loss": 0.5022, + "step": 678 + }, + { + "epoch": 0.29, + "grad_norm": 0.6095469549060126, + "learning_rate": 4.97741042109053e-06, + "loss": 0.5236, + "step": 679 + }, + { + "epoch": 0.29, + "grad_norm": 0.6246514201425122, + "learning_rate": 4.977157704671585e-06, + "loss": 0.4948, + "step": 680 + }, + { + "epoch": 0.29, + "grad_norm": 0.6043169735120094, + "learning_rate": 4.976903588984913e-06, + "loss": 0.5174, + "step": 681 + }, + { + "epoch": 0.29, + "grad_norm": 0.5782841931005864, + "learning_rate": 4.976648074174056e-06, + "loss": 0.4935, + "step": 682 + }, + { + "epoch": 0.29, + "grad_norm": 0.6172869080905491, + "learning_rate": 4.976391160383347e-06, + "loss": 0.5068, + "step": 683 + }, + { + "epoch": 0.29, + "grad_norm": 0.5986800192103562, + "learning_rate": 4.976132847757906e-06, + "loss": 0.5293, + "step": 684 + }, + { + "epoch": 0.29, + "grad_norm": 0.5997059239072083, + "learning_rate": 4.975873136443649e-06, + "loss": 0.5311, + "step": 685 + }, + { + "epoch": 0.29, + "grad_norm": 0.586987259487267, + "learning_rate": 4.9756120265872755e-06, + "loss": 0.513, + "step": 686 + }, + { + "epoch": 0.29, + "grad_norm": 0.6278820584072181, + "learning_rate": 4.97534951833628e-06, + "loss": 0.5278, + "step": 687 + }, + { + "epoch": 0.29, + "grad_norm": 0.5792346089546118, + "learning_rate": 4.975085611838944e-06, + "loss": 0.5048, + "step": 688 + }, + { + "epoch": 0.29, + "grad_norm": 0.5926608888246347, + "learning_rate": 4.974820307244341e-06, + "loss": 0.4979, + "step": 689 + }, + { + "epoch": 0.29, + "grad_norm": 0.5905921659974432, + "learning_rate": 4.974553604702332e-06, + "loss": 0.4977, + "step": 690 + }, + { + "epoch": 0.3, + "grad_norm": 0.6604840701536976, + "learning_rate": 4.974285504363569e-06, + "loss": 0.504, + "step": 691 + }, + { + "epoch": 0.3, + "grad_norm": 0.6243772158616288, + "learning_rate": 4.974016006379495e-06, + "loss": 0.5288, + "step": 692 + }, + { + "epoch": 0.3, + "grad_norm": 0.6240076780093031, + "learning_rate": 4.973745110902339e-06, + "loss": 0.5189, + "step": 693 + }, + { + "epoch": 0.3, + "grad_norm": 0.6107829102505847, + "learning_rate": 4.973472818085122e-06, + "loss": 0.5359, + "step": 694 + }, + { + "epoch": 0.3, + "grad_norm": 0.6345275208943483, + "learning_rate": 4.9731991280816534e-06, + "loss": 0.5096, + "step": 695 + }, + { + "epoch": 0.3, + "grad_norm": 0.6136833545759933, + "learning_rate": 4.9729240410465315e-06, + "loss": 0.5048, + "step": 696 + }, + { + "epoch": 0.3, + "grad_norm": 0.6405628475705748, + "learning_rate": 4.972647557135146e-06, + "loss": 0.5234, + "step": 697 + }, + { + "epoch": 0.3, + "grad_norm": 0.5807229777227344, + "learning_rate": 4.972369676503672e-06, + "loss": 0.516, + "step": 698 + }, + { + "epoch": 0.3, + "grad_norm": 0.6808588571301922, + "learning_rate": 4.972090399309075e-06, + "loss": 0.4935, + "step": 699 + }, + { + "epoch": 0.3, + "grad_norm": 0.598882478841374, + "learning_rate": 4.971809725709112e-06, + "loss": 0.5167, + "step": 700 + }, + { + "epoch": 0.3, + "grad_norm": 0.6597647873322787, + "learning_rate": 4.971527655862325e-06, + "loss": 0.4986, + "step": 701 + }, + { + "epoch": 0.3, + "grad_norm": 0.6205471007307612, + "learning_rate": 4.9712441899280475e-06, + "loss": 0.481, + "step": 702 + }, + { + "epoch": 0.3, + "grad_norm": 0.6501750674785319, + "learning_rate": 4.970959328066399e-06, + "loss": 0.5141, + "step": 703 + }, + { + "epoch": 0.3, + "grad_norm": 0.6389770637630756, + "learning_rate": 4.97067307043829e-06, + "loss": 0.5045, + "step": 704 + }, + { + "epoch": 0.3, + "grad_norm": 0.6071448493335176, + "learning_rate": 4.970385417205418e-06, + "loss": 0.5231, + "step": 705 + }, + { + "epoch": 0.3, + "grad_norm": 0.6307400799413926, + "learning_rate": 4.9700963685302685e-06, + "loss": 0.5132, + "step": 706 + }, + { + "epoch": 0.3, + "grad_norm": 0.6345555266447565, + "learning_rate": 4.969805924576116e-06, + "loss": 0.5165, + "step": 707 + }, + { + "epoch": 0.3, + "grad_norm": 0.5992064057317342, + "learning_rate": 4.969514085507025e-06, + "loss": 0.4933, + "step": 708 + }, + { + "epoch": 0.3, + "grad_norm": 0.6327334322592149, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.4959, + "step": 709 + }, + { + "epoch": 0.3, + "grad_norm": 0.6250597389392478, + "learning_rate": 4.968926222684213e-06, + "loss": 0.5097, + "step": 710 + }, + { + "epoch": 0.3, + "eval_loss": 0.5087815523147583, + "eval_runtime": 6910.3283, + "eval_samples_per_second": 42.022, + "eval_steps_per_second": 2.101, + "step": 710 + }, + { + "epoch": 0.3, + "grad_norm": 0.6069692793366778, + "learning_rate": 4.9686301992625575e-06, + "loss": 0.4892, + "step": 711 + }, + { + "epoch": 0.3, + "grad_norm": 0.6516138876780239, + "learning_rate": 4.968332781390092e-06, + "loss": 0.5303, + "step": 712 + }, + { + "epoch": 0.3, + "grad_norm": 0.6143846915255134, + "learning_rate": 4.968033969234818e-06, + "loss": 0.4919, + "step": 713 + }, + { + "epoch": 0.31, + "grad_norm": 0.5981365954582023, + "learning_rate": 4.967733762965526e-06, + "loss": 0.5053, + "step": 714 + }, + { + "epoch": 0.31, + "grad_norm": 0.6237699240689051, + "learning_rate": 4.967432162751792e-06, + "loss": 0.4727, + "step": 715 + }, + { + "epoch": 0.31, + "grad_norm": 0.666141553803713, + "learning_rate": 4.967129168763981e-06, + "loss": 0.4991, + "step": 716 + }, + { + "epoch": 0.31, + "grad_norm": 0.646564036075848, + "learning_rate": 4.966824781173245e-06, + "loss": 0.5251, + "step": 717 + }, + { + "epoch": 0.31, + "grad_norm": 0.6401865563361007, + "learning_rate": 4.966519000151522e-06, + "loss": 0.4882, + "step": 718 + }, + { + "epoch": 0.31, + "grad_norm": 0.6532940949381087, + "learning_rate": 4.966211825871538e-06, + "loss": 0.4859, + "step": 719 + }, + { + "epoch": 0.31, + "grad_norm": 0.6240667658033411, + "learning_rate": 4.965903258506806e-06, + "loss": 0.5165, + "step": 720 + }, + { + "epoch": 0.31, + "grad_norm": 0.6050182788187826, + "learning_rate": 4.965593298231627e-06, + "loss": 0.4972, + "step": 721 + }, + { + "epoch": 0.31, + "grad_norm": 0.6394796781675777, + "learning_rate": 4.965281945221086e-06, + "loss": 0.4863, + "step": 722 + }, + { + "epoch": 0.31, + "grad_norm": 0.5879647113680488, + "learning_rate": 4.964969199651059e-06, + "loss": 0.5268, + "step": 723 + }, + { + "epoch": 0.31, + "grad_norm": 0.6719126547265977, + "learning_rate": 4.964655061698204e-06, + "loss": 0.5094, + "step": 724 + }, + { + "epoch": 0.31, + "grad_norm": 0.6247081850307215, + "learning_rate": 4.964339531539967e-06, + "loss": 0.5042, + "step": 725 + }, + { + "epoch": 0.31, + "grad_norm": 0.6448631581908335, + "learning_rate": 4.964022609354583e-06, + "loss": 0.548, + "step": 726 + }, + { + "epoch": 0.31, + "grad_norm": 0.6155726808462035, + "learning_rate": 4.963704295321069e-06, + "loss": 0.5079, + "step": 727 + }, + { + "epoch": 0.31, + "grad_norm": 0.5856540757636703, + "learning_rate": 4.963384589619233e-06, + "loss": 0.5344, + "step": 728 + }, + { + "epoch": 0.31, + "grad_norm": 0.6160053187700611, + "learning_rate": 4.963063492429665e-06, + "loss": 0.5174, + "step": 729 + }, + { + "epoch": 0.31, + "grad_norm": 0.6092787845104115, + "learning_rate": 4.9627410039337426e-06, + "loss": 0.5069, + "step": 730 + }, + { + "epoch": 0.31, + "grad_norm": 0.572582103354353, + "learning_rate": 4.96241712431363e-06, + "loss": 0.4989, + "step": 731 + }, + { + "epoch": 0.31, + "grad_norm": 0.5711072343879144, + "learning_rate": 4.962091853752276e-06, + "loss": 0.5148, + "step": 732 + }, + { + "epoch": 0.31, + "grad_norm": 0.6061761244724242, + "learning_rate": 4.961765192433415e-06, + "loss": 0.5287, + "step": 733 + }, + { + "epoch": 0.31, + "grad_norm": 0.6033994961802631, + "learning_rate": 4.961437140541569e-06, + "loss": 0.4927, + "step": 734 + }, + { + "epoch": 0.31, + "grad_norm": 0.6209778806623933, + "learning_rate": 4.9611076982620445e-06, + "loss": 0.5358, + "step": 735 + }, + { + "epoch": 0.31, + "grad_norm": 0.6501849583238293, + "learning_rate": 4.960776865780931e-06, + "loss": 0.5183, + "step": 736 + }, + { + "epoch": 0.31, + "grad_norm": 0.5740429036937207, + "learning_rate": 4.9604446432851064e-06, + "loss": 0.481, + "step": 737 + }, + { + "epoch": 0.32, + "grad_norm": 0.5986038003667088, + "learning_rate": 4.960111030962232e-06, + "loss": 0.5035, + "step": 738 + }, + { + "epoch": 0.32, + "grad_norm": 0.6269593371981165, + "learning_rate": 4.959776029000756e-06, + "loss": 0.5019, + "step": 739 + }, + { + "epoch": 0.32, + "grad_norm": 0.586820399261423, + "learning_rate": 4.959439637589909e-06, + "loss": 0.4907, + "step": 740 + }, + { + "epoch": 0.32, + "grad_norm": 0.591243683238131, + "learning_rate": 4.959101856919709e-06, + "loss": 0.5128, + "step": 741 + }, + { + "epoch": 0.32, + "grad_norm": 0.6043813840833816, + "learning_rate": 4.9587626871809564e-06, + "loss": 0.5254, + "step": 742 + }, + { + "epoch": 0.32, + "grad_norm": 0.6088008476429794, + "learning_rate": 4.958422128565238e-06, + "loss": 0.5099, + "step": 743 + }, + { + "epoch": 0.32, + "grad_norm": 0.5919159624960704, + "learning_rate": 4.958080181264926e-06, + "loss": 0.4813, + "step": 744 + }, + { + "epoch": 0.32, + "grad_norm": 0.5767332938769787, + "learning_rate": 4.957736845473173e-06, + "loss": 0.5118, + "step": 745 + }, + { + "epoch": 0.32, + "grad_norm": 0.6115434997604602, + "learning_rate": 4.957392121383919e-06, + "loss": 0.5178, + "step": 746 + }, + { + "epoch": 0.32, + "grad_norm": 0.606547936666955, + "learning_rate": 4.957046009191889e-06, + "loss": 0.5125, + "step": 747 + }, + { + "epoch": 0.32, + "grad_norm": 0.6281682365690769, + "learning_rate": 4.956698509092591e-06, + "loss": 0.5302, + "step": 748 + }, + { + "epoch": 0.32, + "grad_norm": 0.6041160141331913, + "learning_rate": 4.956349621282315e-06, + "loss": 0.4981, + "step": 749 + }, + { + "epoch": 0.32, + "grad_norm": 0.6082725831189176, + "learning_rate": 4.9559993459581375e-06, + "loss": 0.5022, + "step": 750 + }, + { + "epoch": 0.32, + "grad_norm": 0.6308489518539907, + "learning_rate": 4.9556476833179185e-06, + "loss": 0.5145, + "step": 751 + }, + { + "epoch": 0.32, + "grad_norm": 0.6122107491320039, + "learning_rate": 4.9552946335603006e-06, + "loss": 0.4803, + "step": 752 + }, + { + "epoch": 0.32, + "grad_norm": 0.6247807256313168, + "learning_rate": 4.95494019688471e-06, + "loss": 0.5033, + "step": 753 + }, + { + "epoch": 0.32, + "grad_norm": 0.6133850387446359, + "learning_rate": 4.954584373491357e-06, + "loss": 0.5094, + "step": 754 + }, + { + "epoch": 0.32, + "grad_norm": 0.6610062403122814, + "learning_rate": 4.954227163581234e-06, + "loss": 0.5246, + "step": 755 + }, + { + "epoch": 0.32, + "grad_norm": 0.5719697098479033, + "learning_rate": 4.953868567356121e-06, + "loss": 0.4986, + "step": 756 + }, + { + "epoch": 0.32, + "grad_norm": 0.611277655866564, + "learning_rate": 4.953508585018573e-06, + "loss": 0.5084, + "step": 757 + }, + { + "epoch": 0.32, + "grad_norm": 0.570613285492476, + "learning_rate": 4.953147216771935e-06, + "loss": 0.5154, + "step": 758 + }, + { + "epoch": 0.32, + "grad_norm": 0.6081820557545713, + "learning_rate": 4.952784462820333e-06, + "loss": 0.5217, + "step": 759 + }, + { + "epoch": 0.32, + "grad_norm": 0.6233401854792252, + "learning_rate": 4.952420323368673e-06, + "loss": 0.4905, + "step": 760 + }, + { + "epoch": 0.33, + "grad_norm": 0.5903715996857434, + "learning_rate": 4.952054798622649e-06, + "loss": 0.4612, + "step": 761 + }, + { + "epoch": 0.33, + "grad_norm": 0.5976026591907524, + "learning_rate": 4.951687888788731e-06, + "loss": 0.5114, + "step": 762 + }, + { + "epoch": 0.33, + "grad_norm": 0.6424120319126979, + "learning_rate": 4.9513195940741764e-06, + "loss": 0.4839, + "step": 763 + }, + { + "epoch": 0.33, + "grad_norm": 0.6168679097191889, + "learning_rate": 4.950949914687024e-06, + "loss": 0.5134, + "step": 764 + }, + { + "epoch": 0.33, + "grad_norm": 0.6400860199848029, + "learning_rate": 4.950578850836092e-06, + "loss": 0.5087, + "step": 765 + }, + { + "epoch": 0.33, + "grad_norm": 0.6011473106721757, + "learning_rate": 4.950206402730984e-06, + "loss": 0.526, + "step": 766 + }, + { + "epoch": 0.33, + "grad_norm": 0.6076752750689335, + "learning_rate": 4.949832570582083e-06, + "loss": 0.5243, + "step": 767 + }, + { + "epoch": 0.33, + "grad_norm": 0.611589146611987, + "learning_rate": 4.949457354600556e-06, + "loss": 0.5602, + "step": 768 + }, + { + "epoch": 0.33, + "grad_norm": 0.6379045005420435, + "learning_rate": 4.94908075499835e-06, + "loss": 0.485, + "step": 769 + }, + { + "epoch": 0.33, + "grad_norm": 0.6111544996406524, + "learning_rate": 4.948702771988195e-06, + "loss": 0.4961, + "step": 770 + }, + { + "epoch": 0.33, + "grad_norm": 0.6440163967432894, + "learning_rate": 4.9483234057836e-06, + "loss": 0.5247, + "step": 771 + }, + { + "epoch": 0.33, + "grad_norm": 0.6424136239524529, + "learning_rate": 4.9479426565988585e-06, + "loss": 0.5095, + "step": 772 + }, + { + "epoch": 0.33, + "grad_norm": 0.5845387618126129, + "learning_rate": 4.947560524649043e-06, + "loss": 0.4667, + "step": 773 + }, + { + "epoch": 0.33, + "grad_norm": 0.5581508613420673, + "learning_rate": 4.947177010150007e-06, + "loss": 0.4915, + "step": 774 + }, + { + "epoch": 0.33, + "grad_norm": 0.6002089992364806, + "learning_rate": 4.9467921133183864e-06, + "loss": 0.4957, + "step": 775 + }, + { + "epoch": 0.33, + "grad_norm": 0.571455007692429, + "learning_rate": 4.946405834371598e-06, + "loss": 0.4998, + "step": 776 + }, + { + "epoch": 0.33, + "grad_norm": 0.5996615965227683, + "learning_rate": 4.9460181735278365e-06, + "loss": 0.5058, + "step": 777 + }, + { + "epoch": 0.33, + "grad_norm": 0.5933857632393196, + "learning_rate": 4.945629131006081e-06, + "loss": 0.5165, + "step": 778 + }, + { + "epoch": 0.33, + "grad_norm": 0.5922936609778912, + "learning_rate": 4.945238707026087e-06, + "loss": 0.497, + "step": 779 + }, + { + "epoch": 0.33, + "grad_norm": 0.6468383832092212, + "learning_rate": 4.944846901808397e-06, + "loss": 0.4988, + "step": 780 + }, + { + "epoch": 0.33, + "grad_norm": 0.5963282275866615, + "learning_rate": 4.9444537155743245e-06, + "loss": 0.5146, + "step": 781 + }, + { + "epoch": 0.33, + "eval_loss": 0.5058240294456482, + "eval_runtime": 6911.9149, + "eval_samples_per_second": 42.013, + "eval_steps_per_second": 2.101, + "step": 781 + }, + { + "epoch": 0.33, + "grad_norm": 0.616517236291001, + "learning_rate": 4.944059148545971e-06, + "loss": 0.5123, + "step": 782 + }, + { + "epoch": 0.33, + "grad_norm": 0.6002931009806305, + "learning_rate": 4.943663200946213e-06, + "loss": 0.501, + "step": 783 + }, + { + "epoch": 0.33, + "grad_norm": 0.5875304361967715, + "learning_rate": 4.94326587299871e-06, + "loss": 0.4847, + "step": 784 + }, + { + "epoch": 0.34, + "grad_norm": 0.5836573087771422, + "learning_rate": 4.942867164927899e-06, + "loss": 0.4998, + "step": 785 + }, + { + "epoch": 0.34, + "grad_norm": 0.5783319062666641, + "learning_rate": 4.942467076958999e-06, + "loss": 0.4989, + "step": 786 + }, + { + "epoch": 0.34, + "grad_norm": 0.6295939909126551, + "learning_rate": 4.9420656093180056e-06, + "loss": 0.4886, + "step": 787 + }, + { + "epoch": 0.34, + "grad_norm": 0.5808772327348201, + "learning_rate": 4.941662762231695e-06, + "loss": 0.5036, + "step": 788 + }, + { + "epoch": 0.34, + "grad_norm": 0.5757656711915483, + "learning_rate": 4.9412585359276235e-06, + "loss": 0.4892, + "step": 789 + }, + { + "epoch": 0.34, + "grad_norm": 0.6156971766935416, + "learning_rate": 4.940852930634126e-06, + "loss": 0.5189, + "step": 790 + }, + { + "epoch": 0.34, + "grad_norm": 0.5964903100014775, + "learning_rate": 4.940445946580315e-06, + "loss": 0.4893, + "step": 791 + }, + { + "epoch": 0.34, + "grad_norm": 0.5930502537178118, + "learning_rate": 4.9400375839960826e-06, + "loss": 0.4854, + "step": 792 + }, + { + "epoch": 0.34, + "grad_norm": 0.6216632090251537, + "learning_rate": 4.939627843112102e-06, + "loss": 0.52, + "step": 793 + }, + { + "epoch": 0.34, + "grad_norm": 0.5981307705816444, + "learning_rate": 4.939216724159821e-06, + "loss": 0.4924, + "step": 794 + }, + { + "epoch": 0.34, + "grad_norm": 0.6171104301125759, + "learning_rate": 4.938804227371467e-06, + "loss": 0.4949, + "step": 795 + }, + { + "epoch": 0.34, + "grad_norm": 0.6232653150523578, + "learning_rate": 4.938390352980049e-06, + "loss": 0.5276, + "step": 796 + }, + { + "epoch": 0.34, + "grad_norm": 0.6319806108460523, + "learning_rate": 4.93797510121935e-06, + "loss": 0.4867, + "step": 797 + }, + { + "epoch": 0.34, + "grad_norm": 0.6373229220396098, + "learning_rate": 4.937558472323932e-06, + "loss": 0.5175, + "step": 798 + }, + { + "epoch": 0.34, + "grad_norm": 0.6381263394133697, + "learning_rate": 4.937140466529135e-06, + "loss": 0.5212, + "step": 799 + }, + { + "epoch": 0.34, + "grad_norm": 0.6889087852800232, + "learning_rate": 4.936721084071079e-06, + "loss": 0.5068, + "step": 800 + }, + { + "epoch": 0.34, + "grad_norm": 0.6217485975923667, + "learning_rate": 4.936300325186659e-06, + "loss": 0.4926, + "step": 801 + }, + { + "epoch": 0.34, + "grad_norm": 0.571769849631971, + "learning_rate": 4.9358781901135485e-06, + "loss": 0.509, + "step": 802 + }, + { + "epoch": 0.34, + "grad_norm": 0.6780444728173729, + "learning_rate": 4.935454679090197e-06, + "loss": 0.5165, + "step": 803 + }, + { + "epoch": 0.34, + "grad_norm": 0.6076278839686122, + "learning_rate": 4.935029792355834e-06, + "loss": 0.5039, + "step": 804 + }, + { + "epoch": 0.34, + "grad_norm": 0.6538852692558306, + "learning_rate": 4.9346035301504644e-06, + "loss": 0.4962, + "step": 805 + }, + { + "epoch": 0.34, + "grad_norm": 0.6171376333190426, + "learning_rate": 4.934175892714869e-06, + "loss": 0.5112, + "step": 806 + }, + { + "epoch": 0.34, + "grad_norm": 0.6558521386937952, + "learning_rate": 4.933746880290607e-06, + "loss": 0.4807, + "step": 807 + }, + { + "epoch": 0.35, + "grad_norm": 0.5755989068902357, + "learning_rate": 4.933316493120015e-06, + "loss": 0.4821, + "step": 808 + }, + { + "epoch": 0.35, + "grad_norm": 0.6612931100425695, + "learning_rate": 4.932884731446204e-06, + "loss": 0.517, + "step": 809 + }, + { + "epoch": 0.35, + "grad_norm": 0.6216763697335129, + "learning_rate": 4.932451595513063e-06, + "loss": 0.5269, + "step": 810 + }, + { + "epoch": 0.35, + "grad_norm": 0.6136117850372235, + "learning_rate": 4.932017085565256e-06, + "loss": 0.5197, + "step": 811 + }, + { + "epoch": 0.35, + "grad_norm": 0.5783344700139442, + "learning_rate": 4.931581201848224e-06, + "loss": 0.5167, + "step": 812 + }, + { + "epoch": 0.35, + "grad_norm": 0.5691685860925245, + "learning_rate": 4.931143944608184e-06, + "loss": 0.4957, + "step": 813 + }, + { + "epoch": 0.35, + "grad_norm": 0.6136146632090349, + "learning_rate": 4.93070531409213e-06, + "loss": 0.4923, + "step": 814 + }, + { + "epoch": 0.35, + "grad_norm": 0.6146071241365089, + "learning_rate": 4.930265310547829e-06, + "loss": 0.498, + "step": 815 + }, + { + "epoch": 0.35, + "grad_norm": 0.6002352821905501, + "learning_rate": 4.9298239342238255e-06, + "loss": 0.4994, + "step": 816 + }, + { + "epoch": 0.35, + "grad_norm": 0.5889451517506399, + "learning_rate": 4.929381185369438e-06, + "loss": 0.5001, + "step": 817 + }, + { + "epoch": 0.35, + "grad_norm": 0.6082258257150579, + "learning_rate": 4.928937064234764e-06, + "loss": 0.5521, + "step": 818 + }, + { + "epoch": 0.35, + "grad_norm": 0.6330482315113839, + "learning_rate": 4.928491571070669e-06, + "loss": 0.5219, + "step": 819 + }, + { + "epoch": 0.35, + "grad_norm": 0.6004600333139476, + "learning_rate": 4.928044706128803e-06, + "loss": 0.5005, + "step": 820 + }, + { + "epoch": 0.35, + "grad_norm": 0.6189100444908509, + "learning_rate": 4.927596469661582e-06, + "loss": 0.5176, + "step": 821 + }, + { + "epoch": 0.35, + "grad_norm": 0.606015342163562, + "learning_rate": 4.9271468619222015e-06, + "loss": 0.496, + "step": 822 + }, + { + "epoch": 0.35, + "grad_norm": 0.8556526555027408, + "learning_rate": 4.926695883164632e-06, + "loss": 0.5257, + "step": 823 + }, + { + "epoch": 0.35, + "grad_norm": 0.6258732371599479, + "learning_rate": 4.926243533643615e-06, + "loss": 0.487, + "step": 824 + }, + { + "epoch": 0.35, + "grad_norm": 0.5994220294037795, + "learning_rate": 4.92578981361467e-06, + "loss": 0.5072, + "step": 825 + }, + { + "epoch": 0.35, + "grad_norm": 0.5716588082101919, + "learning_rate": 4.925334723334088e-06, + "loss": 0.4994, + "step": 826 + }, + { + "epoch": 0.35, + "grad_norm": 0.585486108238998, + "learning_rate": 4.924878263058937e-06, + "loss": 0.5226, + "step": 827 + }, + { + "epoch": 0.35, + "grad_norm": 0.5747575013418114, + "learning_rate": 4.924420433047055e-06, + "loss": 0.4965, + "step": 828 + }, + { + "epoch": 0.35, + "grad_norm": 0.5919850393540613, + "learning_rate": 4.9239612335570555e-06, + "loss": 0.4948, + "step": 829 + }, + { + "epoch": 0.35, + "grad_norm": 0.5830251031451256, + "learning_rate": 4.923500664848327e-06, + "loss": 0.487, + "step": 830 + }, + { + "epoch": 0.36, + "grad_norm": 0.6178725623700653, + "learning_rate": 4.923038727181028e-06, + "loss": 0.4862, + "step": 831 + }, + { + "epoch": 0.36, + "grad_norm": 0.5814875278732987, + "learning_rate": 4.922575420816095e-06, + "loss": 0.5166, + "step": 832 + }, + { + "epoch": 0.36, + "grad_norm": 0.5986635615376367, + "learning_rate": 4.922110746015234e-06, + "loss": 0.489, + "step": 833 + }, + { + "epoch": 0.36, + "grad_norm": 0.5859519526703544, + "learning_rate": 4.921644703040925e-06, + "loss": 0.4906, + "step": 834 + }, + { + "epoch": 0.36, + "grad_norm": 0.5743263620222753, + "learning_rate": 4.9211772921564205e-06, + "loss": 0.4843, + "step": 835 + }, + { + "epoch": 0.36, + "grad_norm": 0.8492032385838142, + "learning_rate": 4.920708513625746e-06, + "loss": 0.5099, + "step": 836 + }, + { + "epoch": 0.36, + "grad_norm": 0.6140314776814894, + "learning_rate": 4.9202383677137005e-06, + "loss": 0.5006, + "step": 837 + }, + { + "epoch": 0.36, + "grad_norm": 0.6527049634117856, + "learning_rate": 4.919766854685854e-06, + "loss": 0.5264, + "step": 838 + }, + { + "epoch": 0.36, + "grad_norm": 0.6114157924940975, + "learning_rate": 4.91929397480855e-06, + "loss": 0.4989, + "step": 839 + }, + { + "epoch": 0.36, + "grad_norm": 0.660553349533273, + "learning_rate": 4.918819728348901e-06, + "loss": 0.4894, + "step": 840 + }, + { + "epoch": 0.36, + "grad_norm": 0.6011284115605714, + "learning_rate": 4.918344115574797e-06, + "loss": 0.5038, + "step": 841 + }, + { + "epoch": 0.36, + "grad_norm": 0.5690628939882818, + "learning_rate": 4.917867136754894e-06, + "loss": 0.5419, + "step": 842 + }, + { + "epoch": 0.36, + "grad_norm": 0.5962796608244493, + "learning_rate": 4.917388792158623e-06, + "loss": 0.4726, + "step": 843 + }, + { + "epoch": 0.36, + "grad_norm": 0.6420232226244765, + "learning_rate": 4.9169090820561845e-06, + "loss": 0.5121, + "step": 844 + }, + { + "epoch": 0.36, + "grad_norm": 0.5977037225796433, + "learning_rate": 4.916428006718555e-06, + "loss": 0.5125, + "step": 845 + }, + { + "epoch": 0.36, + "grad_norm": 0.6028773887667327, + "learning_rate": 4.9159455664174756e-06, + "loss": 0.4987, + "step": 846 + }, + { + "epoch": 0.36, + "grad_norm": 0.6366562430011422, + "learning_rate": 4.9154617614254616e-06, + "loss": 0.482, + "step": 847 + }, + { + "epoch": 0.36, + "grad_norm": 0.6591094862661601, + "learning_rate": 4.914976592015801e-06, + "loss": 0.5366, + "step": 848 + }, + { + "epoch": 0.36, + "grad_norm": 0.5958159768787876, + "learning_rate": 4.914490058462549e-06, + "loss": 0.4975, + "step": 849 + }, + { + "epoch": 0.36, + "grad_norm": 0.60494927183394, + "learning_rate": 4.9140021610405335e-06, + "loss": 0.5224, + "step": 850 + }, + { + "epoch": 0.36, + "grad_norm": 0.6105732826310788, + "learning_rate": 4.913512900025351e-06, + "loss": 0.52, + "step": 851 + }, + { + "epoch": 0.36, + "grad_norm": 0.5967159340815495, + "learning_rate": 4.913022275693372e-06, + "loss": 0.4667, + "step": 852 + }, + { + "epoch": 0.36, + "eval_loss": 0.5032065510749817, + "eval_runtime": 6906.5208, + "eval_samples_per_second": 42.046, + "eval_steps_per_second": 2.102, + "step": 852 + }, + { + "epoch": 0.36, + "grad_norm": 0.6009525199808483, + "learning_rate": 4.912530288321733e-06, + "loss": 0.4868, + "step": 853 + }, + { + "epoch": 0.36, + "grad_norm": 0.5830517125297875, + "learning_rate": 4.912036938188342e-06, + "loss": 0.5266, + "step": 854 + }, + { + "epoch": 0.37, + "grad_norm": 0.5889940979301209, + "learning_rate": 4.911542225571877e-06, + "loss": 0.5029, + "step": 855 + }, + { + "epoch": 0.37, + "grad_norm": 0.6026973021517633, + "learning_rate": 4.911046150751786e-06, + "loss": 0.517, + "step": 856 + }, + { + "epoch": 0.37, + "grad_norm": 0.5785861902321248, + "learning_rate": 4.910548714008285e-06, + "loss": 0.4926, + "step": 857 + }, + { + "epoch": 0.37, + "grad_norm": 0.6270047301502678, + "learning_rate": 4.910049915622361e-06, + "loss": 0.5154, + "step": 858 + }, + { + "epoch": 0.37, + "grad_norm": 0.5882468386280117, + "learning_rate": 4.90954975587577e-06, + "loss": 0.5096, + "step": 859 + }, + { + "epoch": 0.37, + "grad_norm": 0.5663550702566633, + "learning_rate": 4.909048235051033e-06, + "loss": 0.4908, + "step": 860 + }, + { + "epoch": 0.37, + "grad_norm": 0.6017236125989268, + "learning_rate": 4.9085453534314474e-06, + "loss": 0.5193, + "step": 861 + }, + { + "epoch": 0.37, + "grad_norm": 0.6222382484211336, + "learning_rate": 4.908041111301074e-06, + "loss": 0.5167, + "step": 862 + }, + { + "epoch": 0.37, + "grad_norm": 0.5963402145379366, + "learning_rate": 4.90753550894474e-06, + "loss": 0.4786, + "step": 863 + }, + { + "epoch": 0.37, + "grad_norm": 0.5654270988232517, + "learning_rate": 4.907028546648049e-06, + "loss": 0.509, + "step": 864 + }, + { + "epoch": 0.37, + "grad_norm": 0.6108797196445513, + "learning_rate": 4.906520224697364e-06, + "loss": 0.5055, + "step": 865 + }, + { + "epoch": 0.37, + "grad_norm": 0.6008153268576729, + "learning_rate": 4.906010543379821e-06, + "loss": 0.5043, + "step": 866 + }, + { + "epoch": 0.37, + "grad_norm": 0.6448956256200653, + "learning_rate": 4.905499502983325e-06, + "loss": 0.5257, + "step": 867 + }, + { + "epoch": 0.37, + "grad_norm": 0.5595461401241696, + "learning_rate": 4.904987103796544e-06, + "loss": 0.5017, + "step": 868 + }, + { + "epoch": 0.37, + "grad_norm": 0.5941241562315258, + "learning_rate": 4.904473346108916e-06, + "loss": 0.5052, + "step": 869 + }, + { + "epoch": 0.37, + "grad_norm": 0.6018542340595553, + "learning_rate": 4.903958230210647e-06, + "loss": 0.4875, + "step": 870 + }, + { + "epoch": 0.37, + "grad_norm": 0.5681619089604921, + "learning_rate": 4.9034417563927105e-06, + "loss": 0.4876, + "step": 871 + }, + { + "epoch": 0.37, + "grad_norm": 0.580332576906727, + "learning_rate": 4.902923924946845e-06, + "loss": 0.503, + "step": 872 + }, + { + "epoch": 0.37, + "grad_norm": 0.5991899633835461, + "learning_rate": 4.902404736165557e-06, + "loss": 0.4792, + "step": 873 + }, + { + "epoch": 0.37, + "grad_norm": 0.5788544217689144, + "learning_rate": 4.901884190342121e-06, + "loss": 0.534, + "step": 874 + }, + { + "epoch": 0.37, + "grad_norm": 0.5873037059704003, + "learning_rate": 4.901362287770576e-06, + "loss": 0.5138, + "step": 875 + }, + { + "epoch": 0.37, + "grad_norm": 0.5673844198994867, + "learning_rate": 4.900839028745727e-06, + "loss": 0.5094, + "step": 876 + }, + { + "epoch": 0.37, + "grad_norm": 0.5992496148744582, + "learning_rate": 4.900314413563149e-06, + "loss": 0.5296, + "step": 877 + }, + { + "epoch": 0.38, + "grad_norm": 0.619714807218819, + "learning_rate": 4.899788442519178e-06, + "loss": 0.5174, + "step": 878 + }, + { + "epoch": 0.38, + "grad_norm": 0.5833898092793183, + "learning_rate": 4.899261115910919e-06, + "loss": 0.4845, + "step": 879 + }, + { + "epoch": 0.38, + "grad_norm": 0.5731340631423973, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.494, + "step": 880 + }, + { + "epoch": 0.38, + "grad_norm": 0.6335983797182833, + "learning_rate": 4.898202397193787e-06, + "loss": 0.4902, + "step": 881 + }, + { + "epoch": 0.38, + "grad_norm": 0.6021252115369131, + "learning_rate": 4.897671005682948e-06, + "loss": 0.5039, + "step": 882 + }, + { + "epoch": 0.38, + "grad_norm": 0.6136802203792989, + "learning_rate": 4.8971382598038945e-06, + "loss": 0.5108, + "step": 883 + }, + { + "epoch": 0.38, + "grad_norm": 0.6131873733015205, + "learning_rate": 4.896604159857557e-06, + "loss": 0.4997, + "step": 884 + }, + { + "epoch": 0.38, + "grad_norm": 0.6159506568258571, + "learning_rate": 4.896068706145632e-06, + "loss": 0.5292, + "step": 885 + }, + { + "epoch": 0.38, + "grad_norm": 0.6041398474984774, + "learning_rate": 4.8955318989705814e-06, + "loss": 0.51, + "step": 886 + }, + { + "epoch": 0.38, + "grad_norm": 0.6148781569224463, + "learning_rate": 4.8949937386356284e-06, + "loss": 0.5094, + "step": 887 + }, + { + "epoch": 0.38, + "grad_norm": 0.5976753831281014, + "learning_rate": 4.894454225444764e-06, + "loss": 0.5026, + "step": 888 + }, + { + "epoch": 0.38, + "grad_norm": 0.608033075624346, + "learning_rate": 4.893913359702742e-06, + "loss": 0.5011, + "step": 889 + }, + { + "epoch": 0.38, + "grad_norm": 0.5610693452613091, + "learning_rate": 4.89337114171508e-06, + "loss": 0.4903, + "step": 890 + }, + { + "epoch": 0.38, + "grad_norm": 0.5740287354727569, + "learning_rate": 4.89282757178806e-06, + "loss": 0.5355, + "step": 891 + }, + { + "epoch": 0.38, + "grad_norm": 0.5808789786027553, + "learning_rate": 4.892282650228728e-06, + "loss": 0.4798, + "step": 892 + }, + { + "epoch": 0.38, + "grad_norm": 0.5636801724275967, + "learning_rate": 4.891736377344891e-06, + "loss": 0.4929, + "step": 893 + }, + { + "epoch": 0.38, + "grad_norm": 0.6092124636402675, + "learning_rate": 4.891188753445122e-06, + "loss": 0.4976, + "step": 894 + }, + { + "epoch": 0.38, + "grad_norm": 0.5840172519663077, + "learning_rate": 4.890639778838757e-06, + "loss": 0.4841, + "step": 895 + }, + { + "epoch": 0.38, + "grad_norm": 0.5819312884051389, + "learning_rate": 4.890089453835894e-06, + "loss": 0.4869, + "step": 896 + }, + { + "epoch": 0.38, + "grad_norm": 0.6120829375007303, + "learning_rate": 4.889537778747396e-06, + "loss": 0.5265, + "step": 897 + }, + { + "epoch": 0.38, + "grad_norm": 0.5726939025833054, + "learning_rate": 4.888984753884882e-06, + "loss": 0.4708, + "step": 898 + }, + { + "epoch": 0.38, + "grad_norm": 0.6080885452408311, + "learning_rate": 4.8884303795607424e-06, + "loss": 0.501, + "step": 899 + }, + { + "epoch": 0.38, + "grad_norm": 0.5909973175245595, + "learning_rate": 4.887874656088124e-06, + "loss": 0.5017, + "step": 900 + }, + { + "epoch": 0.38, + "grad_norm": 0.591651916865507, + "learning_rate": 4.887317583780937e-06, + "loss": 0.483, + "step": 901 + }, + { + "epoch": 0.39, + "grad_norm": 0.5745658546447854, + "learning_rate": 4.886759162953856e-06, + "loss": 0.4942, + "step": 902 + }, + { + "epoch": 0.39, + "grad_norm": 0.6021387624200228, + "learning_rate": 4.886199393922313e-06, + "loss": 0.4906, + "step": 903 + }, + { + "epoch": 0.39, + "grad_norm": 0.5993298907461448, + "learning_rate": 4.885638277002503e-06, + "loss": 0.4987, + "step": 904 + }, + { + "epoch": 0.39, + "grad_norm": 0.6004161472848657, + "learning_rate": 4.885075812511386e-06, + "loss": 0.5152, + "step": 905 + }, + { + "epoch": 0.39, + "grad_norm": 0.5799301402705993, + "learning_rate": 4.884512000766679e-06, + "loss": 0.5045, + "step": 906 + }, + { + "epoch": 0.39, + "grad_norm": 0.6121599376461038, + "learning_rate": 4.883946842086861e-06, + "loss": 0.4915, + "step": 907 + }, + { + "epoch": 0.39, + "grad_norm": 0.5663312512818208, + "learning_rate": 4.883380336791172e-06, + "loss": 0.4983, + "step": 908 + }, + { + "epoch": 0.39, + "grad_norm": 0.6095573669970138, + "learning_rate": 4.882812485199614e-06, + "loss": 0.5285, + "step": 909 + }, + { + "epoch": 0.39, + "grad_norm": 0.5921811411224858, + "learning_rate": 4.882243287632947e-06, + "loss": 0.5194, + "step": 910 + }, + { + "epoch": 0.39, + "grad_norm": 0.6006510756243623, + "learning_rate": 4.8816727444126935e-06, + "loss": 0.492, + "step": 911 + }, + { + "epoch": 0.39, + "grad_norm": 0.5649111234747042, + "learning_rate": 4.881100855861134e-06, + "loss": 0.4916, + "step": 912 + }, + { + "epoch": 0.39, + "grad_norm": 0.5762188689086207, + "learning_rate": 4.880527622301312e-06, + "loss": 0.4887, + "step": 913 + }, + { + "epoch": 0.39, + "grad_norm": 0.6441465256010206, + "learning_rate": 4.879953044057028e-06, + "loss": 0.511, + "step": 914 + }, + { + "epoch": 0.39, + "grad_norm": 0.5684710047872453, + "learning_rate": 4.879377121452844e-06, + "loss": 0.5001, + "step": 915 + }, + { + "epoch": 0.39, + "grad_norm": 0.5686046518640278, + "learning_rate": 4.8787998548140794e-06, + "loss": 0.5014, + "step": 916 + }, + { + "epoch": 0.39, + "grad_norm": 0.5905740541193955, + "learning_rate": 4.878221244466813e-06, + "loss": 0.486, + "step": 917 + }, + { + "epoch": 0.39, + "grad_norm": 0.5952876200881407, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.481, + "step": 918 + }, + { + "epoch": 0.39, + "grad_norm": 0.5872355311893122, + "learning_rate": 4.877059993954891e-06, + "loss": 0.5039, + "step": 919 + }, + { + "epoch": 0.39, + "grad_norm": 0.6074745120236803, + "learning_rate": 4.8764773544461895e-06, + "loss": 0.5108, + "step": 920 + }, + { + "epoch": 0.39, + "grad_norm": 0.6130696940827612, + "learning_rate": 4.875893372540893e-06, + "loss": 0.5048, + "step": 921 + }, + { + "epoch": 0.39, + "grad_norm": 0.6004208834442807, + "learning_rate": 4.875308048568875e-06, + "loss": 0.4988, + "step": 922 + }, + { + "epoch": 0.39, + "grad_norm": 0.6892465313158495, + "learning_rate": 4.8747213828607675e-06, + "loss": 0.483, + "step": 923 + }, + { + "epoch": 0.39, + "eval_loss": 0.5001593232154846, + "eval_runtime": 6909.7667, + "eval_samples_per_second": 42.026, + "eval_steps_per_second": 2.101, + "step": 923 + }, + { + "epoch": 0.39, + "grad_norm": 0.6134636983656643, + "learning_rate": 4.874133375747957e-06, + "loss": 0.5137, + "step": 924 + }, + { + "epoch": 0.4, + "grad_norm": 0.6440085903145949, + "learning_rate": 4.873544027562593e-06, + "loss": 0.5176, + "step": 925 + }, + { + "epoch": 0.4, + "grad_norm": 0.6899079462308539, + "learning_rate": 4.8729533386375775e-06, + "loss": 0.5162, + "step": 926 + }, + { + "epoch": 0.4, + "grad_norm": 0.6146416139724011, + "learning_rate": 4.872361309306572e-06, + "loss": 0.5113, + "step": 927 + }, + { + "epoch": 0.4, + "grad_norm": 0.585989400430957, + "learning_rate": 4.8717679399039954e-06, + "loss": 0.4987, + "step": 928 + }, + { + "epoch": 0.4, + "grad_norm": 0.6462954923557843, + "learning_rate": 4.871173230765024e-06, + "loss": 0.501, + "step": 929 + }, + { + "epoch": 0.4, + "grad_norm": 0.6369012674472616, + "learning_rate": 4.8705771822255895e-06, + "loss": 0.4859, + "step": 930 + }, + { + "epoch": 0.4, + "grad_norm": 0.6299056450791893, + "learning_rate": 4.8699797946223805e-06, + "loss": 0.5061, + "step": 931 + }, + { + "epoch": 0.4, + "grad_norm": 0.6098008991534455, + "learning_rate": 4.869381068292842e-06, + "loss": 0.4871, + "step": 932 + }, + { + "epoch": 0.4, + "grad_norm": 0.63534813466544, + "learning_rate": 4.868781003575176e-06, + "loss": 0.4946, + "step": 933 + }, + { + "epoch": 0.4, + "grad_norm": 0.607178762605803, + "learning_rate": 4.86817960080834e-06, + "loss": 0.4986, + "step": 934 + }, + { + "epoch": 0.4, + "grad_norm": 0.5983782529586613, + "learning_rate": 4.867576860332048e-06, + "loss": 0.4795, + "step": 935 + }, + { + "epoch": 0.4, + "grad_norm": 0.6738363380499427, + "learning_rate": 4.8669727824867686e-06, + "loss": 0.5166, + "step": 936 + }, + { + "epoch": 0.4, + "grad_norm": 0.598996652424602, + "learning_rate": 4.866367367613725e-06, + "loss": 0.5197, + "step": 937 + }, + { + "epoch": 0.4, + "grad_norm": 0.5815833611063322, + "learning_rate": 4.865760616054899e-06, + "loss": 0.4844, + "step": 938 + }, + { + "epoch": 0.4, + "grad_norm": 0.603444821349114, + "learning_rate": 4.865152528153022e-06, + "loss": 0.5207, + "step": 939 + }, + { + "epoch": 0.4, + "grad_norm": 0.5867776906541347, + "learning_rate": 4.864543104251587e-06, + "loss": 0.529, + "step": 940 + }, + { + "epoch": 0.4, + "grad_norm": 0.5650043009819257, + "learning_rate": 4.863932344694837e-06, + "loss": 0.4959, + "step": 941 + }, + { + "epoch": 0.4, + "grad_norm": 0.6917490151688115, + "learning_rate": 4.8633202498277695e-06, + "loss": 0.5137, + "step": 942 + }, + { + "epoch": 0.4, + "grad_norm": 0.549493622060356, + "learning_rate": 4.862706819996139e-06, + "loss": 0.4845, + "step": 943 + }, + { + "epoch": 0.4, + "grad_norm": 0.5858437778240971, + "learning_rate": 4.8620920555464515e-06, + "loss": 0.5006, + "step": 944 + }, + { + "epoch": 0.4, + "grad_norm": 0.5993940522874021, + "learning_rate": 4.8614759568259685e-06, + "loss": 0.506, + "step": 945 + }, + { + "epoch": 0.4, + "grad_norm": 0.5948965330805686, + "learning_rate": 4.860858524182704e-06, + "loss": 0.5192, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 0.5944946909841916, + "learning_rate": 4.860239757965428e-06, + "loss": 0.5071, + "step": 947 + }, + { + "epoch": 0.41, + "grad_norm": 0.5915006708411217, + "learning_rate": 4.8596196585236595e-06, + "loss": 0.4853, + "step": 948 + }, + { + "epoch": 0.41, + "grad_norm": 0.5630831967485228, + "learning_rate": 4.858998226207674e-06, + "loss": 0.484, + "step": 949 + }, + { + "epoch": 0.41, + "grad_norm": 0.5666674173283087, + "learning_rate": 4.858375461368499e-06, + "loss": 0.512, + "step": 950 + }, + { + "epoch": 0.41, + "grad_norm": 0.5836639113649253, + "learning_rate": 4.857751364357913e-06, + "loss": 0.514, + "step": 951 + }, + { + "epoch": 0.41, + "grad_norm": 0.5744731439956018, + "learning_rate": 4.857125935528451e-06, + "loss": 0.5177, + "step": 952 + }, + { + "epoch": 0.41, + "grad_norm": 0.5926955465496379, + "learning_rate": 4.8564991752333975e-06, + "loss": 0.4955, + "step": 953 + }, + { + "epoch": 0.41, + "grad_norm": 0.5628507262484403, + "learning_rate": 4.855871083826789e-06, + "loss": 0.4995, + "step": 954 + }, + { + "epoch": 0.41, + "grad_norm": 0.5866653009845907, + "learning_rate": 4.855241661663413e-06, + "loss": 0.5002, + "step": 955 + }, + { + "epoch": 0.41, + "grad_norm": 0.5599328850779348, + "learning_rate": 4.854610909098813e-06, + "loss": 0.4593, + "step": 956 + }, + { + "epoch": 0.41, + "grad_norm": 0.5973389278804867, + "learning_rate": 4.853978826489277e-06, + "loss": 0.4913, + "step": 957 + }, + { + "epoch": 0.41, + "grad_norm": 0.643022571476318, + "learning_rate": 4.8533454141918525e-06, + "loss": 0.5036, + "step": 958 + }, + { + "epoch": 0.41, + "grad_norm": 0.5688007484429585, + "learning_rate": 4.852710672564332e-06, + "loss": 0.5036, + "step": 959 + }, + { + "epoch": 0.41, + "grad_norm": 0.5969281444464002, + "learning_rate": 4.852074601965261e-06, + "loss": 0.5344, + "step": 960 + }, + { + "epoch": 0.41, + "grad_norm": 0.5730247632836935, + "learning_rate": 4.851437202753936e-06, + "loss": 0.4809, + "step": 961 + }, + { + "epoch": 0.41, + "grad_norm": 0.5799456612673418, + "learning_rate": 4.850798475290403e-06, + "loss": 0.4536, + "step": 962 + }, + { + "epoch": 0.41, + "grad_norm": 0.6205297443588728, + "learning_rate": 4.8501584199354604e-06, + "loss": 0.5099, + "step": 963 + }, + { + "epoch": 0.41, + "grad_norm": 0.5681608576563654, + "learning_rate": 4.849517037050653e-06, + "loss": 0.4903, + "step": 964 + }, + { + "epoch": 0.41, + "grad_norm": 0.6025610912235543, + "learning_rate": 4.848874326998279e-06, + "loss": 0.4836, + "step": 965 + }, + { + "epoch": 0.41, + "grad_norm": 0.5901338922642949, + "learning_rate": 4.848230290141383e-06, + "loss": 0.4662, + "step": 966 + }, + { + "epoch": 0.41, + "grad_norm": 0.5688713468398793, + "learning_rate": 4.847584926843765e-06, + "loss": 0.4884, + "step": 967 + }, + { + "epoch": 0.41, + "grad_norm": 0.5778466913445185, + "learning_rate": 4.846938237469966e-06, + "loss": 0.5096, + "step": 968 + }, + { + "epoch": 0.41, + "grad_norm": 0.603606926382752, + "learning_rate": 4.846290222385282e-06, + "loss": 0.5415, + "step": 969 + }, + { + "epoch": 0.41, + "grad_norm": 0.594103598829822, + "learning_rate": 4.845640881955757e-06, + "loss": 0.5007, + "step": 970 + }, + { + "epoch": 0.41, + "grad_norm": 0.5983275671847197, + "learning_rate": 4.844990216548181e-06, + "loss": 0.501, + "step": 971 + }, + { + "epoch": 0.42, + "grad_norm": 0.6107199353721016, + "learning_rate": 4.844338226530095e-06, + "loss": 0.5303, + "step": 972 + }, + { + "epoch": 0.42, + "grad_norm": 0.6032407557209453, + "learning_rate": 4.843684912269789e-06, + "loss": 0.4946, + "step": 973 + }, + { + "epoch": 0.42, + "grad_norm": 0.5413313146483812, + "learning_rate": 4.843030274136297e-06, + "loss": 0.4956, + "step": 974 + }, + { + "epoch": 0.42, + "grad_norm": 0.6106091128154526, + "learning_rate": 4.842374312499405e-06, + "loss": 0.5105, + "step": 975 + }, + { + "epoch": 0.42, + "grad_norm": 0.574289734465813, + "learning_rate": 4.841717027729643e-06, + "loss": 0.5215, + "step": 976 + }, + { + "epoch": 0.42, + "grad_norm": 0.6083050534381844, + "learning_rate": 4.8410584201982934e-06, + "loss": 0.4862, + "step": 977 + }, + { + "epoch": 0.42, + "grad_norm": 0.5766091053101502, + "learning_rate": 4.84039849027738e-06, + "loss": 0.5117, + "step": 978 + }, + { + "epoch": 0.42, + "grad_norm": 0.5711574672193085, + "learning_rate": 4.8397372383396765e-06, + "loss": 0.4722, + "step": 979 + }, + { + "epoch": 0.42, + "grad_norm": 0.5774240645659215, + "learning_rate": 4.839074664758705e-06, + "loss": 0.522, + "step": 980 + }, + { + "epoch": 0.42, + "grad_norm": 0.5572922524103036, + "learning_rate": 4.8384107699087305e-06, + "loss": 0.4989, + "step": 981 + }, + { + "epoch": 0.42, + "grad_norm": 0.6070310913343694, + "learning_rate": 4.837745554164766e-06, + "loss": 0.4857, + "step": 982 + }, + { + "epoch": 0.42, + "grad_norm": 0.60261744740847, + "learning_rate": 4.8370790179025715e-06, + "loss": 0.5182, + "step": 983 + }, + { + "epoch": 0.42, + "grad_norm": 0.5659965233077204, + "learning_rate": 4.836411161498653e-06, + "loss": 0.4828, + "step": 984 + }, + { + "epoch": 0.42, + "grad_norm": 0.5957726040740746, + "learning_rate": 4.835741985330259e-06, + "loss": 0.4724, + "step": 985 + }, + { + "epoch": 0.42, + "grad_norm": 0.6024381722019819, + "learning_rate": 4.835071489775388e-06, + "loss": 0.5038, + "step": 986 + }, + { + "epoch": 0.42, + "grad_norm": 0.5889077069135094, + "learning_rate": 4.834399675212781e-06, + "loss": 0.4837, + "step": 987 + }, + { + "epoch": 0.42, + "grad_norm": 0.6240890961484814, + "learning_rate": 4.8337265420219245e-06, + "loss": 0.5006, + "step": 988 + }, + { + "epoch": 0.42, + "grad_norm": 0.5986554935817346, + "learning_rate": 4.833052090583052e-06, + "loss": 0.4603, + "step": 989 + }, + { + "epoch": 0.42, + "grad_norm": 0.5632943529640556, + "learning_rate": 4.832376321277136e-06, + "loss": 0.478, + "step": 990 + }, + { + "epoch": 0.42, + "grad_norm": 0.5811104630507077, + "learning_rate": 4.831699234485899e-06, + "loss": 0.503, + "step": 991 + }, + { + "epoch": 0.42, + "grad_norm": 0.6286701964893617, + "learning_rate": 4.831020830591806e-06, + "loss": 0.5186, + "step": 992 + }, + { + "epoch": 0.42, + "grad_norm": 0.5656632345758932, + "learning_rate": 4.8303411099780665e-06, + "loss": 0.4863, + "step": 993 + }, + { + "epoch": 0.42, + "grad_norm": 0.5914309687581247, + "learning_rate": 4.829660073028631e-06, + "loss": 0.5056, + "step": 994 + }, + { + "epoch": 0.42, + "eval_loss": 0.4973524808883667, + "eval_runtime": 6914.8778, + "eval_samples_per_second": 41.995, + "eval_steps_per_second": 2.1, + "step": 994 + }, + { + "epoch": 0.43, + "grad_norm": 0.6155119630859821, + "learning_rate": 4.828977720128198e-06, + "loss": 0.494, + "step": 995 + }, + { + "epoch": 0.43, + "grad_norm": 0.5711544453505207, + "learning_rate": 4.828294051662206e-06, + "loss": 0.4958, + "step": 996 + }, + { + "epoch": 0.43, + "grad_norm": 0.5630820865144905, + "learning_rate": 4.827609068016836e-06, + "loss": 0.5164, + "step": 997 + }, + { + "epoch": 0.43, + "grad_norm": 0.5954033434874677, + "learning_rate": 4.826922769579017e-06, + "loss": 0.5199, + "step": 998 + }, + { + "epoch": 0.43, + "grad_norm": 0.5740037081231429, + "learning_rate": 4.826235156736414e-06, + "loss": 0.4798, + "step": 999 + }, + { + "epoch": 0.43, + "grad_norm": 0.5812019395259428, + "learning_rate": 4.825546229877439e-06, + "loss": 0.4865, + "step": 1000 + }, + { + "epoch": 0.43, + "grad_norm": 0.5893943585416656, + "learning_rate": 4.824855989391245e-06, + "loss": 0.4912, + "step": 1001 + }, + { + "epoch": 0.43, + "grad_norm": 0.6065006741354068, + "learning_rate": 4.824164435667727e-06, + "loss": 0.5072, + "step": 1002 + }, + { + "epoch": 0.43, + "grad_norm": 0.6019073096492751, + "learning_rate": 4.823471569097521e-06, + "loss": 0.4971, + "step": 1003 + }, + { + "epoch": 0.43, + "grad_norm": 0.5883223828151468, + "learning_rate": 4.822777390072006e-06, + "loss": 0.514, + "step": 1004 + }, + { + "epoch": 0.43, + "grad_norm": 0.6604784109834196, + "learning_rate": 4.822081898983302e-06, + "loss": 0.5065, + "step": 1005 + }, + { + "epoch": 0.43, + "grad_norm": 0.5937829112220406, + "learning_rate": 4.821385096224268e-06, + "loss": 0.4714, + "step": 1006 + }, + { + "epoch": 0.43, + "grad_norm": 0.5748353294494913, + "learning_rate": 4.820686982188508e-06, + "loss": 0.5059, + "step": 1007 + }, + { + "epoch": 0.43, + "grad_norm": 0.6069950364399875, + "learning_rate": 4.819987557270364e-06, + "loss": 0.5204, + "step": 1008 + }, + { + "epoch": 0.43, + "grad_norm": 0.6358630633744264, + "learning_rate": 4.819286821864917e-06, + "loss": 0.4903, + "step": 1009 + }, + { + "epoch": 0.43, + "grad_norm": 0.6204642750728111, + "learning_rate": 4.818584776367992e-06, + "loss": 0.5128, + "step": 1010 + }, + { + "epoch": 0.43, + "grad_norm": 0.6829577526239162, + "learning_rate": 4.817881421176153e-06, + "loss": 0.5197, + "step": 1011 + }, + { + "epoch": 0.43, + "grad_norm": 0.5922818255526525, + "learning_rate": 4.817176756686701e-06, + "loss": 0.4918, + "step": 1012 + }, + { + "epoch": 0.43, + "grad_norm": 0.5740860408805906, + "learning_rate": 4.816470783297679e-06, + "loss": 0.4982, + "step": 1013 + }, + { + "epoch": 0.43, + "grad_norm": 0.5910300282185704, + "learning_rate": 4.815763501407869e-06, + "loss": 0.485, + "step": 1014 + }, + { + "epoch": 0.43, + "grad_norm": 0.5715165207150573, + "learning_rate": 4.815054911416795e-06, + "loss": 0.4849, + "step": 1015 + }, + { + "epoch": 0.43, + "grad_norm": 0.6024018069444244, + "learning_rate": 4.8143450137247116e-06, + "loss": 0.4994, + "step": 1016 + }, + { + "epoch": 0.43, + "grad_norm": 0.6266053738868206, + "learning_rate": 4.8136338087326214e-06, + "loss": 0.5286, + "step": 1017 + }, + { + "epoch": 0.43, + "grad_norm": 0.5953155175370475, + "learning_rate": 4.812921296842261e-06, + "loss": 0.4776, + "step": 1018 + }, + { + "epoch": 0.44, + "grad_norm": 0.6031211740166059, + "learning_rate": 4.812207478456105e-06, + "loss": 0.5148, + "step": 1019 + }, + { + "epoch": 0.44, + "grad_norm": 0.6273922038641672, + "learning_rate": 4.811492353977366e-06, + "loss": 0.5029, + "step": 1020 + }, + { + "epoch": 0.44, + "grad_norm": 0.5826143241371425, + "learning_rate": 4.810775923809996e-06, + "loss": 0.5056, + "step": 1021 + }, + { + "epoch": 0.44, + "grad_norm": 0.582077622885555, + "learning_rate": 4.810058188358685e-06, + "loss": 0.487, + "step": 1022 + }, + { + "epoch": 0.44, + "grad_norm": 0.5767038876088614, + "learning_rate": 4.809339148028857e-06, + "loss": 0.4805, + "step": 1023 + }, + { + "epoch": 0.44, + "grad_norm": 0.5850526215638874, + "learning_rate": 4.808618803226675e-06, + "loss": 0.4686, + "step": 1024 + }, + { + "epoch": 0.44, + "grad_norm": 0.6178237737641012, + "learning_rate": 4.80789715435904e-06, + "loss": 0.5005, + "step": 1025 + }, + { + "epoch": 0.44, + "grad_norm": 0.5579649875995263, + "learning_rate": 4.807174201833589e-06, + "loss": 0.4848, + "step": 1026 + }, + { + "epoch": 0.44, + "grad_norm": 0.576234598032971, + "learning_rate": 4.8064499460586926e-06, + "loss": 0.4824, + "step": 1027 + }, + { + "epoch": 0.44, + "grad_norm": 0.5826322787338029, + "learning_rate": 4.8057243874434625e-06, + "loss": 0.4997, + "step": 1028 + }, + { + "epoch": 0.44, + "grad_norm": 0.5580813074509025, + "learning_rate": 4.8049975263977416e-06, + "loss": 0.4693, + "step": 1029 + }, + { + "epoch": 0.44, + "grad_norm": 0.5695474003960879, + "learning_rate": 4.804269363332112e-06, + "loss": 0.492, + "step": 1030 + }, + { + "epoch": 0.44, + "grad_norm": 0.5905987109265735, + "learning_rate": 4.80353989865789e-06, + "loss": 0.4966, + "step": 1031 + }, + { + "epoch": 0.44, + "grad_norm": 0.5685057462956427, + "learning_rate": 4.802809132787125e-06, + "loss": 0.5016, + "step": 1032 + }, + { + "epoch": 0.44, + "grad_norm": 0.5886266001836196, + "learning_rate": 4.802077066132607e-06, + "loss": 0.4946, + "step": 1033 + }, + { + "epoch": 0.44, + "grad_norm": 0.6035821042805889, + "learning_rate": 4.801343699107854e-06, + "loss": 0.4959, + "step": 1034 + }, + { + "epoch": 0.44, + "grad_norm": 0.566265021004039, + "learning_rate": 4.800609032127123e-06, + "loss": 0.4972, + "step": 1035 + }, + { + "epoch": 0.44, + "grad_norm": 0.5614736780544856, + "learning_rate": 4.799873065605404e-06, + "loss": 0.4807, + "step": 1036 + }, + { + "epoch": 0.44, + "grad_norm": 0.600684352457, + "learning_rate": 4.799135799958421e-06, + "loss": 0.496, + "step": 1037 + }, + { + "epoch": 0.44, + "grad_norm": 0.591873988888275, + "learning_rate": 4.798397235602632e-06, + "loss": 0.4698, + "step": 1038 + }, + { + "epoch": 0.44, + "grad_norm": 0.6143917690140963, + "learning_rate": 4.797657372955228e-06, + "loss": 0.4906, + "step": 1039 + }, + { + "epoch": 0.44, + "grad_norm": 0.5837819971908614, + "learning_rate": 4.7969162124341354e-06, + "loss": 0.4711, + "step": 1040 + }, + { + "epoch": 0.44, + "grad_norm": 0.6109135143710297, + "learning_rate": 4.79617375445801e-06, + "loss": 0.5127, + "step": 1041 + }, + { + "epoch": 0.45, + "grad_norm": 0.5763531175283814, + "learning_rate": 4.795429999446246e-06, + "loss": 0.4889, + "step": 1042 + }, + { + "epoch": 0.45, + "grad_norm": 0.5818174333398002, + "learning_rate": 4.794684947818964e-06, + "loss": 0.4956, + "step": 1043 + }, + { + "epoch": 0.45, + "grad_norm": 0.6195177609111279, + "learning_rate": 4.793938599997021e-06, + "loss": 0.4926, + "step": 1044 + }, + { + "epoch": 0.45, + "grad_norm": 0.5686953952488751, + "learning_rate": 4.793190956402005e-06, + "loss": 0.4815, + "step": 1045 + }, + { + "epoch": 0.45, + "grad_norm": 0.5648932079137525, + "learning_rate": 4.792442017456237e-06, + "loss": 0.5092, + "step": 1046 + }, + { + "epoch": 0.45, + "grad_norm": 0.5851943135122919, + "learning_rate": 4.791691783582768e-06, + "loss": 0.4765, + "step": 1047 + }, + { + "epoch": 0.45, + "grad_norm": 0.591274407551804, + "learning_rate": 4.790940255205381e-06, + "loss": 0.5056, + "step": 1048 + }, + { + "epoch": 0.45, + "grad_norm": 0.6379099025664374, + "learning_rate": 4.790187432748591e-06, + "loss": 0.5158, + "step": 1049 + }, + { + "epoch": 0.45, + "grad_norm": 0.6181747896706304, + "learning_rate": 4.789433316637644e-06, + "loss": 0.5146, + "step": 1050 + }, + { + "epoch": 0.45, + "grad_norm": 0.5531345989453061, + "learning_rate": 4.788677907298516e-06, + "loss": 0.4878, + "step": 1051 + }, + { + "epoch": 0.45, + "grad_norm": 0.5876103896392866, + "learning_rate": 4.7879212051579124e-06, + "loss": 0.5136, + "step": 1052 + }, + { + "epoch": 0.45, + "grad_norm": 3.875613200014087, + "learning_rate": 4.787163210643272e-06, + "loss": 0.4833, + "step": 1053 + }, + { + "epoch": 0.45, + "grad_norm": 0.6036074704422942, + "learning_rate": 4.786403924182761e-06, + "loss": 0.5099, + "step": 1054 + }, + { + "epoch": 0.45, + "grad_norm": 0.621921611720999, + "learning_rate": 4.785643346205277e-06, + "loss": 0.5052, + "step": 1055 + }, + { + "epoch": 0.45, + "grad_norm": 0.6125351583146175, + "learning_rate": 4.784881477140445e-06, + "loss": 0.5053, + "step": 1056 + }, + { + "epoch": 0.45, + "grad_norm": 0.5999869964811588, + "learning_rate": 4.784118317418621e-06, + "loss": 0.5077, + "step": 1057 + }, + { + "epoch": 0.45, + "grad_norm": 0.5923633626129878, + "learning_rate": 4.7833538674708905e-06, + "loss": 0.4784, + "step": 1058 + }, + { + "epoch": 0.45, + "grad_norm": 0.5564286456120893, + "learning_rate": 4.782588127729066e-06, + "loss": 0.4985, + "step": 1059 + }, + { + "epoch": 0.45, + "grad_norm": 0.5651692637493795, + "learning_rate": 4.781821098625691e-06, + "loss": 0.4796, + "step": 1060 + }, + { + "epoch": 0.45, + "grad_norm": 0.6611434126482291, + "learning_rate": 4.7810527805940344e-06, + "loss": 0.5125, + "step": 1061 + }, + { + "epoch": 0.45, + "grad_norm": 0.6079384633788307, + "learning_rate": 4.7802831740680955e-06, + "loss": 0.4909, + "step": 1062 + }, + { + "epoch": 0.45, + "grad_norm": 0.5887501058130822, + "learning_rate": 4.7795122794826e-06, + "loss": 0.4729, + "step": 1063 + }, + { + "epoch": 0.45, + "grad_norm": 0.8409421704273247, + "learning_rate": 4.778740097273003e-06, + "loss": 0.5162, + "step": 1064 + }, + { + "epoch": 0.46, + "grad_norm": 0.6099196443182712, + "learning_rate": 4.777966627875484e-06, + "loss": 0.5268, + "step": 1065 + }, + { + "epoch": 0.46, + "eval_loss": 0.49549439549446106, + "eval_runtime": 6918.4924, + "eval_samples_per_second": 41.973, + "eval_steps_per_second": 2.099, + "step": 1065 + }, + { + "epoch": 0.46, + "grad_norm": 0.63487506837019, + "learning_rate": 4.777191871726951e-06, + "loss": 0.4854, + "step": 1066 + }, + { + "epoch": 0.46, + "grad_norm": 0.810054670245462, + "learning_rate": 4.776415829265043e-06, + "loss": 0.5234, + "step": 1067 + }, + { + "epoch": 0.46, + "grad_norm": 0.5924971114450747, + "learning_rate": 4.775638500928117e-06, + "loss": 0.5024, + "step": 1068 + }, + { + "epoch": 0.46, + "grad_norm": 0.5596644374385136, + "learning_rate": 4.774859887155263e-06, + "loss": 0.4743, + "step": 1069 + }, + { + "epoch": 0.46, + "grad_norm": 0.6022292614035627, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.4998, + "step": 1070 + }, + { + "epoch": 0.46, + "grad_norm": 0.5685916266892248, + "learning_rate": 4.773298805061756e-06, + "loss": 0.4877, + "step": 1071 + }, + { + "epoch": 0.46, + "grad_norm": 0.5962356632555038, + "learning_rate": 4.772516337622907e-06, + "loss": 0.4977, + "step": 1072 + }, + { + "epoch": 0.46, + "grad_norm": 0.6113056469088662, + "learning_rate": 4.771732586511741e-06, + "loss": 0.5129, + "step": 1073 + }, + { + "epoch": 0.46, + "grad_norm": 0.5729626434260354, + "learning_rate": 4.7709475521709745e-06, + "loss": 0.5059, + "step": 1074 + }, + { + "epoch": 0.46, + "grad_norm": 0.5952807746932272, + "learning_rate": 4.770161235044047e-06, + "loss": 0.4947, + "step": 1075 + }, + { + "epoch": 0.46, + "grad_norm": 0.583828215474725, + "learning_rate": 4.769373635575127e-06, + "loss": 0.4919, + "step": 1076 + }, + { + "epoch": 0.46, + "grad_norm": 0.6213165462594205, + "learning_rate": 4.768584754209101e-06, + "loss": 0.5161, + "step": 1077 + }, + { + "epoch": 0.46, + "grad_norm": 0.5695247509557161, + "learning_rate": 4.767794591391585e-06, + "loss": 0.4995, + "step": 1078 + }, + { + "epoch": 0.46, + "grad_norm": 0.5741341553365366, + "learning_rate": 4.767003147568917e-06, + "loss": 0.475, + "step": 1079 + }, + { + "epoch": 0.46, + "grad_norm": 0.5768424395874503, + "learning_rate": 4.766210423188158e-06, + "loss": 0.4994, + "step": 1080 + }, + { + "epoch": 0.46, + "grad_norm": 0.5626374906483355, + "learning_rate": 4.765416418697092e-06, + "loss": 0.503, + "step": 1081 + }, + { + "epoch": 0.46, + "grad_norm": 0.5683372940335709, + "learning_rate": 4.764621134544229e-06, + "loss": 0.5083, + "step": 1082 + }, + { + "epoch": 0.46, + "grad_norm": 0.9131547521845851, + "learning_rate": 4.763824571178798e-06, + "loss": 0.4672, + "step": 1083 + }, + { + "epoch": 0.46, + "grad_norm": 0.5723059333795633, + "learning_rate": 4.763026729050752e-06, + "loss": 0.483, + "step": 1084 + }, + { + "epoch": 0.46, + "grad_norm": 0.5906452138705621, + "learning_rate": 4.7622276086107685e-06, + "loss": 0.5082, + "step": 1085 + }, + { + "epoch": 0.46, + "grad_norm": 0.5774338186179943, + "learning_rate": 4.761427210310244e-06, + "loss": 0.4797, + "step": 1086 + }, + { + "epoch": 0.46, + "grad_norm": 0.5795580232694232, + "learning_rate": 4.760625534601299e-06, + "loss": 0.4718, + "step": 1087 + }, + { + "epoch": 0.46, + "grad_norm": 0.5917263527753107, + "learning_rate": 4.759822581936773e-06, + "loss": 0.4773, + "step": 1088 + }, + { + "epoch": 0.47, + "grad_norm": 0.5606273835003459, + "learning_rate": 4.759018352770229e-06, + "loss": 0.4876, + "step": 1089 + }, + { + "epoch": 0.47, + "grad_norm": 0.5763982202540356, + "learning_rate": 4.758212847555953e-06, + "loss": 0.476, + "step": 1090 + }, + { + "epoch": 0.47, + "grad_norm": 0.5761656331674372, + "learning_rate": 4.757406066748947e-06, + "loss": 0.4526, + "step": 1091 + }, + { + "epoch": 0.47, + "grad_norm": 0.6337475882937359, + "learning_rate": 4.756598010804935e-06, + "loss": 0.5034, + "step": 1092 + }, + { + "epoch": 0.47, + "grad_norm": 0.598332500270868, + "learning_rate": 4.755788680180363e-06, + "loss": 0.5204, + "step": 1093 + }, + { + "epoch": 0.47, + "grad_norm": 0.5983336967750388, + "learning_rate": 4.754978075332398e-06, + "loss": 0.4992, + "step": 1094 + }, + { + "epoch": 0.47, + "grad_norm": 0.5932701762917508, + "learning_rate": 4.7541661967189225e-06, + "loss": 0.4924, + "step": 1095 + }, + { + "epoch": 0.47, + "grad_norm": 0.571498332890339, + "learning_rate": 4.7533530447985424e-06, + "loss": 0.507, + "step": 1096 + }, + { + "epoch": 0.47, + "grad_norm": 0.6003239552991582, + "learning_rate": 4.752538620030581e-06, + "loss": 0.4797, + "step": 1097 + }, + { + "epoch": 0.47, + "grad_norm": 0.5906168083325332, + "learning_rate": 4.7517229228750804e-06, + "loss": 0.5193, + "step": 1098 + }, + { + "epoch": 0.47, + "grad_norm": 0.5931501865913923, + "learning_rate": 4.750905953792803e-06, + "loss": 0.4812, + "step": 1099 + }, + { + "epoch": 0.47, + "grad_norm": 0.6290969275442341, + "learning_rate": 4.750087713245227e-06, + "loss": 0.5057, + "step": 1100 + }, + { + "epoch": 0.47, + "grad_norm": 0.5604034491261822, + "learning_rate": 4.749268201694553e-06, + "loss": 0.4678, + "step": 1101 + }, + { + "epoch": 0.47, + "grad_norm": 0.6014101011761356, + "learning_rate": 4.748447419603696e-06, + "loss": 0.4999, + "step": 1102 + }, + { + "epoch": 0.47, + "grad_norm": 0.5926144054570872, + "learning_rate": 4.747625367436288e-06, + "loss": 0.4803, + "step": 1103 + }, + { + "epoch": 0.47, + "grad_norm": 0.5820804611550561, + "learning_rate": 4.746802045656683e-06, + "loss": 0.5047, + "step": 1104 + }, + { + "epoch": 0.47, + "grad_norm": 0.6087772177843124, + "learning_rate": 4.745977454729947e-06, + "loss": 0.5042, + "step": 1105 + }, + { + "epoch": 0.47, + "grad_norm": 0.594424789965164, + "learning_rate": 4.7451515951218675e-06, + "loss": 0.5102, + "step": 1106 + }, + { + "epoch": 0.47, + "grad_norm": 0.5731428331636981, + "learning_rate": 4.744324467298944e-06, + "loss": 0.5191, + "step": 1107 + }, + { + "epoch": 0.47, + "grad_norm": 0.5657909010178551, + "learning_rate": 4.743496071728396e-06, + "loss": 0.4829, + "step": 1108 + }, + { + "epoch": 0.47, + "grad_norm": 0.5670594056702332, + "learning_rate": 4.7426664088781585e-06, + "loss": 0.4896, + "step": 1109 + }, + { + "epoch": 0.47, + "grad_norm": 0.6028975438711562, + "learning_rate": 4.74183547921688e-06, + "loss": 0.4996, + "step": 1110 + }, + { + "epoch": 0.47, + "grad_norm": 0.6072539145327235, + "learning_rate": 4.741003283213928e-06, + "loss": 0.5041, + "step": 1111 + }, + { + "epoch": 0.48, + "grad_norm": 0.5867589851114696, + "learning_rate": 4.740169821339381e-06, + "loss": 0.4992, + "step": 1112 + }, + { + "epoch": 0.48, + "grad_norm": 0.586826402405483, + "learning_rate": 4.739335094064038e-06, + "loss": 0.4616, + "step": 1113 + }, + { + "epoch": 0.48, + "grad_norm": 0.5943697459318228, + "learning_rate": 4.738499101859409e-06, + "loss": 0.4942, + "step": 1114 + }, + { + "epoch": 0.48, + "grad_norm": 0.6118584069615158, + "learning_rate": 4.7376618451977195e-06, + "loss": 0.4826, + "step": 1115 + }, + { + "epoch": 0.48, + "grad_norm": 0.5877105797297991, + "learning_rate": 4.736823324551909e-06, + "loss": 0.4845, + "step": 1116 + }, + { + "epoch": 0.48, + "grad_norm": 0.6099949628606918, + "learning_rate": 4.735983540395631e-06, + "loss": 0.4982, + "step": 1117 + }, + { + "epoch": 0.48, + "grad_norm": 0.5834134958716418, + "learning_rate": 4.735142493203253e-06, + "loss": 0.4944, + "step": 1118 + }, + { + "epoch": 0.48, + "grad_norm": 0.5566576275979142, + "learning_rate": 4.734300183449856e-06, + "loss": 0.5016, + "step": 1119 + }, + { + "epoch": 0.48, + "grad_norm": 0.6126927082777508, + "learning_rate": 4.733456611611233e-06, + "loss": 0.5104, + "step": 1120 + }, + { + "epoch": 0.48, + "grad_norm": 0.5844938513436758, + "learning_rate": 4.732611778163894e-06, + "loss": 0.5373, + "step": 1121 + }, + { + "epoch": 0.48, + "grad_norm": 0.5486863613853675, + "learning_rate": 4.7317656835850544e-06, + "loss": 0.5098, + "step": 1122 + }, + { + "epoch": 0.48, + "grad_norm": 0.6075051942047663, + "learning_rate": 4.73091832835265e-06, + "loss": 0.4901, + "step": 1123 + }, + { + "epoch": 0.48, + "grad_norm": 0.5805346057610045, + "learning_rate": 4.730069712945322e-06, + "loss": 0.511, + "step": 1124 + }, + { + "epoch": 0.48, + "grad_norm": 0.5775795364832905, + "learning_rate": 4.729219837842427e-06, + "loss": 0.4916, + "step": 1125 + }, + { + "epoch": 0.48, + "grad_norm": 0.5931365026299273, + "learning_rate": 4.728368703524034e-06, + "loss": 0.4897, + "step": 1126 + }, + { + "epoch": 0.48, + "grad_norm": 0.5538844072984839, + "learning_rate": 4.72751631047092e-06, + "loss": 0.4937, + "step": 1127 + }, + { + "epoch": 0.48, + "grad_norm": 0.585745992804038, + "learning_rate": 4.726662659164576e-06, + "loss": 0.4791, + "step": 1128 + }, + { + "epoch": 0.48, + "grad_norm": 0.58228568438699, + "learning_rate": 4.725807750087201e-06, + "loss": 0.4897, + "step": 1129 + }, + { + "epoch": 0.48, + "grad_norm": 0.623272001177232, + "learning_rate": 4.7249515837217075e-06, + "loss": 0.4713, + "step": 1130 + }, + { + "epoch": 0.48, + "grad_norm": 0.5882528036227618, + "learning_rate": 4.724094160551716e-06, + "loss": 0.4888, + "step": 1131 + }, + { + "epoch": 0.48, + "grad_norm": 0.5665571281823603, + "learning_rate": 4.7232354810615575e-06, + "loss": 0.4634, + "step": 1132 + }, + { + "epoch": 0.48, + "grad_norm": 0.5547983753857056, + "learning_rate": 4.722375545736273e-06, + "loss": 0.4908, + "step": 1133 + }, + { + "epoch": 0.48, + "grad_norm": 0.5637582498617196, + "learning_rate": 4.7215143550616124e-06, + "loss": 0.4912, + "step": 1134 + }, + { + "epoch": 0.48, + "grad_norm": 0.5691678446202798, + "learning_rate": 4.720651909524037e-06, + "loss": 0.5088, + "step": 1135 + }, + { + "epoch": 0.49, + "grad_norm": 0.6037047316013417, + "learning_rate": 4.719788209610711e-06, + "loss": 0.5063, + "step": 1136 + }, + { + "epoch": 0.49, + "eval_loss": 0.49330827593803406, + "eval_runtime": 6918.0742, + "eval_samples_per_second": 41.975, + "eval_steps_per_second": 2.099, + "step": 1136 + }, + { + "epoch": 0.49, + "grad_norm": 0.7053964822007601, + "learning_rate": 4.718923255809514e-06, + "loss": 0.4887, + "step": 1137 + }, + { + "epoch": 0.49, + "grad_norm": 0.6112035128658351, + "learning_rate": 4.71805704860903e-06, + "loss": 0.515, + "step": 1138 + }, + { + "epoch": 0.49, + "grad_norm": 0.6144098315190231, + "learning_rate": 4.717189588498552e-06, + "loss": 0.4933, + "step": 1139 + }, + { + "epoch": 0.49, + "grad_norm": 0.5899433076087655, + "learning_rate": 4.716320875968081e-06, + "loss": 0.5082, + "step": 1140 + }, + { + "epoch": 0.49, + "grad_norm": 0.5827664203359575, + "learning_rate": 4.715450911508324e-06, + "loss": 0.4962, + "step": 1141 + }, + { + "epoch": 0.49, + "grad_norm": 0.559390267287684, + "learning_rate": 4.714579695610698e-06, + "loss": 0.496, + "step": 1142 + }, + { + "epoch": 0.49, + "grad_norm": 0.5970166193989745, + "learning_rate": 4.7137072287673244e-06, + "loss": 0.4963, + "step": 1143 + }, + { + "epoch": 0.49, + "grad_norm": 0.5862371291827102, + "learning_rate": 4.712833511471032e-06, + "loss": 0.5107, + "step": 1144 + }, + { + "epoch": 0.49, + "grad_norm": 0.5621562211030774, + "learning_rate": 4.711958544215355e-06, + "loss": 0.4848, + "step": 1145 + }, + { + "epoch": 0.49, + "grad_norm": 0.5957285834779159, + "learning_rate": 4.711082327494536e-06, + "loss": 0.5068, + "step": 1146 + }, + { + "epoch": 0.49, + "grad_norm": 0.6068577769642044, + "learning_rate": 4.710204861803522e-06, + "loss": 0.4806, + "step": 1147 + }, + { + "epoch": 0.49, + "grad_norm": 0.5871604299043601, + "learning_rate": 4.709326147637965e-06, + "loss": 0.5263, + "step": 1148 + }, + { + "epoch": 0.49, + "grad_norm": 0.6470536987852363, + "learning_rate": 4.708446185494222e-06, + "loss": 0.4911, + "step": 1149 + }, + { + "epoch": 0.49, + "grad_norm": 0.581159959235341, + "learning_rate": 4.707564975869357e-06, + "loss": 0.5169, + "step": 1150 + }, + { + "epoch": 0.49, + "grad_norm": 0.5891669395141927, + "learning_rate": 4.706682519261137e-06, + "loss": 0.4872, + "step": 1151 + }, + { + "epoch": 0.49, + "grad_norm": 0.6080392901510567, + "learning_rate": 4.7057988161680325e-06, + "loss": 0.5072, + "step": 1152 + }, + { + "epoch": 0.49, + "grad_norm": 0.6173464560574663, + "learning_rate": 4.704913867089221e-06, + "loss": 0.4857, + "step": 1153 + }, + { + "epoch": 0.49, + "grad_norm": 0.6353962573655269, + "learning_rate": 4.704027672524582e-06, + "loss": 0.5113, + "step": 1154 + }, + { + "epoch": 0.49, + "grad_norm": 0.6161269437993665, + "learning_rate": 4.703140232974697e-06, + "loss": 0.4904, + "step": 1155 + }, + { + "epoch": 0.49, + "grad_norm": 0.6096752081924692, + "learning_rate": 4.7022515489408536e-06, + "loss": 0.4925, + "step": 1156 + }, + { + "epoch": 0.49, + "grad_norm": 0.669481066717579, + "learning_rate": 4.701361620925041e-06, + "loss": 0.4996, + "step": 1157 + }, + { + "epoch": 0.49, + "grad_norm": 0.6192786083709042, + "learning_rate": 4.700470449429952e-06, + "loss": 0.5126, + "step": 1158 + }, + { + "epoch": 0.5, + "grad_norm": 0.643144004386803, + "learning_rate": 4.699578034958981e-06, + "loss": 0.4969, + "step": 1159 + }, + { + "epoch": 0.5, + "grad_norm": 0.5846720216828944, + "learning_rate": 4.698684378016223e-06, + "loss": 0.4776, + "step": 1160 + }, + { + "epoch": 0.5, + "grad_norm": 0.5741587597255912, + "learning_rate": 4.697789479106479e-06, + "loss": 0.4923, + "step": 1161 + }, + { + "epoch": 0.5, + "grad_norm": 0.5636128681731576, + "learning_rate": 4.696893338735246e-06, + "loss": 0.4946, + "step": 1162 + }, + { + "epoch": 0.5, + "grad_norm": 0.5616937977408087, + "learning_rate": 4.6959959574087265e-06, + "loss": 0.4956, + "step": 1163 + }, + { + "epoch": 0.5, + "grad_norm": 0.5752353921016059, + "learning_rate": 4.695097335633823e-06, + "loss": 0.4951, + "step": 1164 + }, + { + "epoch": 0.5, + "grad_norm": 0.5975316140588034, + "learning_rate": 4.694197473918139e-06, + "loss": 0.4702, + "step": 1165 + }, + { + "epoch": 0.5, + "grad_norm": 0.5712072339658532, + "learning_rate": 4.693296372769978e-06, + "loss": 0.5235, + "step": 1166 + }, + { + "epoch": 0.5, + "grad_norm": 0.5634240355738215, + "learning_rate": 4.692394032698341e-06, + "loss": 0.4926, + "step": 1167 + }, + { + "epoch": 0.5, + "grad_norm": 0.5804932429546033, + "learning_rate": 4.691490454212933e-06, + "loss": 0.4799, + "step": 1168 + }, + { + "epoch": 0.5, + "grad_norm": 0.553753907689284, + "learning_rate": 4.690585637824158e-06, + "loss": 0.4689, + "step": 1169 + }, + { + "epoch": 0.5, + "grad_norm": 0.5690679915211406, + "learning_rate": 4.6896795840431155e-06, + "loss": 0.4839, + "step": 1170 + }, + { + "epoch": 0.5, + "grad_norm": 0.6053719083807559, + "learning_rate": 4.688772293381608e-06, + "loss": 0.5041, + "step": 1171 + }, + { + "epoch": 0.5, + "grad_norm": 0.5622187791991081, + "learning_rate": 4.687863766352134e-06, + "loss": 0.5178, + "step": 1172 + }, + { + "epoch": 0.5, + "grad_norm": 0.5706828038584673, + "learning_rate": 4.686954003467894e-06, + "loss": 0.5148, + "step": 1173 + }, + { + "epoch": 0.5, + "grad_norm": 0.5814193933038267, + "learning_rate": 4.686043005242781e-06, + "loss": 0.4925, + "step": 1174 + }, + { + "epoch": 0.5, + "grad_norm": 0.5841581837583776, + "learning_rate": 4.685130772191392e-06, + "loss": 0.5043, + "step": 1175 + }, + { + "epoch": 0.5, + "grad_norm": 0.5432146572491381, + "learning_rate": 4.684217304829017e-06, + "loss": 0.497, + "step": 1176 + }, + { + "epoch": 0.5, + "grad_norm": 0.6238281748493371, + "learning_rate": 4.683302603671644e-06, + "loss": 0.5117, + "step": 1177 + }, + { + "epoch": 0.5, + "grad_norm": 0.6120786023963148, + "learning_rate": 4.682386669235959e-06, + "loss": 0.5043, + "step": 1178 + }, + { + "epoch": 0.5, + "grad_norm": 0.6097201971727148, + "learning_rate": 4.681469502039345e-06, + "loss": 0.4781, + "step": 1179 + }, + { + "epoch": 0.5, + "grad_norm": 0.6099227512690751, + "learning_rate": 4.680551102599881e-06, + "loss": 0.484, + "step": 1180 + }, + { + "epoch": 0.5, + "grad_norm": 0.6118633618285224, + "learning_rate": 4.67963147143634e-06, + "loss": 0.4973, + "step": 1181 + }, + { + "epoch": 0.5, + "grad_norm": 0.5963607635102951, + "learning_rate": 4.678710609068193e-06, + "loss": 0.5189, + "step": 1182 + }, + { + "epoch": 0.51, + "grad_norm": 0.6095853439513462, + "learning_rate": 4.677788516015608e-06, + "loss": 0.4954, + "step": 1183 + }, + { + "epoch": 0.51, + "grad_norm": 0.5621028815997745, + "learning_rate": 4.676865192799443e-06, + "loss": 0.4872, + "step": 1184 + }, + { + "epoch": 0.51, + "grad_norm": 0.5945331480044012, + "learning_rate": 4.675940639941256e-06, + "loss": 0.5155, + "step": 1185 + }, + { + "epoch": 0.51, + "grad_norm": 0.5753731126662963, + "learning_rate": 4.675014857963297e-06, + "loss": 0.4699, + "step": 1186 + }, + { + "epoch": 0.51, + "grad_norm": 0.5747798420504858, + "learning_rate": 4.674087847388511e-06, + "loss": 0.4952, + "step": 1187 + }, + { + "epoch": 0.51, + "grad_norm": 0.5749508507302784, + "learning_rate": 4.673159608740536e-06, + "loss": 0.4896, + "step": 1188 + }, + { + "epoch": 0.51, + "grad_norm": 0.5809285237560987, + "learning_rate": 4.6722301425437056e-06, + "loss": 0.4659, + "step": 1189 + }, + { + "epoch": 0.51, + "grad_norm": 0.6013752084991827, + "learning_rate": 4.671299449323045e-06, + "loss": 0.5326, + "step": 1190 + }, + { + "epoch": 0.51, + "grad_norm": 0.6050011396855485, + "learning_rate": 4.670367529604274e-06, + "loss": 0.5158, + "step": 1191 + }, + { + "epoch": 0.51, + "grad_norm": 0.5740809770574845, + "learning_rate": 4.669434383913803e-06, + "loss": 0.4945, + "step": 1192 + }, + { + "epoch": 0.51, + "grad_norm": 0.6493339789648531, + "learning_rate": 4.668500012778738e-06, + "loss": 0.5303, + "step": 1193 + }, + { + "epoch": 0.51, + "grad_norm": 0.5707017426293574, + "learning_rate": 4.667564416726875e-06, + "loss": 0.5118, + "step": 1194 + }, + { + "epoch": 0.51, + "grad_norm": 0.5849224346254286, + "learning_rate": 4.666627596286702e-06, + "loss": 0.489, + "step": 1195 + }, + { + "epoch": 0.51, + "grad_norm": 0.5747420247370756, + "learning_rate": 4.6656895519874e-06, + "loss": 0.4815, + "step": 1196 + }, + { + "epoch": 0.51, + "grad_norm": 0.5840838885906293, + "learning_rate": 4.664750284358841e-06, + "loss": 0.5038, + "step": 1197 + }, + { + "epoch": 0.51, + "grad_norm": 0.5865365275558818, + "learning_rate": 4.663809793931585e-06, + "loss": 0.481, + "step": 1198 + }, + { + "epoch": 0.51, + "grad_norm": 0.6430958410357724, + "learning_rate": 4.662868081236887e-06, + "loss": 0.4874, + "step": 1199 + }, + { + "epoch": 0.51, + "grad_norm": 0.5694523106768161, + "learning_rate": 4.66192514680669e-06, + "loss": 0.4681, + "step": 1200 + }, + { + "epoch": 0.51, + "grad_norm": 0.6361578693239596, + "learning_rate": 4.660980991173628e-06, + "loss": 0.5068, + "step": 1201 + }, + { + "epoch": 0.51, + "grad_norm": 0.6082855757365931, + "learning_rate": 4.660035614871024e-06, + "loss": 0.4994, + "step": 1202 + }, + { + "epoch": 0.51, + "grad_norm": 0.5711249982493317, + "learning_rate": 4.659089018432893e-06, + "loss": 0.4821, + "step": 1203 + }, + { + "epoch": 0.51, + "grad_norm": 0.6147166083946165, + "learning_rate": 4.658141202393935e-06, + "loss": 0.5056, + "step": 1204 + }, + { + "epoch": 0.51, + "grad_norm": 0.5988711912518265, + "learning_rate": 4.657192167289542e-06, + "loss": 0.4711, + "step": 1205 + }, + { + "epoch": 0.52, + "grad_norm": 0.5741231990207563, + "learning_rate": 4.6562419136557935e-06, + "loss": 0.5082, + "step": 1206 + }, + { + "epoch": 0.52, + "grad_norm": 0.6046056425511495, + "learning_rate": 4.655290442029459e-06, + "loss": 0.4681, + "step": 1207 + }, + { + "epoch": 0.52, + "eval_loss": 0.4913150668144226, + "eval_runtime": 6919.579, + "eval_samples_per_second": 41.966, + "eval_steps_per_second": 2.098, + "step": 1207 + }, + { + "epoch": 0.52, + "grad_norm": 0.5744167724479108, + "learning_rate": 4.654337752947992e-06, + "loss": 0.4737, + "step": 1208 + }, + { + "epoch": 0.52, + "grad_norm": 0.5907262180093524, + "learning_rate": 4.653383846949539e-06, + "loss": 0.4771, + "step": 1209 + }, + { + "epoch": 0.52, + "grad_norm": 0.6398591796202622, + "learning_rate": 4.652428724572929e-06, + "loss": 0.5307, + "step": 1210 + }, + { + "epoch": 0.52, + "grad_norm": 0.6121432006011495, + "learning_rate": 4.6514723863576815e-06, + "loss": 0.4952, + "step": 1211 + }, + { + "epoch": 0.52, + "grad_norm": 0.6039842346032075, + "learning_rate": 4.650514832844002e-06, + "loss": 0.5106, + "step": 1212 + }, + { + "epoch": 0.52, + "grad_norm": 0.5945723569093062, + "learning_rate": 4.649556064572781e-06, + "loss": 0.5187, + "step": 1213 + }, + { + "epoch": 0.52, + "grad_norm": 0.6127059471922567, + "learning_rate": 4.648596082085597e-06, + "loss": 0.5023, + "step": 1214 + }, + { + "epoch": 0.52, + "grad_norm": 0.6148342317689859, + "learning_rate": 4.647634885924713e-06, + "loss": 0.5041, + "step": 1215 + }, + { + "epoch": 0.52, + "grad_norm": 0.6171661212182703, + "learning_rate": 4.64667247663308e-06, + "loss": 0.4656, + "step": 1216 + }, + { + "epoch": 0.52, + "grad_norm": 0.5811162956002015, + "learning_rate": 4.645708854754329e-06, + "loss": 0.4793, + "step": 1217 + }, + { + "epoch": 0.52, + "grad_norm": 0.5897461617987049, + "learning_rate": 4.644744020832782e-06, + "loss": 0.511, + "step": 1218 + }, + { + "epoch": 0.52, + "grad_norm": 0.6360033916365583, + "learning_rate": 4.6437779754134424e-06, + "loss": 0.4959, + "step": 1219 + }, + { + "epoch": 0.52, + "grad_norm": 0.5920841965277126, + "learning_rate": 4.642810719041999e-06, + "loss": 0.5078, + "step": 1220 + }, + { + "epoch": 0.52, + "grad_norm": 0.5764590932419221, + "learning_rate": 4.641842252264824e-06, + "loss": 0.4816, + "step": 1221 + }, + { + "epoch": 0.52, + "grad_norm": 0.606347475674508, + "learning_rate": 4.640872575628973e-06, + "loss": 0.5039, + "step": 1222 + }, + { + "epoch": 0.52, + "grad_norm": 0.5601452063877889, + "learning_rate": 4.639901689682186e-06, + "loss": 0.483, + "step": 1223 + }, + { + "epoch": 0.52, + "grad_norm": 0.60107394353492, + "learning_rate": 4.638929594972885e-06, + "loss": 0.5017, + "step": 1224 + }, + { + "epoch": 0.52, + "grad_norm": 0.5832229161651298, + "learning_rate": 4.637956292050176e-06, + "loss": 0.4881, + "step": 1225 + }, + { + "epoch": 0.52, + "grad_norm": 0.6230227342266171, + "learning_rate": 4.636981781463848e-06, + "loss": 0.4771, + "step": 1226 + }, + { + "epoch": 0.52, + "grad_norm": 0.6267479062825622, + "learning_rate": 4.636006063764369e-06, + "loss": 0.4898, + "step": 1227 + }, + { + "epoch": 0.52, + "grad_norm": 0.6267200657967705, + "learning_rate": 4.635029139502892e-06, + "loss": 0.4901, + "step": 1228 + }, + { + "epoch": 0.53, + "grad_norm": 0.6484932594616627, + "learning_rate": 4.634051009231251e-06, + "loss": 0.5206, + "step": 1229 + }, + { + "epoch": 0.53, + "grad_norm": 0.5931663500547755, + "learning_rate": 4.63307167350196e-06, + "loss": 0.4702, + "step": 1230 + }, + { + "epoch": 0.53, + "grad_norm": 0.5860181057294145, + "learning_rate": 4.632091132868214e-06, + "loss": 0.506, + "step": 1231 + }, + { + "epoch": 0.53, + "grad_norm": 0.5931231715432055, + "learning_rate": 4.631109387883891e-06, + "loss": 0.5044, + "step": 1232 + }, + { + "epoch": 0.53, + "grad_norm": 0.5821409581725687, + "learning_rate": 4.630126439103546e-06, + "loss": 0.4844, + "step": 1233 + }, + { + "epoch": 0.53, + "grad_norm": 0.549630228044515, + "learning_rate": 4.629142287082416e-06, + "loss": 0.4804, + "step": 1234 + }, + { + "epoch": 0.53, + "grad_norm": 0.5406774405919889, + "learning_rate": 4.628156932376419e-06, + "loss": 0.4937, + "step": 1235 + }, + { + "epoch": 0.53, + "grad_norm": 0.5446742036382618, + "learning_rate": 4.627170375542147e-06, + "loss": 0.4896, + "step": 1236 + }, + { + "epoch": 0.53, + "grad_norm": 0.5801553090659803, + "learning_rate": 4.626182617136877e-06, + "loss": 0.5097, + "step": 1237 + }, + { + "epoch": 0.53, + "grad_norm": 0.547761487753955, + "learning_rate": 4.625193657718563e-06, + "loss": 0.486, + "step": 1238 + }, + { + "epoch": 0.53, + "grad_norm": 0.5718577243267876, + "learning_rate": 4.624203497845835e-06, + "loss": 0.4951, + "step": 1239 + }, + { + "epoch": 0.53, + "grad_norm": 0.5960930128816431, + "learning_rate": 4.623212138078004e-06, + "loss": 0.5076, + "step": 1240 + }, + { + "epoch": 0.53, + "grad_norm": 0.5721738998469788, + "learning_rate": 4.622219578975057e-06, + "loss": 0.4883, + "step": 1241 + }, + { + "epoch": 0.53, + "grad_norm": 0.5957161415624224, + "learning_rate": 4.62122582109766e-06, + "loss": 0.4832, + "step": 1242 + }, + { + "epoch": 0.53, + "grad_norm": 0.5704351658199357, + "learning_rate": 4.620230865007154e-06, + "loss": 0.5335, + "step": 1243 + }, + { + "epoch": 0.53, + "grad_norm": 0.5613480503706471, + "learning_rate": 4.619234711265558e-06, + "loss": 0.4999, + "step": 1244 + }, + { + "epoch": 0.53, + "grad_norm": 0.5990237352315096, + "learning_rate": 4.61823736043557e-06, + "loss": 0.5194, + "step": 1245 + }, + { + "epoch": 0.53, + "grad_norm": 0.5646085384828234, + "learning_rate": 4.617238813080559e-06, + "loss": 0.5143, + "step": 1246 + }, + { + "epoch": 0.53, + "grad_norm": 0.5868684924014452, + "learning_rate": 4.616239069764574e-06, + "loss": 0.5031, + "step": 1247 + }, + { + "epoch": 0.53, + "grad_norm": 0.6029024571519792, + "learning_rate": 4.615238131052339e-06, + "loss": 0.4964, + "step": 1248 + }, + { + "epoch": 0.53, + "grad_norm": 0.6047661298443577, + "learning_rate": 4.614235997509251e-06, + "loss": 0.515, + "step": 1249 + }, + { + "epoch": 0.53, + "grad_norm": 0.5787855934766027, + "learning_rate": 4.613232669701384e-06, + "loss": 0.5045, + "step": 1250 + }, + { + "epoch": 0.53, + "grad_norm": 0.5628826075983923, + "learning_rate": 4.612228148195486e-06, + "loss": 0.4866, + "step": 1251 + }, + { + "epoch": 0.53, + "grad_norm": 0.5482732141245912, + "learning_rate": 4.61122243355898e-06, + "loss": 0.4854, + "step": 1252 + }, + { + "epoch": 0.54, + "grad_norm": 0.5570796537586561, + "learning_rate": 4.610215526359961e-06, + "loss": 0.4892, + "step": 1253 + }, + { + "epoch": 0.54, + "grad_norm": 0.5726673029878695, + "learning_rate": 4.609207427167201e-06, + "loss": 0.4801, + "step": 1254 + }, + { + "epoch": 0.54, + "grad_norm": 0.6214832894338556, + "learning_rate": 4.60819813655014e-06, + "loss": 0.5322, + "step": 1255 + }, + { + "epoch": 0.54, + "grad_norm": 0.5714531549826166, + "learning_rate": 4.607187655078896e-06, + "loss": 0.4983, + "step": 1256 + }, + { + "epoch": 0.54, + "grad_norm": 0.5854107865928243, + "learning_rate": 4.6061759833242585e-06, + "loss": 0.5097, + "step": 1257 + }, + { + "epoch": 0.54, + "grad_norm": 0.5877571773366539, + "learning_rate": 4.605163121857688e-06, + "loss": 0.4764, + "step": 1258 + }, + { + "epoch": 0.54, + "grad_norm": 0.6161387817508228, + "learning_rate": 4.604149071251318e-06, + "loss": 0.4921, + "step": 1259 + }, + { + "epoch": 0.54, + "grad_norm": 0.5863778207848662, + "learning_rate": 4.603133832077953e-06, + "loss": 0.4652, + "step": 1260 + }, + { + "epoch": 0.54, + "grad_norm": 0.5889518886848664, + "learning_rate": 4.602117404911071e-06, + "loss": 0.4978, + "step": 1261 + }, + { + "epoch": 0.54, + "grad_norm": 0.569076904297205, + "learning_rate": 4.601099790324817e-06, + "loss": 0.5452, + "step": 1262 + }, + { + "epoch": 0.54, + "grad_norm": 0.5823464063147468, + "learning_rate": 4.6000809888940105e-06, + "loss": 0.516, + "step": 1263 + }, + { + "epoch": 0.54, + "grad_norm": 0.5762572387418361, + "learning_rate": 4.59906100119414e-06, + "loss": 0.4883, + "step": 1264 + }, + { + "epoch": 0.54, + "grad_norm": 0.5710353785095638, + "learning_rate": 4.598039827801364e-06, + "loss": 0.5114, + "step": 1265 + }, + { + "epoch": 0.54, + "grad_norm": 0.5484965886436899, + "learning_rate": 4.597017469292511e-06, + "loss": 0.4806, + "step": 1266 + }, + { + "epoch": 0.54, + "grad_norm": 0.5809463207101558, + "learning_rate": 4.5959939262450796e-06, + "loss": 0.4614, + "step": 1267 + }, + { + "epoch": 0.54, + "grad_norm": 0.5845778185226627, + "learning_rate": 4.594969199237235e-06, + "loss": 0.5241, + "step": 1268 + }, + { + "epoch": 0.54, + "grad_norm": 0.5908229060244614, + "learning_rate": 4.593943288847814e-06, + "loss": 0.5007, + "step": 1269 + }, + { + "epoch": 0.54, + "grad_norm": 0.5823679847675488, + "learning_rate": 4.592916195656322e-06, + "loss": 0.4949, + "step": 1270 + }, + { + "epoch": 0.54, + "grad_norm": 0.6773394663731939, + "learning_rate": 4.591887920242929e-06, + "loss": 0.4771, + "step": 1271 + }, + { + "epoch": 0.54, + "grad_norm": 0.5600458296952812, + "learning_rate": 4.590858463188477e-06, + "loss": 0.4636, + "step": 1272 + }, + { + "epoch": 0.54, + "grad_norm": 0.5983807787136752, + "learning_rate": 4.589827825074472e-06, + "loss": 0.4664, + "step": 1273 + }, + { + "epoch": 0.54, + "grad_norm": 0.6098172385279027, + "learning_rate": 4.58879600648309e-06, + "loss": 0.4971, + "step": 1274 + }, + { + "epoch": 0.54, + "grad_norm": 0.6237066298991583, + "learning_rate": 4.587763007997173e-06, + "loss": 0.4906, + "step": 1275 + }, + { + "epoch": 0.55, + "grad_norm": 0.5912938391288759, + "learning_rate": 4.586728830200227e-06, + "loss": 0.5064, + "step": 1276 + }, + { + "epoch": 0.55, + "grad_norm": 0.6381599023642128, + "learning_rate": 4.585693473676428e-06, + "loss": 0.4548, + "step": 1277 + }, + { + "epoch": 0.55, + "grad_norm": 0.5844695919916449, + "learning_rate": 4.584656939010615e-06, + "loss": 0.5001, + "step": 1278 + }, + { + "epoch": 0.55, + "eval_loss": 0.48919913172721863, + "eval_runtime": 6918.8774, + "eval_samples_per_second": 41.971, + "eval_steps_per_second": 2.099, + "step": 1278 + }, + { + "epoch": 0.55, + "grad_norm": 0.586366005962443, + "learning_rate": 4.583619226788294e-06, + "loss": 0.5084, + "step": 1279 + }, + { + "epoch": 0.55, + "grad_norm": 0.5845148658515918, + "learning_rate": 4.582580337595636e-06, + "loss": 0.4912, + "step": 1280 + }, + { + "epoch": 0.55, + "grad_norm": 0.6242513167522349, + "learning_rate": 4.581540272019476e-06, + "loss": 0.519, + "step": 1281 + }, + { + "epoch": 0.55, + "grad_norm": 0.5978675772328417, + "learning_rate": 4.580499030647314e-06, + "loss": 0.505, + "step": 1282 + }, + { + "epoch": 0.55, + "grad_norm": 0.5716562163085582, + "learning_rate": 4.579456614067315e-06, + "loss": 0.4805, + "step": 1283 + }, + { + "epoch": 0.55, + "grad_norm": 0.5887843179010744, + "learning_rate": 4.578413022868305e-06, + "loss": 0.4805, + "step": 1284 + }, + { + "epoch": 0.55, + "grad_norm": 0.5448511038289022, + "learning_rate": 4.577368257639778e-06, + "loss": 0.5194, + "step": 1285 + }, + { + "epoch": 0.55, + "grad_norm": 0.5583269034513156, + "learning_rate": 4.576322318971888e-06, + "loss": 0.4665, + "step": 1286 + }, + { + "epoch": 0.55, + "grad_norm": 0.6175638858305248, + "learning_rate": 4.575275207455451e-06, + "loss": 0.4953, + "step": 1287 + }, + { + "epoch": 0.55, + "grad_norm": 0.5893120538439389, + "learning_rate": 4.5742269236819485e-06, + "loss": 0.5078, + "step": 1288 + }, + { + "epoch": 0.55, + "grad_norm": 0.5759225882638491, + "learning_rate": 4.5731774682435225e-06, + "loss": 0.4967, + "step": 1289 + }, + { + "epoch": 0.55, + "grad_norm": 0.6027419371051689, + "learning_rate": 4.572126841732977e-06, + "loss": 0.4927, + "step": 1290 + }, + { + "epoch": 0.55, + "grad_norm": 0.5713265952976949, + "learning_rate": 4.571075044743778e-06, + "loss": 0.4818, + "step": 1291 + }, + { + "epoch": 0.55, + "grad_norm": 0.6136093959980795, + "learning_rate": 4.570022077870051e-06, + "loss": 0.4928, + "step": 1292 + }, + { + "epoch": 0.55, + "grad_norm": 0.5812274200588445, + "learning_rate": 4.568967941706584e-06, + "loss": 0.461, + "step": 1293 + }, + { + "epoch": 0.55, + "grad_norm": 0.5705816273671851, + "learning_rate": 4.567912636848826e-06, + "loss": 0.4902, + "step": 1294 + }, + { + "epoch": 0.55, + "grad_norm": 0.5683273724782274, + "learning_rate": 4.566856163892884e-06, + "loss": 0.4751, + "step": 1295 + }, + { + "epoch": 0.55, + "grad_norm": 0.5783916742928776, + "learning_rate": 4.565798523435528e-06, + "loss": 0.5014, + "step": 1296 + }, + { + "epoch": 0.55, + "grad_norm": 0.6008921391069879, + "learning_rate": 4.564739716074182e-06, + "loss": 0.4751, + "step": 1297 + }, + { + "epoch": 0.55, + "grad_norm": 0.5977176200664817, + "learning_rate": 4.563679742406935e-06, + "loss": 0.4861, + "step": 1298 + }, + { + "epoch": 0.55, + "grad_norm": 0.5819179757150398, + "learning_rate": 4.562618603032533e-06, + "loss": 0.5001, + "step": 1299 + }, + { + "epoch": 0.56, + "grad_norm": 0.609985142660628, + "learning_rate": 4.561556298550379e-06, + "loss": 0.5011, + "step": 1300 + }, + { + "epoch": 0.56, + "grad_norm": 0.628383300337832, + "learning_rate": 4.560492829560535e-06, + "loss": 0.4917, + "step": 1301 + }, + { + "epoch": 0.56, + "grad_norm": 0.556934869126385, + "learning_rate": 4.559428196663721e-06, + "loss": 0.4939, + "step": 1302 + }, + { + "epoch": 0.56, + "grad_norm": 0.5753689242601436, + "learning_rate": 4.558362400461315e-06, + "loss": 0.4798, + "step": 1303 + }, + { + "epoch": 0.56, + "grad_norm": 0.608958621920625, + "learning_rate": 4.55729544155535e-06, + "loss": 0.5147, + "step": 1304 + }, + { + "epoch": 0.56, + "grad_norm": 0.5559662431460121, + "learning_rate": 4.556227320548519e-06, + "loss": 0.4939, + "step": 1305 + }, + { + "epoch": 0.56, + "grad_norm": 0.5704357834306742, + "learning_rate": 4.555158038044167e-06, + "loss": 0.4716, + "step": 1306 + }, + { + "epoch": 0.56, + "grad_norm": 0.6333282168527011, + "learning_rate": 4.5540875946463e-06, + "loss": 0.465, + "step": 1307 + }, + { + "epoch": 0.56, + "grad_norm": 0.6080793831354635, + "learning_rate": 4.553015990959577e-06, + "loss": 0.4881, + "step": 1308 + }, + { + "epoch": 0.56, + "grad_norm": 0.5706855311890715, + "learning_rate": 4.551943227589314e-06, + "loss": 0.4647, + "step": 1309 + }, + { + "epoch": 0.56, + "grad_norm": 0.5821734570269693, + "learning_rate": 4.550869305141478e-06, + "loss": 0.5317, + "step": 1310 + }, + { + "epoch": 0.56, + "grad_norm": 0.5827695023416871, + "learning_rate": 4.549794224222697e-06, + "loss": 0.5124, + "step": 1311 + }, + { + "epoch": 0.56, + "grad_norm": 0.5511547366384472, + "learning_rate": 4.548717985440247e-06, + "loss": 0.4722, + "step": 1312 + }, + { + "epoch": 0.56, + "grad_norm": 0.6041209279498603, + "learning_rate": 4.547640589402063e-06, + "loss": 0.5015, + "step": 1313 + }, + { + "epoch": 0.56, + "grad_norm": 0.5861854523134222, + "learning_rate": 4.546562036716732e-06, + "loss": 0.4808, + "step": 1314 + }, + { + "epoch": 0.56, + "grad_norm": 0.5860300360146689, + "learning_rate": 4.5454823279934924e-06, + "loss": 0.4905, + "step": 1315 + }, + { + "epoch": 0.56, + "grad_norm": 0.5653121567715198, + "learning_rate": 4.5444014638422396e-06, + "loss": 0.4877, + "step": 1316 + }, + { + "epoch": 0.56, + "grad_norm": 0.5828331265940039, + "learning_rate": 4.543319444873517e-06, + "loss": 0.4966, + "step": 1317 + }, + { + "epoch": 0.56, + "grad_norm": 0.5524931399027349, + "learning_rate": 4.5422362716985255e-06, + "loss": 0.4934, + "step": 1318 + }, + { + "epoch": 0.56, + "grad_norm": 0.5848515428912962, + "learning_rate": 4.541151944929114e-06, + "loss": 0.4874, + "step": 1319 + }, + { + "epoch": 0.56, + "grad_norm": 0.5981291916343784, + "learning_rate": 4.5400664651777835e-06, + "loss": 0.4998, + "step": 1320 + }, + { + "epoch": 0.56, + "grad_norm": 0.5655813478901666, + "learning_rate": 4.538979833057688e-06, + "loss": 0.4519, + "step": 1321 + }, + { + "epoch": 0.56, + "grad_norm": 0.6562430108814516, + "learning_rate": 4.537892049182631e-06, + "loss": 0.5272, + "step": 1322 + }, + { + "epoch": 0.57, + "grad_norm": 0.6017818414342826, + "learning_rate": 4.536803114167067e-06, + "loss": 0.4821, + "step": 1323 + }, + { + "epoch": 0.57, + "grad_norm": 0.558603956646909, + "learning_rate": 4.535713028626101e-06, + "loss": 0.4879, + "step": 1324 + }, + { + "epoch": 0.57, + "grad_norm": 0.5722300913660474, + "learning_rate": 4.534621793175488e-06, + "loss": 0.4873, + "step": 1325 + }, + { + "epoch": 0.57, + "grad_norm": 0.5997686300961845, + "learning_rate": 4.533529408431632e-06, + "loss": 0.5033, + "step": 1326 + }, + { + "epoch": 0.57, + "grad_norm": 0.5829364793352837, + "learning_rate": 4.532435875011586e-06, + "loss": 0.5, + "step": 1327 + }, + { + "epoch": 0.57, + "grad_norm": 0.570165895880547, + "learning_rate": 4.531341193533053e-06, + "loss": 0.4978, + "step": 1328 + }, + { + "epoch": 0.57, + "grad_norm": 0.579729697723111, + "learning_rate": 4.530245364614384e-06, + "loss": 0.4837, + "step": 1329 + }, + { + "epoch": 0.57, + "grad_norm": 0.5809138960078671, + "learning_rate": 4.529148388874577e-06, + "loss": 0.5126, + "step": 1330 + }, + { + "epoch": 0.57, + "grad_norm": 0.5916301742756499, + "learning_rate": 4.528050266933279e-06, + "loss": 0.4984, + "step": 1331 + }, + { + "epoch": 0.57, + "grad_norm": 0.5820120794112843, + "learning_rate": 4.526950999410785e-06, + "loss": 0.4617, + "step": 1332 + }, + { + "epoch": 0.57, + "grad_norm": 0.5602665039817353, + "learning_rate": 4.525850586928036e-06, + "loss": 0.4761, + "step": 1333 + }, + { + "epoch": 0.57, + "grad_norm": 0.569022812704669, + "learning_rate": 4.52474903010662e-06, + "loss": 0.4734, + "step": 1334 + }, + { + "epoch": 0.57, + "grad_norm": 0.5942689466703626, + "learning_rate": 4.523646329568771e-06, + "loss": 0.5025, + "step": 1335 + }, + { + "epoch": 0.57, + "grad_norm": 0.5652312432297659, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4848, + "step": 1336 + }, + { + "epoch": 0.57, + "grad_norm": 0.6230515838830604, + "learning_rate": 4.521437499835942e-06, + "loss": 0.5154, + "step": 1337 + }, + { + "epoch": 0.57, + "grad_norm": 0.5789207150950728, + "learning_rate": 4.52033137188866e-06, + "loss": 0.492, + "step": 1338 + }, + { + "epoch": 0.57, + "grad_norm": 0.5723737075235371, + "learning_rate": 4.519224102720341e-06, + "loss": 0.4898, + "step": 1339 + }, + { + "epoch": 0.57, + "grad_norm": 0.5921182647904031, + "learning_rate": 4.518115692956445e-06, + "loss": 0.4786, + "step": 1340 + }, + { + "epoch": 0.57, + "grad_norm": 0.5661075262133396, + "learning_rate": 4.517006143223078e-06, + "loss": 0.4819, + "step": 1341 + }, + { + "epoch": 0.57, + "grad_norm": 0.5639183149476357, + "learning_rate": 4.515895454146989e-06, + "loss": 0.4899, + "step": 1342 + }, + { + "epoch": 0.57, + "grad_norm": 0.5844775926241916, + "learning_rate": 4.514783626355571e-06, + "loss": 0.4749, + "step": 1343 + }, + { + "epoch": 0.57, + "grad_norm": 0.5554143190741964, + "learning_rate": 4.513670660476861e-06, + "loss": 0.4904, + "step": 1344 + }, + { + "epoch": 0.57, + "grad_norm": 0.5730633115430818, + "learning_rate": 4.512556557139538e-06, + "loss": 0.4745, + "step": 1345 + }, + { + "epoch": 0.58, + "grad_norm": 0.5651701903378715, + "learning_rate": 4.5114413169729224e-06, + "loss": 0.4944, + "step": 1346 + }, + { + "epoch": 0.58, + "grad_norm": 0.5926181506414837, + "learning_rate": 4.51032494060698e-06, + "loss": 0.5002, + "step": 1347 + }, + { + "epoch": 0.58, + "grad_norm": 0.5603459457356793, + "learning_rate": 4.509207428672313e-06, + "loss": 0.4831, + "step": 1348 + }, + { + "epoch": 0.58, + "grad_norm": 0.5864508189913115, + "learning_rate": 4.508088781800172e-06, + "loss": 0.4759, + "step": 1349 + }, + { + "epoch": 0.58, + "eval_loss": 0.4873812198638916, + "eval_runtime": 6917.3872, + "eval_samples_per_second": 41.98, + "eval_steps_per_second": 2.099, + "step": 1349 + }, + { + "epoch": 0.58, + "grad_norm": 0.5924744693021977, + "learning_rate": 4.506969000622443e-06, + "loss": 0.4916, + "step": 1350 + }, + { + "epoch": 0.58, + "grad_norm": 0.7833760299298924, + "learning_rate": 4.5058480857716554e-06, + "loss": 0.4883, + "step": 1351 + }, + { + "epoch": 0.58, + "grad_norm": 0.6032529206566636, + "learning_rate": 4.504726037880978e-06, + "loss": 0.4588, + "step": 1352 + }, + { + "epoch": 0.58, + "grad_norm": 0.5651861924712768, + "learning_rate": 4.5036028575842215e-06, + "loss": 0.4935, + "step": 1353 + }, + { + "epoch": 0.58, + "grad_norm": 0.5811213046043772, + "learning_rate": 4.502478545515833e-06, + "loss": 0.4927, + "step": 1354 + }, + { + "epoch": 0.58, + "grad_norm": 0.5602131651168747, + "learning_rate": 4.501353102310901e-06, + "loss": 0.5045, + "step": 1355 + }, + { + "epoch": 0.58, + "grad_norm": 0.5609065838079279, + "learning_rate": 4.500226528605154e-06, + "loss": 0.5024, + "step": 1356 + }, + { + "epoch": 0.58, + "grad_norm": 0.5926162131260848, + "learning_rate": 4.499098825034956e-06, + "loss": 0.5022, + "step": 1357 + }, + { + "epoch": 0.58, + "grad_norm": 0.581609311231316, + "learning_rate": 4.497969992237312e-06, + "loss": 0.5069, + "step": 1358 + }, + { + "epoch": 0.58, + "grad_norm": 0.5734162550066819, + "learning_rate": 4.496840030849864e-06, + "loss": 0.4718, + "step": 1359 + }, + { + "epoch": 0.58, + "grad_norm": 0.5832809240820751, + "learning_rate": 4.49570894151089e-06, + "loss": 0.4884, + "step": 1360 + }, + { + "epoch": 0.58, + "grad_norm": 0.5845131849772652, + "learning_rate": 4.494576724859307e-06, + "loss": 0.5017, + "step": 1361 + }, + { + "epoch": 0.58, + "grad_norm": 0.5917877188606556, + "learning_rate": 4.49344338153467e-06, + "loss": 0.5087, + "step": 1362 + }, + { + "epoch": 0.58, + "grad_norm": 0.5336678295406534, + "learning_rate": 4.492308912177166e-06, + "loss": 0.5018, + "step": 1363 + }, + { + "epoch": 0.58, + "grad_norm": 0.5351847391697913, + "learning_rate": 4.491173317427622e-06, + "loss": 0.4747, + "step": 1364 + }, + { + "epoch": 0.58, + "grad_norm": 0.598617114856617, + "learning_rate": 4.490036597927499e-06, + "loss": 0.4894, + "step": 1365 + }, + { + "epoch": 0.58, + "grad_norm": 0.5814612225515823, + "learning_rate": 4.488898754318894e-06, + "loss": 0.5018, + "step": 1366 + }, + { + "epoch": 0.58, + "grad_norm": 0.5887096610888417, + "learning_rate": 4.48775978724454e-06, + "loss": 0.4913, + "step": 1367 + }, + { + "epoch": 0.58, + "grad_norm": 0.5831429888356454, + "learning_rate": 4.4866196973478e-06, + "loss": 0.4789, + "step": 1368 + }, + { + "epoch": 0.58, + "grad_norm": 0.5920011899513401, + "learning_rate": 4.485478485272678e-06, + "loss": 0.4933, + "step": 1369 + }, + { + "epoch": 0.59, + "grad_norm": 0.5646646883743509, + "learning_rate": 4.484336151663807e-06, + "loss": 0.4657, + "step": 1370 + }, + { + "epoch": 0.59, + "grad_norm": 0.5498567442056715, + "learning_rate": 4.483192697166455e-06, + "loss": 0.4762, + "step": 1371 + }, + { + "epoch": 0.59, + "grad_norm": 0.5802015082280455, + "learning_rate": 4.482048122426523e-06, + "loss": 0.498, + "step": 1372 + }, + { + "epoch": 0.59, + "grad_norm": 0.5960641848798551, + "learning_rate": 4.480902428090546e-06, + "loss": 0.4868, + "step": 1373 + }, + { + "epoch": 0.59, + "grad_norm": 0.5951553770712328, + "learning_rate": 4.4797556148056884e-06, + "loss": 0.4796, + "step": 1374 + }, + { + "epoch": 0.59, + "grad_norm": 0.5704287014934469, + "learning_rate": 4.47860768321975e-06, + "loss": 0.519, + "step": 1375 + }, + { + "epoch": 0.59, + "grad_norm": 0.5911943453713712, + "learning_rate": 4.477458633981161e-06, + "loss": 0.4969, + "step": 1376 + }, + { + "epoch": 0.59, + "grad_norm": 0.5706191668874873, + "learning_rate": 4.476308467738982e-06, + "loss": 0.4754, + "step": 1377 + }, + { + "epoch": 0.59, + "grad_norm": 0.5941395696773826, + "learning_rate": 4.4751571851429054e-06, + "loss": 0.4993, + "step": 1378 + }, + { + "epoch": 0.59, + "grad_norm": 0.5947043585408122, + "learning_rate": 4.474004786843256e-06, + "loss": 0.5004, + "step": 1379 + }, + { + "epoch": 0.59, + "grad_norm": 0.5573964245620489, + "learning_rate": 4.472851273490985e-06, + "loss": 0.4914, + "step": 1380 + }, + { + "epoch": 0.59, + "grad_norm": 0.5600880890049603, + "learning_rate": 4.471696645737675e-06, + "loss": 0.4713, + "step": 1381 + }, + { + "epoch": 0.59, + "grad_norm": 0.6082765235407795, + "learning_rate": 4.470540904235541e-06, + "loss": 0.4619, + "step": 1382 + }, + { + "epoch": 0.59, + "grad_norm": 0.579769492076148, + "learning_rate": 4.469384049637423e-06, + "loss": 0.5104, + "step": 1383 + }, + { + "epoch": 0.59, + "grad_norm": 0.5594418549752033, + "learning_rate": 4.468226082596792e-06, + "loss": 0.4688, + "step": 1384 + }, + { + "epoch": 0.59, + "grad_norm": 0.6322690165548234, + "learning_rate": 4.467067003767745e-06, + "loss": 0.4867, + "step": 1385 + }, + { + "epoch": 0.59, + "grad_norm": 0.5935187981482064, + "learning_rate": 4.465906813805012e-06, + "loss": 0.5181, + "step": 1386 + }, + { + "epoch": 0.59, + "grad_norm": 0.5944767421902712, + "learning_rate": 4.464745513363945e-06, + "loss": 0.4891, + "step": 1387 + }, + { + "epoch": 0.59, + "grad_norm": 0.5765100142428179, + "learning_rate": 4.463583103100527e-06, + "loss": 0.4829, + "step": 1388 + }, + { + "epoch": 0.59, + "grad_norm": 0.6169183106704456, + "learning_rate": 4.462419583671366e-06, + "loss": 0.4845, + "step": 1389 + }, + { + "epoch": 0.59, + "grad_norm": 0.5803513203340209, + "learning_rate": 4.4612549557336975e-06, + "loss": 0.4658, + "step": 1390 + }, + { + "epoch": 0.59, + "grad_norm": 0.562253781436203, + "learning_rate": 4.460089219945383e-06, + "loss": 0.4771, + "step": 1391 + }, + { + "epoch": 0.59, + "grad_norm": 0.5584896620934483, + "learning_rate": 4.458922376964909e-06, + "loss": 0.452, + "step": 1392 + }, + { + "epoch": 0.6, + "grad_norm": 0.5894231534862258, + "learning_rate": 4.457754427451389e-06, + "loss": 0.4713, + "step": 1393 + }, + { + "epoch": 0.6, + "grad_norm": 0.5847332266508826, + "learning_rate": 4.456585372064559e-06, + "loss": 0.4912, + "step": 1394 + }, + { + "epoch": 0.6, + "grad_norm": 0.5590333023061157, + "learning_rate": 4.455415211464783e-06, + "loss": 0.4849, + "step": 1395 + }, + { + "epoch": 0.6, + "grad_norm": 0.5390426978695757, + "learning_rate": 4.454243946313047e-06, + "loss": 0.5039, + "step": 1396 + }, + { + "epoch": 0.6, + "grad_norm": 0.5719225582437959, + "learning_rate": 4.453071577270961e-06, + "loss": 0.4779, + "step": 1397 + }, + { + "epoch": 0.6, + "grad_norm": 0.5850243105245156, + "learning_rate": 4.451898105000759e-06, + "loss": 0.4898, + "step": 1398 + }, + { + "epoch": 0.6, + "grad_norm": 0.5324711572140098, + "learning_rate": 4.450723530165299e-06, + "loss": 0.4757, + "step": 1399 + }, + { + "epoch": 0.6, + "grad_norm": 0.5560225385184383, + "learning_rate": 4.449547853428061e-06, + "loss": 0.4883, + "step": 1400 + }, + { + "epoch": 0.6, + "grad_norm": 0.5485424264260553, + "learning_rate": 4.448371075453147e-06, + "loss": 0.4701, + "step": 1401 + }, + { + "epoch": 0.6, + "grad_norm": 0.5667730723100881, + "learning_rate": 4.4471931969052816e-06, + "loss": 0.492, + "step": 1402 + }, + { + "epoch": 0.6, + "grad_norm": 0.6246169641223871, + "learning_rate": 4.446014218449811e-06, + "loss": 0.4884, + "step": 1403 + }, + { + "epoch": 0.6, + "grad_norm": 0.5799291856487485, + "learning_rate": 4.444834140752702e-06, + "loss": 0.4933, + "step": 1404 + }, + { + "epoch": 0.6, + "grad_norm": 0.5851580198155035, + "learning_rate": 4.443652964480544e-06, + "loss": 0.4821, + "step": 1405 + }, + { + "epoch": 0.6, + "grad_norm": 0.6056180941381275, + "learning_rate": 4.442470690300546e-06, + "loss": 0.4947, + "step": 1406 + }, + { + "epoch": 0.6, + "grad_norm": 0.5758464475466127, + "learning_rate": 4.441287318880537e-06, + "loss": 0.5095, + "step": 1407 + }, + { + "epoch": 0.6, + "grad_norm": 0.6109440245331582, + "learning_rate": 4.4401028508889645e-06, + "loss": 0.4702, + "step": 1408 + }, + { + "epoch": 0.6, + "grad_norm": 0.5763086784688934, + "learning_rate": 4.4389172869949e-06, + "loss": 0.4869, + "step": 1409 + }, + { + "epoch": 0.6, + "grad_norm": 0.6069906437238171, + "learning_rate": 4.437730627868028e-06, + "loss": 0.4947, + "step": 1410 + }, + { + "epoch": 0.6, + "grad_norm": 0.5927086729110405, + "learning_rate": 4.4365428741786554e-06, + "loss": 0.4606, + "step": 1411 + }, + { + "epoch": 0.6, + "grad_norm": 0.6090925352724758, + "learning_rate": 4.435354026597707e-06, + "loss": 0.4914, + "step": 1412 + }, + { + "epoch": 0.6, + "grad_norm": 0.5746470686497894, + "learning_rate": 4.434164085796724e-06, + "loss": 0.4891, + "step": 1413 + }, + { + "epoch": 0.6, + "grad_norm": 0.6111669276029154, + "learning_rate": 4.432973052447868e-06, + "loss": 0.4752, + "step": 1414 + }, + { + "epoch": 0.6, + "grad_norm": 0.5736428596336413, + "learning_rate": 4.4317809272239145e-06, + "loss": 0.4849, + "step": 1415 + }, + { + "epoch": 0.6, + "grad_norm": 0.5279491616141355, + "learning_rate": 4.430587710798257e-06, + "loss": 0.4732, + "step": 1416 + }, + { + "epoch": 0.61, + "grad_norm": 0.5799544850676969, + "learning_rate": 4.429393403844906e-06, + "loss": 0.5072, + "step": 1417 + }, + { + "epoch": 0.61, + "grad_norm": 0.5345098644918358, + "learning_rate": 4.428198007038489e-06, + "loss": 0.4731, + "step": 1418 + }, + { + "epoch": 0.61, + "grad_norm": 0.5979310441080966, + "learning_rate": 4.427001521054245e-06, + "loss": 0.4888, + "step": 1419 + }, + { + "epoch": 0.61, + "grad_norm": 0.580732593585383, + "learning_rate": 4.425803946568033e-06, + "loss": 0.4849, + "step": 1420 + }, + { + "epoch": 0.61, + "eval_loss": 0.4855799674987793, + "eval_runtime": 6914.623, + "eval_samples_per_second": 41.996, + "eval_steps_per_second": 2.1, + "step": 1420 + }, + { + "epoch": 0.61, + "grad_norm": 0.5404083046391809, + "learning_rate": 4.424605284256323e-06, + "loss": 0.4898, + "step": 1421 + }, + { + "epoch": 0.61, + "grad_norm": 0.5484706120901545, + "learning_rate": 4.423405534796204e-06, + "loss": 0.4973, + "step": 1422 + }, + { + "epoch": 0.61, + "grad_norm": 0.5788605549686905, + "learning_rate": 4.422204698865374e-06, + "loss": 0.4837, + "step": 1423 + }, + { + "epoch": 0.61, + "grad_norm": 0.6106952748012678, + "learning_rate": 4.421002777142148e-06, + "loss": 0.5014, + "step": 1424 + }, + { + "epoch": 0.61, + "grad_norm": 0.5448876773549647, + "learning_rate": 4.419799770305453e-06, + "loss": 0.4775, + "step": 1425 + }, + { + "epoch": 0.61, + "grad_norm": 0.5628932799070395, + "learning_rate": 4.41859567903483e-06, + "loss": 0.4877, + "step": 1426 + }, + { + "epoch": 0.61, + "grad_norm": 0.58401703660086, + "learning_rate": 4.417390504010432e-06, + "loss": 0.5241, + "step": 1427 + }, + { + "epoch": 0.61, + "grad_norm": 0.5916298435437614, + "learning_rate": 4.416184245913022e-06, + "loss": 0.4664, + "step": 1428 + }, + { + "epoch": 0.61, + "grad_norm": 0.5713018784420761, + "learning_rate": 4.41497690542398e-06, + "loss": 0.5127, + "step": 1429 + }, + { + "epoch": 0.61, + "grad_norm": 0.5997368182029638, + "learning_rate": 4.413768483225292e-06, + "loss": 0.4928, + "step": 1430 + }, + { + "epoch": 0.61, + "grad_norm": 0.5793090886150154, + "learning_rate": 4.4125589799995585e-06, + "loss": 0.4658, + "step": 1431 + }, + { + "epoch": 0.61, + "grad_norm": 0.563370967957538, + "learning_rate": 4.411348396429989e-06, + "loss": 0.4933, + "step": 1432 + }, + { + "epoch": 0.61, + "grad_norm": 0.5541515539221555, + "learning_rate": 4.410136733200404e-06, + "loss": 0.4635, + "step": 1433 + }, + { + "epoch": 0.61, + "grad_norm": 0.6348388776122461, + "learning_rate": 4.4089239909952335e-06, + "loss": 0.5072, + "step": 1434 + }, + { + "epoch": 0.61, + "grad_norm": 0.5613188088037165, + "learning_rate": 4.407710170499517e-06, + "loss": 0.4722, + "step": 1435 + }, + { + "epoch": 0.61, + "grad_norm": 0.5702782150061315, + "learning_rate": 4.406495272398903e-06, + "loss": 0.4601, + "step": 1436 + }, + { + "epoch": 0.61, + "grad_norm": 0.5706119901840532, + "learning_rate": 4.405279297379648e-06, + "loss": 0.4873, + "step": 1437 + }, + { + "epoch": 0.61, + "grad_norm": 0.6397976458766784, + "learning_rate": 4.404062246128621e-06, + "loss": 0.4943, + "step": 1438 + }, + { + "epoch": 0.61, + "grad_norm": 0.5973771040772421, + "learning_rate": 4.4028441193332914e-06, + "loss": 0.5085, + "step": 1439 + }, + { + "epoch": 0.62, + "grad_norm": 0.5914091770327425, + "learning_rate": 4.401624917681743e-06, + "loss": 0.4922, + "step": 1440 + }, + { + "epoch": 0.62, + "grad_norm": 0.6055784851510099, + "learning_rate": 4.400404641862664e-06, + "loss": 0.4745, + "step": 1441 + }, + { + "epoch": 0.62, + "grad_norm": 0.5764544930402707, + "learning_rate": 4.399183292565347e-06, + "loss": 0.4857, + "step": 1442 + }, + { + "epoch": 0.62, + "grad_norm": 0.5996098894523001, + "learning_rate": 4.397960870479696e-06, + "loss": 0.4978, + "step": 1443 + }, + { + "epoch": 0.62, + "grad_norm": 0.5521355170405737, + "learning_rate": 4.396737376296218e-06, + "loss": 0.4587, + "step": 1444 + }, + { + "epoch": 0.62, + "grad_norm": 0.59045938437032, + "learning_rate": 4.395512810706026e-06, + "loss": 0.4908, + "step": 1445 + }, + { + "epoch": 0.62, + "grad_norm": 0.583988820333397, + "learning_rate": 4.394287174400838e-06, + "loss": 0.4869, + "step": 1446 + }, + { + "epoch": 0.62, + "grad_norm": 0.6065684858025009, + "learning_rate": 4.393060468072976e-06, + "loss": 0.4892, + "step": 1447 + }, + { + "epoch": 0.62, + "grad_norm": 0.5703024949579703, + "learning_rate": 4.3918326924153685e-06, + "loss": 0.4759, + "step": 1448 + }, + { + "epoch": 0.62, + "grad_norm": 0.5745917135551377, + "learning_rate": 4.390603848121546e-06, + "loss": 0.4869, + "step": 1449 + }, + { + "epoch": 0.62, + "grad_norm": 0.5720705945602651, + "learning_rate": 4.3893739358856465e-06, + "loss": 0.4663, + "step": 1450 + }, + { + "epoch": 0.62, + "grad_norm": 0.5748475515933826, + "learning_rate": 4.388142956402405e-06, + "loss": 0.4981, + "step": 1451 + }, + { + "epoch": 0.62, + "grad_norm": 1.1756158657989677, + "learning_rate": 4.3869109103671645e-06, + "loss": 0.5071, + "step": 1452 + }, + { + "epoch": 0.62, + "grad_norm": 0.5696921062936988, + "learning_rate": 4.385677798475868e-06, + "loss": 0.4764, + "step": 1453 + }, + { + "epoch": 0.62, + "grad_norm": 0.557690101161529, + "learning_rate": 4.384443621425062e-06, + "loss": 0.5103, + "step": 1454 + }, + { + "epoch": 0.62, + "grad_norm": 0.5542852620725858, + "learning_rate": 4.383208379911893e-06, + "loss": 0.4835, + "step": 1455 + }, + { + "epoch": 0.62, + "grad_norm": 0.5402197457616243, + "learning_rate": 4.38197207463411e-06, + "loss": 0.4963, + "step": 1456 + }, + { + "epoch": 0.62, + "grad_norm": 0.584415569246153, + "learning_rate": 4.380734706290063e-06, + "loss": 0.4835, + "step": 1457 + }, + { + "epoch": 0.62, + "grad_norm": 0.5483678859464396, + "learning_rate": 4.379496275578701e-06, + "loss": 0.489, + "step": 1458 + }, + { + "epoch": 0.62, + "grad_norm": 0.6061260112046732, + "learning_rate": 4.378256783199575e-06, + "loss": 0.507, + "step": 1459 + }, + { + "epoch": 0.62, + "grad_norm": 0.5542441873465928, + "learning_rate": 4.377016229852836e-06, + "loss": 0.4915, + "step": 1460 + }, + { + "epoch": 0.62, + "grad_norm": 0.5545139083535812, + "learning_rate": 4.375774616239231e-06, + "loss": 0.5185, + "step": 1461 + }, + { + "epoch": 0.62, + "grad_norm": 0.552239627065928, + "learning_rate": 4.374531943060109e-06, + "loss": 0.4783, + "step": 1462 + }, + { + "epoch": 0.63, + "grad_norm": 0.5335369265994653, + "learning_rate": 4.373288211017418e-06, + "loss": 0.454, + "step": 1463 + }, + { + "epoch": 0.63, + "grad_norm": 0.5454066478470697, + "learning_rate": 4.3720434208137015e-06, + "loss": 0.4824, + "step": 1464 + }, + { + "epoch": 0.63, + "grad_norm": 0.6037061519362923, + "learning_rate": 4.370797573152101e-06, + "loss": 0.4977, + "step": 1465 + }, + { + "epoch": 0.63, + "grad_norm": 0.5805121340778033, + "learning_rate": 4.369550668736358e-06, + "loss": 0.4894, + "step": 1466 + }, + { + "epoch": 0.63, + "grad_norm": 0.551429386005362, + "learning_rate": 4.3683027082708085e-06, + "loss": 0.4612, + "step": 1467 + }, + { + "epoch": 0.63, + "grad_norm": 0.5772151081375165, + "learning_rate": 4.3670536924603855e-06, + "loss": 0.4893, + "step": 1468 + }, + { + "epoch": 0.63, + "grad_norm": 0.5600194793378279, + "learning_rate": 4.365803622010618e-06, + "loss": 0.4956, + "step": 1469 + }, + { + "epoch": 0.63, + "grad_norm": 0.5530119828515296, + "learning_rate": 4.364552497627632e-06, + "loss": 0.4612, + "step": 1470 + }, + { + "epoch": 0.63, + "grad_norm": 0.5805548135765888, + "learning_rate": 4.363300320018148e-06, + "loss": 0.4659, + "step": 1471 + }, + { + "epoch": 0.63, + "grad_norm": 0.5501610854660056, + "learning_rate": 4.36204708988948e-06, + "loss": 0.4842, + "step": 1472 + }, + { + "epoch": 0.63, + "grad_norm": 0.5997289615402112, + "learning_rate": 4.36079280794954e-06, + "loss": 0.52, + "step": 1473 + }, + { + "epoch": 0.63, + "grad_norm": 0.5805877442370376, + "learning_rate": 4.359537474906831e-06, + "loss": 0.487, + "step": 1474 + }, + { + "epoch": 0.63, + "grad_norm": 0.558823842875777, + "learning_rate": 4.35828109147045e-06, + "loss": 0.4664, + "step": 1475 + }, + { + "epoch": 0.63, + "grad_norm": 0.5889535016987915, + "learning_rate": 4.357023658350089e-06, + "loss": 0.4989, + "step": 1476 + }, + { + "epoch": 0.63, + "grad_norm": 0.6277448104948404, + "learning_rate": 4.3557651762560316e-06, + "loss": 0.5118, + "step": 1477 + }, + { + "epoch": 0.63, + "grad_norm": 0.5679808700739567, + "learning_rate": 4.3545056458991556e-06, + "loss": 0.463, + "step": 1478 + }, + { + "epoch": 0.63, + "grad_norm": 0.5962236218865731, + "learning_rate": 4.353245067990928e-06, + "loss": 0.4798, + "step": 1479 + }, + { + "epoch": 0.63, + "grad_norm": 0.5808142134901263, + "learning_rate": 4.3519834432434095e-06, + "loss": 0.514, + "step": 1480 + }, + { + "epoch": 0.63, + "grad_norm": 0.5889258035229223, + "learning_rate": 4.350720772369252e-06, + "loss": 0.5016, + "step": 1481 + }, + { + "epoch": 0.63, + "grad_norm": 0.5618362131992736, + "learning_rate": 4.3494570560817e-06, + "loss": 0.4861, + "step": 1482 + }, + { + "epoch": 0.63, + "grad_norm": 0.5693422718547901, + "learning_rate": 4.348192295094585e-06, + "loss": 0.4737, + "step": 1483 + }, + { + "epoch": 0.63, + "grad_norm": 0.5554250264653801, + "learning_rate": 4.346926490122329e-06, + "loss": 0.4959, + "step": 1484 + }, + { + "epoch": 0.63, + "grad_norm": 0.5541704411153275, + "learning_rate": 4.345659641879948e-06, + "loss": 0.489, + "step": 1485 + }, + { + "epoch": 0.63, + "grad_norm": 0.5749556736212126, + "learning_rate": 4.344391751083043e-06, + "loss": 0.4919, + "step": 1486 + }, + { + "epoch": 0.64, + "grad_norm": 0.5815536068592329, + "learning_rate": 4.343122818447804e-06, + "loss": 0.478, + "step": 1487 + }, + { + "epoch": 0.64, + "grad_norm": 0.5807918654560094, + "learning_rate": 4.341852844691012e-06, + "loss": 0.4893, + "step": 1488 + }, + { + "epoch": 0.64, + "grad_norm": 0.5772854343547457, + "learning_rate": 4.340581830530036e-06, + "loss": 0.4883, + "step": 1489 + }, + { + "epoch": 0.64, + "grad_norm": 0.6083000277946388, + "learning_rate": 4.33930977668283e-06, + "loss": 0.5033, + "step": 1490 + }, + { + "epoch": 0.64, + "grad_norm": 0.5740926343714129, + "learning_rate": 4.338036683867936e-06, + "loss": 0.4943, + "step": 1491 + }, + { + "epoch": 0.64, + "eval_loss": 0.4840275049209595, + "eval_runtime": 6927.0119, + "eval_samples_per_second": 41.921, + "eval_steps_per_second": 2.096, + "step": 1491 + }, + { + "epoch": 0.64, + "grad_norm": 0.5684026018424353, + "learning_rate": 4.336762552804485e-06, + "loss": 0.4718, + "step": 1492 + }, + { + "epoch": 0.64, + "grad_norm": 0.562055744831237, + "learning_rate": 4.335487384212194e-06, + "loss": 0.5053, + "step": 1493 + }, + { + "epoch": 0.64, + "grad_norm": 0.5816786952791797, + "learning_rate": 4.334211178811364e-06, + "loss": 0.4864, + "step": 1494 + }, + { + "epoch": 0.64, + "grad_norm": 0.543579418092536, + "learning_rate": 4.332933937322883e-06, + "loss": 0.4937, + "step": 1495 + }, + { + "epoch": 0.64, + "grad_norm": 0.576270682270156, + "learning_rate": 4.331655660468224e-06, + "loss": 0.4587, + "step": 1496 + }, + { + "epoch": 0.64, + "grad_norm": 0.5770731512199584, + "learning_rate": 4.330376348969445e-06, + "loss": 0.4996, + "step": 1497 + }, + { + "epoch": 0.64, + "grad_norm": 0.5523724826628789, + "learning_rate": 4.329096003549189e-06, + "loss": 0.4748, + "step": 1498 + }, + { + "epoch": 0.64, + "grad_norm": 0.603518080716257, + "learning_rate": 4.3278146249306825e-06, + "loss": 0.4974, + "step": 1499 + }, + { + "epoch": 0.64, + "grad_norm": 0.5503858002021573, + "learning_rate": 4.326532213837735e-06, + "loss": 0.4927, + "step": 1500 + }, + { + "epoch": 0.64, + "grad_norm": 0.5464005046510432, + "learning_rate": 4.325248770994741e-06, + "loss": 0.4834, + "step": 1501 + }, + { + "epoch": 0.64, + "grad_norm": 0.591865173361713, + "learning_rate": 4.323964297126675e-06, + "loss": 0.4651, + "step": 1502 + }, + { + "epoch": 0.64, + "grad_norm": 0.5839558098791746, + "learning_rate": 4.3226787929590965e-06, + "loss": 0.4726, + "step": 1503 + }, + { + "epoch": 0.64, + "grad_norm": 0.5870418970742052, + "learning_rate": 4.3213922592181455e-06, + "loss": 0.4996, + "step": 1504 + }, + { + "epoch": 0.64, + "grad_norm": 0.5949338157726647, + "learning_rate": 4.320104696630544e-06, + "loss": 0.4882, + "step": 1505 + }, + { + "epoch": 0.64, + "grad_norm": 0.5822457164207244, + "learning_rate": 4.318816105923596e-06, + "loss": 0.4999, + "step": 1506 + }, + { + "epoch": 0.64, + "grad_norm": 0.6042290491386848, + "learning_rate": 4.317526487825185e-06, + "loss": 0.4743, + "step": 1507 + }, + { + "epoch": 0.64, + "grad_norm": 0.576454722720242, + "learning_rate": 4.316235843063775e-06, + "loss": 0.5038, + "step": 1508 + }, + { + "epoch": 0.64, + "grad_norm": 0.5715912329774304, + "learning_rate": 4.314944172368411e-06, + "loss": 0.5029, + "step": 1509 + }, + { + "epoch": 0.65, + "grad_norm": 0.5530105706856633, + "learning_rate": 4.3136514764687155e-06, + "loss": 0.4861, + "step": 1510 + }, + { + "epoch": 0.65, + "grad_norm": 0.5500767838459567, + "learning_rate": 4.312357756094892e-06, + "loss": 0.4858, + "step": 1511 + }, + { + "epoch": 0.65, + "grad_norm": 0.54693416081482, + "learning_rate": 4.311063011977723e-06, + "loss": 0.5017, + "step": 1512 + }, + { + "epoch": 0.65, + "grad_norm": 0.5419499341904167, + "learning_rate": 4.309767244848567e-06, + "loss": 0.4891, + "step": 1513 + }, + { + "epoch": 0.65, + "grad_norm": 0.5614950534369257, + "learning_rate": 4.308470455439362e-06, + "loss": 0.4971, + "step": 1514 + }, + { + "epoch": 0.65, + "grad_norm": 0.5349156359990223, + "learning_rate": 4.3071726444826244e-06, + "loss": 0.4777, + "step": 1515 + }, + { + "epoch": 0.65, + "grad_norm": 0.6003648657933892, + "learning_rate": 4.305873812711445e-06, + "loss": 0.4883, + "step": 1516 + }, + { + "epoch": 0.65, + "grad_norm": 0.5561939266607488, + "learning_rate": 4.304573960859493e-06, + "loss": 0.477, + "step": 1517 + }, + { + "epoch": 0.65, + "grad_norm": 0.5772579264539893, + "learning_rate": 4.303273089661013e-06, + "loss": 0.4636, + "step": 1518 + }, + { + "epoch": 0.65, + "grad_norm": 0.5555633091494279, + "learning_rate": 4.301971199850826e-06, + "loss": 0.4598, + "step": 1519 + }, + { + "epoch": 0.65, + "grad_norm": 0.5909063796287866, + "learning_rate": 4.300668292164329e-06, + "loss": 0.4704, + "step": 1520 + }, + { + "epoch": 0.65, + "grad_norm": 0.5321519349592307, + "learning_rate": 4.299364367337493e-06, + "loss": 0.4726, + "step": 1521 + }, + { + "epoch": 0.65, + "grad_norm": 0.5948749563634228, + "learning_rate": 4.298059426106864e-06, + "loss": 0.5042, + "step": 1522 + }, + { + "epoch": 0.65, + "grad_norm": 0.5590529380799898, + "learning_rate": 4.29675346920956e-06, + "loss": 0.4771, + "step": 1523 + }, + { + "epoch": 0.65, + "grad_norm": 0.573778432963487, + "learning_rate": 4.2954464973832765e-06, + "loss": 0.4841, + "step": 1524 + }, + { + "epoch": 0.65, + "grad_norm": 0.5531613083984294, + "learning_rate": 4.29413851136628e-06, + "loss": 0.4695, + "step": 1525 + }, + { + "epoch": 0.65, + "grad_norm": 0.5612844436545967, + "learning_rate": 4.292829511897409e-06, + "loss": 0.4804, + "step": 1526 + }, + { + "epoch": 0.65, + "grad_norm": 0.7125770424238018, + "learning_rate": 4.2915194997160774e-06, + "loss": 0.4788, + "step": 1527 + }, + { + "epoch": 0.65, + "grad_norm": 0.6254458428657401, + "learning_rate": 4.2902084755622685e-06, + "loss": 0.4875, + "step": 1528 + }, + { + "epoch": 0.65, + "grad_norm": 0.5699982607183243, + "learning_rate": 4.288896440176539e-06, + "loss": 0.5127, + "step": 1529 + }, + { + "epoch": 0.65, + "grad_norm": 0.5359705267411138, + "learning_rate": 4.287583394300016e-06, + "loss": 0.4592, + "step": 1530 + }, + { + "epoch": 0.65, + "grad_norm": 0.573098209766848, + "learning_rate": 4.286269338674396e-06, + "loss": 0.4872, + "step": 1531 + }, + { + "epoch": 0.65, + "grad_norm": 0.582145535892815, + "learning_rate": 4.284954274041949e-06, + "loss": 0.4932, + "step": 1532 + }, + { + "epoch": 0.65, + "grad_norm": 0.569146953738728, + "learning_rate": 4.283638201145512e-06, + "loss": 0.4842, + "step": 1533 + }, + { + "epoch": 0.66, + "grad_norm": 0.6060270899238028, + "learning_rate": 4.282321120728493e-06, + "loss": 0.5012, + "step": 1534 + }, + { + "epoch": 0.66, + "grad_norm": 0.5636460053991561, + "learning_rate": 4.28100303353487e-06, + "loss": 0.4912, + "step": 1535 + }, + { + "epoch": 0.66, + "grad_norm": 0.5594354604572797, + "learning_rate": 4.279683940309187e-06, + "loss": 0.4742, + "step": 1536 + }, + { + "epoch": 0.66, + "grad_norm": 0.6148223579588533, + "learning_rate": 4.278363841796559e-06, + "loss": 0.4705, + "step": 1537 + }, + { + "epoch": 0.66, + "grad_norm": 0.5510025520117999, + "learning_rate": 4.277042738742668e-06, + "loss": 0.4813, + "step": 1538 + }, + { + "epoch": 0.66, + "grad_norm": 0.6189865762055806, + "learning_rate": 4.2757206318937625e-06, + "loss": 0.5076, + "step": 1539 + }, + { + "epoch": 0.66, + "grad_norm": 0.5549678033446015, + "learning_rate": 4.274397521996658e-06, + "loss": 0.5005, + "step": 1540 + }, + { + "epoch": 0.66, + "grad_norm": 0.5444972036743465, + "learning_rate": 4.27307340979874e-06, + "loss": 0.4738, + "step": 1541 + }, + { + "epoch": 0.66, + "grad_norm": 0.5863310201010252, + "learning_rate": 4.271748296047953e-06, + "loss": 0.5036, + "step": 1542 + }, + { + "epoch": 0.66, + "grad_norm": 0.5834850598785314, + "learning_rate": 4.270422181492815e-06, + "loss": 0.4789, + "step": 1543 + }, + { + "epoch": 0.66, + "grad_norm": 0.5617538920507555, + "learning_rate": 4.269095066882406e-06, + "loss": 0.4632, + "step": 1544 + }, + { + "epoch": 0.66, + "grad_norm": 0.5907455073189339, + "learning_rate": 4.267766952966369e-06, + "loss": 0.4953, + "step": 1545 + }, + { + "epoch": 0.66, + "grad_norm": 0.5983165451822997, + "learning_rate": 4.266437840494915e-06, + "loss": 0.4796, + "step": 1546 + }, + { + "epoch": 0.66, + "grad_norm": 0.569812624676123, + "learning_rate": 4.265107730218817e-06, + "loss": 0.4514, + "step": 1547 + }, + { + "epoch": 0.66, + "grad_norm": 0.5847228353394706, + "learning_rate": 4.2637766228894115e-06, + "loss": 0.4656, + "step": 1548 + }, + { + "epoch": 0.66, + "grad_norm": 0.5925087756170236, + "learning_rate": 4.2624445192585994e-06, + "loss": 0.5031, + "step": 1549 + }, + { + "epoch": 0.66, + "grad_norm": 0.5703001753744149, + "learning_rate": 4.261111420078844e-06, + "loss": 0.4648, + "step": 1550 + }, + { + "epoch": 0.66, + "grad_norm": 0.5777371677357935, + "learning_rate": 4.259777326103169e-06, + "loss": 0.5158, + "step": 1551 + }, + { + "epoch": 0.66, + "grad_norm": 0.6186779369597184, + "learning_rate": 4.258442238085164e-06, + "loss": 0.4643, + "step": 1552 + }, + { + "epoch": 0.66, + "grad_norm": 0.64721013399992, + "learning_rate": 4.2571061567789766e-06, + "loss": 0.4947, + "step": 1553 + }, + { + "epoch": 0.66, + "grad_norm": 0.536482893670931, + "learning_rate": 4.255769082939316e-06, + "loss": 0.4661, + "step": 1554 + }, + { + "epoch": 0.66, + "grad_norm": 0.5970665352812153, + "learning_rate": 4.2544310173214546e-06, + "loss": 0.4676, + "step": 1555 + }, + { + "epoch": 0.66, + "grad_norm": 0.6564985474844536, + "learning_rate": 4.253091960681222e-06, + "loss": 0.4875, + "step": 1556 + }, + { + "epoch": 0.67, + "grad_norm": 0.5742936073076695, + "learning_rate": 4.251751913775009e-06, + "loss": 0.4685, + "step": 1557 + }, + { + "epoch": 0.67, + "grad_norm": 0.5791371405621296, + "learning_rate": 4.250410877359765e-06, + "loss": 0.4816, + "step": 1558 + }, + { + "epoch": 0.67, + "grad_norm": 0.5879477020083694, + "learning_rate": 4.2490688521930005e-06, + "loss": 0.4648, + "step": 1559 + }, + { + "epoch": 0.67, + "grad_norm": 0.5644425455654086, + "learning_rate": 4.247725839032781e-06, + "loss": 0.4979, + "step": 1560 + }, + { + "epoch": 0.67, + "grad_norm": 0.5816219717795591, + "learning_rate": 4.246381838637733e-06, + "loss": 0.4767, + "step": 1561 + }, + { + "epoch": 0.67, + "grad_norm": 0.6181448897628907, + "learning_rate": 4.245036851767039e-06, + "loss": 0.5092, + "step": 1562 + }, + { + "epoch": 0.67, + "eval_loss": 0.4825836420059204, + "eval_runtime": 6927.1849, + "eval_samples_per_second": 41.92, + "eval_steps_per_second": 2.096, + "step": 1562 + }, + { + "epoch": 0.67, + "grad_norm": 0.6319339756032352, + "learning_rate": 4.243690879180441e-06, + "loss": 0.4999, + "step": 1563 + }, + { + "epoch": 0.67, + "grad_norm": 0.5766409130292584, + "learning_rate": 4.242343921638235e-06, + "loss": 0.4912, + "step": 1564 + }, + { + "epoch": 0.67, + "grad_norm": 0.6169305231670678, + "learning_rate": 4.240995979901273e-06, + "loss": 0.5037, + "step": 1565 + }, + { + "epoch": 0.67, + "grad_norm": 0.6002589034049418, + "learning_rate": 4.239647054730966e-06, + "loss": 0.506, + "step": 1566 + }, + { + "epoch": 0.67, + "grad_norm": 0.6023521389889079, + "learning_rate": 4.238297146889281e-06, + "loss": 0.5257, + "step": 1567 + }, + { + "epoch": 0.67, + "grad_norm": 0.5158122792594769, + "learning_rate": 4.236946257138734e-06, + "loss": 0.441, + "step": 1568 + }, + { + "epoch": 0.67, + "grad_norm": 0.5495762183794795, + "learning_rate": 4.235594386242403e-06, + "loss": 0.4743, + "step": 1569 + }, + { + "epoch": 0.67, + "grad_norm": 0.6628519895823336, + "learning_rate": 4.234241534963916e-06, + "loss": 0.5272, + "step": 1570 + }, + { + "epoch": 0.67, + "grad_norm": 0.5959806073232053, + "learning_rate": 4.232887704067455e-06, + "loss": 0.4951, + "step": 1571 + }, + { + "epoch": 0.67, + "grad_norm": 0.5572508686653875, + "learning_rate": 4.231532894317757e-06, + "loss": 0.4923, + "step": 1572 + }, + { + "epoch": 0.67, + "grad_norm": 0.5899131939103285, + "learning_rate": 4.23017710648011e-06, + "loss": 0.4837, + "step": 1573 + }, + { + "epoch": 0.67, + "grad_norm": 0.6064850421779246, + "learning_rate": 4.228820341320356e-06, + "loss": 0.5005, + "step": 1574 + }, + { + "epoch": 0.67, + "grad_norm": 0.5780537164257747, + "learning_rate": 4.227462599604889e-06, + "loss": 0.4785, + "step": 1575 + }, + { + "epoch": 0.67, + "grad_norm": 0.5810376085859065, + "learning_rate": 4.226103882100654e-06, + "loss": 0.4714, + "step": 1576 + }, + { + "epoch": 0.67, + "grad_norm": 0.5909147691169829, + "learning_rate": 4.224744189575148e-06, + "loss": 0.5021, + "step": 1577 + }, + { + "epoch": 0.67, + "grad_norm": 0.5942835635277367, + "learning_rate": 4.2233835227964145e-06, + "loss": 0.4924, + "step": 1578 + }, + { + "epoch": 0.67, + "grad_norm": 0.5618720908082389, + "learning_rate": 4.222021882533056e-06, + "loss": 0.4724, + "step": 1579 + }, + { + "epoch": 0.68, + "grad_norm": 0.5522683879852052, + "learning_rate": 4.220659269554217e-06, + "loss": 0.476, + "step": 1580 + }, + { + "epoch": 0.68, + "grad_norm": 0.5626316433669417, + "learning_rate": 4.219295684629595e-06, + "loss": 0.5038, + "step": 1581 + }, + { + "epoch": 0.68, + "grad_norm": 0.5832875815941423, + "learning_rate": 4.217931128529436e-06, + "loss": 0.4942, + "step": 1582 + }, + { + "epoch": 0.68, + "grad_norm": 0.5515392841114919, + "learning_rate": 4.216565602024533e-06, + "loss": 0.4832, + "step": 1583 + }, + { + "epoch": 0.68, + "grad_norm": 0.6821325914674553, + "learning_rate": 4.2151991058862314e-06, + "loss": 0.4696, + "step": 1584 + }, + { + "epoch": 0.68, + "grad_norm": 0.5918911213299775, + "learning_rate": 4.21383164088642e-06, + "loss": 0.4981, + "step": 1585 + }, + { + "epoch": 0.68, + "grad_norm": 0.567039525371673, + "learning_rate": 4.212463207797535e-06, + "loss": 0.474, + "step": 1586 + }, + { + "epoch": 0.68, + "grad_norm": 0.5743234762161183, + "learning_rate": 4.211093807392562e-06, + "loss": 0.4675, + "step": 1587 + }, + { + "epoch": 0.68, + "grad_norm": 0.5855603678522909, + "learning_rate": 4.209723440445032e-06, + "loss": 0.4796, + "step": 1588 + }, + { + "epoch": 0.68, + "grad_norm": 0.5500461666318762, + "learning_rate": 4.208352107729021e-06, + "loss": 0.4844, + "step": 1589 + }, + { + "epoch": 0.68, + "grad_norm": 0.5791356596651714, + "learning_rate": 4.206979810019153e-06, + "loss": 0.4769, + "step": 1590 + }, + { + "epoch": 0.68, + "grad_norm": 0.5943166707550008, + "learning_rate": 4.205606548090593e-06, + "loss": 0.4842, + "step": 1591 + }, + { + "epoch": 0.68, + "grad_norm": 0.5765468748074813, + "learning_rate": 4.204232322719055e-06, + "loss": 0.4877, + "step": 1592 + }, + { + "epoch": 0.68, + "grad_norm": 0.5705538264405167, + "learning_rate": 4.202857134680795e-06, + "loss": 0.4848, + "step": 1593 + }, + { + "epoch": 0.68, + "grad_norm": 0.550378639192882, + "learning_rate": 4.201480984752612e-06, + "loss": 0.4709, + "step": 1594 + }, + { + "epoch": 0.68, + "grad_norm": 0.5843384273029046, + "learning_rate": 4.20010387371185e-06, + "loss": 0.5116, + "step": 1595 + }, + { + "epoch": 0.68, + "grad_norm": 0.5430286998603538, + "learning_rate": 4.198725802336395e-06, + "loss": 0.4755, + "step": 1596 + }, + { + "epoch": 0.68, + "grad_norm": 0.5613235833509066, + "learning_rate": 4.197346771404677e-06, + "loss": 0.483, + "step": 1597 + }, + { + "epoch": 0.68, + "grad_norm": 0.5431464537859271, + "learning_rate": 4.1959667816956654e-06, + "loss": 0.4954, + "step": 1598 + }, + { + "epoch": 0.68, + "grad_norm": 0.5816367873246644, + "learning_rate": 4.194585833988873e-06, + "loss": 0.5123, + "step": 1599 + }, + { + "epoch": 0.68, + "grad_norm": 0.5842017694622618, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.4751, + "step": 1600 + }, + { + "epoch": 0.68, + "grad_norm": 0.5983057704757485, + "learning_rate": 4.191821067702701e-06, + "loss": 0.476, + "step": 1601 + }, + { + "epoch": 0.68, + "grad_norm": 0.5785359587175982, + "learning_rate": 4.190437250685049e-06, + "loss": 0.4882, + "step": 1602 + }, + { + "epoch": 0.68, + "grad_norm": 0.6048435561330398, + "learning_rate": 4.189052478793074e-06, + "loss": 0.5045, + "step": 1603 + }, + { + "epoch": 0.69, + "grad_norm": 0.5886694967565423, + "learning_rate": 4.187666752808987e-06, + "loss": 0.4809, + "step": 1604 + }, + { + "epoch": 0.69, + "grad_norm": 0.5662837567043483, + "learning_rate": 4.186280073515543e-06, + "loss": 0.508, + "step": 1605 + }, + { + "epoch": 0.69, + "grad_norm": 0.5507517273321046, + "learning_rate": 4.184892441696031e-06, + "loss": 0.4574, + "step": 1606 + }, + { + "epoch": 0.69, + "grad_norm": 0.5506339868366799, + "learning_rate": 4.183503858134283e-06, + "loss": 0.4983, + "step": 1607 + }, + { + "epoch": 0.69, + "grad_norm": 0.551124570714273, + "learning_rate": 4.182114323614662e-06, + "loss": 0.4964, + "step": 1608 + }, + { + "epoch": 0.69, + "grad_norm": 0.5632058519052752, + "learning_rate": 4.180723838922076e-06, + "loss": 0.5091, + "step": 1609 + }, + { + "epoch": 0.69, + "grad_norm": 0.5523436692520042, + "learning_rate": 4.179332404841963e-06, + "loss": 0.484, + "step": 1610 + }, + { + "epoch": 0.69, + "grad_norm": 0.6137884481921679, + "learning_rate": 4.177940022160299e-06, + "loss": 0.4796, + "step": 1611 + }, + { + "epoch": 0.69, + "grad_norm": 0.6400564535674431, + "learning_rate": 4.1765466916636e-06, + "loss": 0.4813, + "step": 1612 + }, + { + "epoch": 0.69, + "grad_norm": 0.6137552397160586, + "learning_rate": 4.175152414138911e-06, + "loss": 0.5128, + "step": 1613 + }, + { + "epoch": 0.69, + "grad_norm": 0.5878395349717804, + "learning_rate": 4.173757190373817e-06, + "loss": 0.4921, + "step": 1614 + }, + { + "epoch": 0.69, + "grad_norm": 0.638177925386608, + "learning_rate": 4.172361021156436e-06, + "loss": 0.4697, + "step": 1615 + }, + { + "epoch": 0.69, + "grad_norm": 0.55678238023355, + "learning_rate": 4.170963907275418e-06, + "loss": 0.456, + "step": 1616 + }, + { + "epoch": 0.69, + "grad_norm": 0.5856822237716187, + "learning_rate": 4.169565849519949e-06, + "loss": 0.4762, + "step": 1617 + }, + { + "epoch": 0.69, + "grad_norm": 0.6465303799069492, + "learning_rate": 4.1681668486797475e-06, + "loss": 0.4964, + "step": 1618 + }, + { + "epoch": 0.69, + "grad_norm": 0.6003634481531572, + "learning_rate": 4.166766905545064e-06, + "loss": 0.4864, + "step": 1619 + }, + { + "epoch": 0.69, + "grad_norm": 0.5916487532205914, + "learning_rate": 4.1653660209066835e-06, + "loss": 0.5002, + "step": 1620 + }, + { + "epoch": 0.69, + "grad_norm": 0.5898166480233019, + "learning_rate": 4.1639641955559205e-06, + "loss": 0.4827, + "step": 1621 + }, + { + "epoch": 0.69, + "grad_norm": 0.5866832054250863, + "learning_rate": 4.162561430284621e-06, + "loss": 0.5073, + "step": 1622 + }, + { + "epoch": 0.69, + "grad_norm": 0.5553096089138998, + "learning_rate": 4.161157725885163e-06, + "loss": 0.4507, + "step": 1623 + }, + { + "epoch": 0.69, + "grad_norm": 0.5797616225893526, + "learning_rate": 4.159753083150455e-06, + "loss": 0.4781, + "step": 1624 + }, + { + "epoch": 0.69, + "grad_norm": 0.5782876181988972, + "learning_rate": 4.158347502873933e-06, + "loss": 0.486, + "step": 1625 + }, + { + "epoch": 0.69, + "grad_norm": 0.5593458949862828, + "learning_rate": 4.156940985849568e-06, + "loss": 0.4722, + "step": 1626 + }, + { + "epoch": 0.7, + "grad_norm": 0.5566363926907538, + "learning_rate": 4.155533532871855e-06, + "loss": 0.471, + "step": 1627 + }, + { + "epoch": 0.7, + "grad_norm": 0.5687649356444076, + "learning_rate": 4.154125144735819e-06, + "loss": 0.4593, + "step": 1628 + }, + { + "epoch": 0.7, + "grad_norm": 0.5805813995919267, + "learning_rate": 4.1527158222370134e-06, + "loss": 0.4825, + "step": 1629 + }, + { + "epoch": 0.7, + "grad_norm": 0.5698382648767448, + "learning_rate": 4.151305566171521e-06, + "loss": 0.4744, + "step": 1630 + }, + { + "epoch": 0.7, + "grad_norm": 0.5478296421305997, + "learning_rate": 4.149894377335951e-06, + "loss": 0.4708, + "step": 1631 + }, + { + "epoch": 0.7, + "grad_norm": 0.5579153962795045, + "learning_rate": 4.148482256527438e-06, + "loss": 0.4838, + "step": 1632 + }, + { + "epoch": 0.7, + "grad_norm": 0.5487271513915988, + "learning_rate": 4.147069204543645e-06, + "loss": 0.4538, + "step": 1633 + }, + { + "epoch": 0.7, + "eval_loss": 0.4806346595287323, + "eval_runtime": 6927.3711, + "eval_samples_per_second": 41.919, + "eval_steps_per_second": 2.096, + "step": 1633 + }, + { + "epoch": 0.7, + "grad_norm": 0.5571550450326609, + "learning_rate": 4.14565522218276e-06, + "loss": 0.4696, + "step": 1634 + }, + { + "epoch": 0.7, + "grad_norm": 0.573294746763072, + "learning_rate": 4.144240310243496e-06, + "loss": 0.476, + "step": 1635 + }, + { + "epoch": 0.7, + "grad_norm": 0.5571754212391226, + "learning_rate": 4.142824469525093e-06, + "loss": 0.4823, + "step": 1636 + }, + { + "epoch": 0.7, + "grad_norm": 0.5921760399908439, + "learning_rate": 4.1414077008273134e-06, + "loss": 0.4787, + "step": 1637 + }, + { + "epoch": 0.7, + "grad_norm": 0.5663213930738107, + "learning_rate": 4.139990004950446e-06, + "loss": 0.4837, + "step": 1638 + }, + { + "epoch": 0.7, + "grad_norm": 0.6090434172084536, + "learning_rate": 4.138571382695301e-06, + "loss": 0.472, + "step": 1639 + }, + { + "epoch": 0.7, + "grad_norm": 0.6277842926973998, + "learning_rate": 4.137151834863213e-06, + "loss": 0.4627, + "step": 1640 + }, + { + "epoch": 0.7, + "grad_norm": 0.5518929513736244, + "learning_rate": 4.13573136225604e-06, + "loss": 0.4815, + "step": 1641 + }, + { + "epoch": 0.7, + "grad_norm": 0.5750076801420694, + "learning_rate": 4.1343099656761635e-06, + "loss": 0.4713, + "step": 1642 + }, + { + "epoch": 0.7, + "grad_norm": 0.5636682836359242, + "learning_rate": 4.132887645926482e-06, + "loss": 0.4739, + "step": 1643 + }, + { + "epoch": 0.7, + "grad_norm": 0.5582878941115875, + "learning_rate": 4.1314644038104215e-06, + "loss": 0.4846, + "step": 1644 + }, + { + "epoch": 0.7, + "grad_norm": 0.5355410249521172, + "learning_rate": 4.130040240131925e-06, + "loss": 0.487, + "step": 1645 + }, + { + "epoch": 0.7, + "grad_norm": 0.6050538286426506, + "learning_rate": 4.128615155695458e-06, + "loss": 0.4791, + "step": 1646 + }, + { + "epoch": 0.7, + "grad_norm": 0.551251347571686, + "learning_rate": 4.127189151306004e-06, + "loss": 0.4636, + "step": 1647 + }, + { + "epoch": 0.7, + "grad_norm": 0.5586211992866543, + "learning_rate": 4.12576222776907e-06, + "loss": 0.5068, + "step": 1648 + }, + { + "epoch": 0.7, + "grad_norm": 0.5541371767356188, + "learning_rate": 4.124334385890678e-06, + "loss": 0.4666, + "step": 1649 + }, + { + "epoch": 0.7, + "grad_norm": 0.597101794217058, + "learning_rate": 4.122905626477371e-06, + "loss": 0.4905, + "step": 1650 + }, + { + "epoch": 0.71, + "grad_norm": 0.5675341113402599, + "learning_rate": 4.121475950336209e-06, + "loss": 0.5052, + "step": 1651 + }, + { + "epoch": 0.71, + "grad_norm": 0.5863054446966842, + "learning_rate": 4.120045358274772e-06, + "loss": 0.4974, + "step": 1652 + }, + { + "epoch": 0.71, + "grad_norm": 0.553734529174061, + "learning_rate": 4.118613851101156e-06, + "loss": 0.4674, + "step": 1653 + }, + { + "epoch": 0.71, + "grad_norm": 0.5949725607990067, + "learning_rate": 4.117181429623973e-06, + "loss": 0.4659, + "step": 1654 + }, + { + "epoch": 0.71, + "grad_norm": 0.5834567071239104, + "learning_rate": 4.115748094652352e-06, + "loss": 0.4743, + "step": 1655 + }, + { + "epoch": 0.71, + "grad_norm": 0.5535876065946561, + "learning_rate": 4.114313846995941e-06, + "loss": 0.5171, + "step": 1656 + }, + { + "epoch": 0.71, + "grad_norm": 0.5838621642562685, + "learning_rate": 4.112878687464898e-06, + "loss": 0.5032, + "step": 1657 + }, + { + "epoch": 0.71, + "grad_norm": 0.5823106011343094, + "learning_rate": 4.111442616869901e-06, + "loss": 0.4829, + "step": 1658 + }, + { + "epoch": 0.71, + "grad_norm": 0.5559063219280944, + "learning_rate": 4.110005636022138e-06, + "loss": 0.4636, + "step": 1659 + }, + { + "epoch": 0.71, + "grad_norm": 0.5736354627247767, + "learning_rate": 4.108567745733318e-06, + "loss": 0.4933, + "step": 1660 + }, + { + "epoch": 0.71, + "grad_norm": 0.5690533891379516, + "learning_rate": 4.107128946815657e-06, + "loss": 0.4771, + "step": 1661 + }, + { + "epoch": 0.71, + "grad_norm": 0.5816163405766323, + "learning_rate": 4.105689240081886e-06, + "loss": 0.4599, + "step": 1662 + }, + { + "epoch": 0.71, + "grad_norm": 0.5686008337838981, + "learning_rate": 4.104248626345252e-06, + "loss": 0.4588, + "step": 1663 + }, + { + "epoch": 0.71, + "grad_norm": 0.5713712745582588, + "learning_rate": 4.102807106419511e-06, + "loss": 0.4707, + "step": 1664 + }, + { + "epoch": 0.71, + "grad_norm": 0.5568768914986018, + "learning_rate": 4.10136468111893e-06, + "loss": 0.491, + "step": 1665 + }, + { + "epoch": 0.71, + "grad_norm": 0.5680603398114878, + "learning_rate": 4.099921351258292e-06, + "loss": 0.4887, + "step": 1666 + }, + { + "epoch": 0.71, + "grad_norm": 0.555747852519542, + "learning_rate": 4.098477117652887e-06, + "loss": 0.4678, + "step": 1667 + }, + { + "epoch": 0.71, + "grad_norm": 0.5732419584091223, + "learning_rate": 4.097031981118516e-06, + "loss": 0.5059, + "step": 1668 + }, + { + "epoch": 0.71, + "grad_norm": 0.5978764739542107, + "learning_rate": 4.095585942471492e-06, + "loss": 0.4862, + "step": 1669 + }, + { + "epoch": 0.71, + "grad_norm": 0.5641825612908864, + "learning_rate": 4.094139002528635e-06, + "loss": 0.4892, + "step": 1670 + }, + { + "epoch": 0.71, + "grad_norm": 0.59291044888431, + "learning_rate": 4.092691162107277e-06, + "loss": 0.4689, + "step": 1671 + }, + { + "epoch": 0.71, + "grad_norm": 0.5633228261402596, + "learning_rate": 4.091242422025256e-06, + "loss": 0.4867, + "step": 1672 + }, + { + "epoch": 0.71, + "grad_norm": 0.5446421814689791, + "learning_rate": 4.08979278310092e-06, + "loss": 0.4963, + "step": 1673 + }, + { + "epoch": 0.72, + "grad_norm": 0.5900103216285727, + "learning_rate": 4.088342246153123e-06, + "loss": 0.4784, + "step": 1674 + }, + { + "epoch": 0.72, + "grad_norm": 0.5662846183673187, + "learning_rate": 4.086890812001228e-06, + "loss": 0.4728, + "step": 1675 + }, + { + "epoch": 0.72, + "grad_norm": 0.5481824376820944, + "learning_rate": 4.085438481465104e-06, + "loss": 0.4925, + "step": 1676 + }, + { + "epoch": 0.72, + "grad_norm": 0.603893688943784, + "learning_rate": 4.083985255365127e-06, + "loss": 0.4775, + "step": 1677 + }, + { + "epoch": 0.72, + "grad_norm": 0.5616169558946099, + "learning_rate": 4.082531134522176e-06, + "loss": 0.4836, + "step": 1678 + }, + { + "epoch": 0.72, + "grad_norm": 0.5771671536691606, + "learning_rate": 4.0810761197576405e-06, + "loss": 0.4729, + "step": 1679 + }, + { + "epoch": 0.72, + "grad_norm": 0.5356849896576423, + "learning_rate": 4.07962021189341e-06, + "loss": 0.4831, + "step": 1680 + }, + { + "epoch": 0.72, + "grad_norm": 0.5705452601718124, + "learning_rate": 4.078163411751882e-06, + "loss": 0.5036, + "step": 1681 + }, + { + "epoch": 0.72, + "grad_norm": 0.5853521037301916, + "learning_rate": 4.076705720155956e-06, + "loss": 0.4663, + "step": 1682 + }, + { + "epoch": 0.72, + "grad_norm": 0.5767304367108156, + "learning_rate": 4.075247137929036e-06, + "loss": 0.4696, + "step": 1683 + }, + { + "epoch": 0.72, + "grad_norm": 0.5469609445931426, + "learning_rate": 4.073787665895029e-06, + "loss": 0.4861, + "step": 1684 + }, + { + "epoch": 0.72, + "grad_norm": 0.5647896369896342, + "learning_rate": 4.0723273048783426e-06, + "loss": 0.4751, + "step": 1685 + }, + { + "epoch": 0.72, + "grad_norm": 0.5471961307604523, + "learning_rate": 4.070866055703892e-06, + "loss": 0.4755, + "step": 1686 + }, + { + "epoch": 0.72, + "grad_norm": 0.5409469094446197, + "learning_rate": 4.069403919197087e-06, + "loss": 0.4485, + "step": 1687 + }, + { + "epoch": 0.72, + "grad_norm": 0.5493683437426159, + "learning_rate": 4.067940896183843e-06, + "loss": 0.466, + "step": 1688 + }, + { + "epoch": 0.72, + "grad_norm": 0.6184085336074597, + "learning_rate": 4.0664769874905765e-06, + "loss": 0.4941, + "step": 1689 + }, + { + "epoch": 0.72, + "grad_norm": 0.5897131879671903, + "learning_rate": 4.065012193944201e-06, + "loss": 0.4949, + "step": 1690 + }, + { + "epoch": 0.72, + "grad_norm": 0.5476514705576049, + "learning_rate": 4.063546516372134e-06, + "loss": 0.4627, + "step": 1691 + }, + { + "epoch": 0.72, + "grad_norm": 0.5639046127555836, + "learning_rate": 4.0620799556022886e-06, + "loss": 0.507, + "step": 1692 + }, + { + "epoch": 0.72, + "grad_norm": 0.569493917934072, + "learning_rate": 4.060612512463079e-06, + "loss": 0.4689, + "step": 1693 + }, + { + "epoch": 0.72, + "grad_norm": 0.5576467520547685, + "learning_rate": 4.059144187783417e-06, + "loss": 0.4533, + "step": 1694 + }, + { + "epoch": 0.72, + "grad_norm": 0.5785819029326567, + "learning_rate": 4.057674982392713e-06, + "loss": 0.484, + "step": 1695 + }, + { + "epoch": 0.72, + "grad_norm": 0.5800649358100449, + "learning_rate": 4.056204897120875e-06, + "loss": 0.475, + "step": 1696 + }, + { + "epoch": 0.73, + "grad_norm": 0.5547737396991846, + "learning_rate": 4.054733932798306e-06, + "loss": 0.4617, + "step": 1697 + }, + { + "epoch": 0.73, + "grad_norm": 0.55033684788177, + "learning_rate": 4.053262090255908e-06, + "loss": 0.4748, + "step": 1698 + }, + { + "epoch": 0.73, + "grad_norm": 0.5624353739448029, + "learning_rate": 4.051789370325079e-06, + "loss": 0.4869, + "step": 1699 + }, + { + "epoch": 0.73, + "grad_norm": 0.5452044799501949, + "learning_rate": 4.050315773837708e-06, + "loss": 0.4463, + "step": 1700 + }, + { + "epoch": 0.73, + "grad_norm": 0.5802412977647277, + "learning_rate": 4.048841301626188e-06, + "loss": 0.4821, + "step": 1701 + }, + { + "epoch": 0.73, + "grad_norm": 0.5909786502202287, + "learning_rate": 4.047365954523398e-06, + "loss": 0.4666, + "step": 1702 + }, + { + "epoch": 0.73, + "grad_norm": 0.5825689334745631, + "learning_rate": 4.045889733362717e-06, + "loss": 0.4833, + "step": 1703 + }, + { + "epoch": 0.73, + "grad_norm": 0.5429276122164189, + "learning_rate": 4.044412638978012e-06, + "loss": 0.4792, + "step": 1704 + }, + { + "epoch": 0.73, + "eval_loss": 0.4791695773601532, + "eval_runtime": 6928.6024, + "eval_samples_per_second": 41.912, + "eval_steps_per_second": 2.096, + "step": 1704 + }, + { + "epoch": 0.73, + "grad_norm": 0.5506114214338211, + "learning_rate": 4.042934672203651e-06, + "loss": 0.4495, + "step": 1705 + }, + { + "epoch": 0.73, + "grad_norm": 0.5578625634328879, + "learning_rate": 4.041455833874488e-06, + "loss": 0.4522, + "step": 1706 + }, + { + "epoch": 0.73, + "grad_norm": 0.5795772583330789, + "learning_rate": 4.039976124825872e-06, + "loss": 0.5001, + "step": 1707 + }, + { + "epoch": 0.73, + "grad_norm": 0.5536178873684257, + "learning_rate": 4.038495545893643e-06, + "loss": 0.4803, + "step": 1708 + }, + { + "epoch": 0.73, + "grad_norm": 0.5392842977706797, + "learning_rate": 4.037014097914135e-06, + "loss": 0.4682, + "step": 1709 + }, + { + "epoch": 0.73, + "grad_norm": 0.5708019402077392, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.4785, + "step": 1710 + }, + { + "epoch": 0.73, + "grad_norm": 0.6113657504825251, + "learning_rate": 4.034048598161061e-06, + "loss": 0.4797, + "step": 1711 + }, + { + "epoch": 0.73, + "grad_norm": 0.5620870459786824, + "learning_rate": 4.032564548062612e-06, + "loss": 0.472, + "step": 1712 + }, + { + "epoch": 0.73, + "grad_norm": 0.5660234184012428, + "learning_rate": 4.0310796322671144e-06, + "loss": 0.4867, + "step": 1713 + }, + { + "epoch": 0.73, + "grad_norm": 0.5428002908374318, + "learning_rate": 4.029593851613351e-06, + "loss": 0.4772, + "step": 1714 + }, + { + "epoch": 0.73, + "grad_norm": 0.5576743567400213, + "learning_rate": 4.028107206940592e-06, + "loss": 0.484, + "step": 1715 + }, + { + "epoch": 0.73, + "grad_norm": 0.5472128158032424, + "learning_rate": 4.0266196990885955e-06, + "loss": 0.5003, + "step": 1716 + }, + { + "epoch": 0.73, + "grad_norm": 0.5194449059148619, + "learning_rate": 4.025131328897608e-06, + "loss": 0.4718, + "step": 1717 + }, + { + "epoch": 0.73, + "grad_norm": 0.5611014049931291, + "learning_rate": 4.023642097208362e-06, + "loss": 0.4812, + "step": 1718 + }, + { + "epoch": 0.73, + "grad_norm": 0.6211118572373506, + "learning_rate": 4.022152004862079e-06, + "loss": 0.4877, + "step": 1719 + }, + { + "epoch": 0.73, + "grad_norm": 0.541074320013028, + "learning_rate": 4.020661052700462e-06, + "loss": 0.4996, + "step": 1720 + }, + { + "epoch": 0.74, + "grad_norm": 0.5552304848575638, + "learning_rate": 4.019169241565704e-06, + "loss": 0.5153, + "step": 1721 + }, + { + "epoch": 0.74, + "grad_norm": 0.553005883252194, + "learning_rate": 4.0176765723004805e-06, + "loss": 0.4583, + "step": 1722 + }, + { + "epoch": 0.74, + "grad_norm": 0.5609121544313215, + "learning_rate": 4.0161830457479555e-06, + "loss": 0.4524, + "step": 1723 + }, + { + "epoch": 0.74, + "grad_norm": 0.5641653969492663, + "learning_rate": 4.014688662751773e-06, + "loss": 0.4543, + "step": 1724 + }, + { + "epoch": 0.74, + "grad_norm": 0.5328244993549478, + "learning_rate": 4.013193424156062e-06, + "loss": 0.4417, + "step": 1725 + }, + { + "epoch": 0.74, + "grad_norm": 0.5561692361326591, + "learning_rate": 4.011697330805436e-06, + "loss": 0.488, + "step": 1726 + }, + { + "epoch": 0.74, + "grad_norm": 0.5700577799667438, + "learning_rate": 4.010200383544992e-06, + "loss": 0.4964, + "step": 1727 + }, + { + "epoch": 0.74, + "grad_norm": 0.5451521766339095, + "learning_rate": 4.0087025832203065e-06, + "loss": 0.4556, + "step": 1728 + }, + { + "epoch": 0.74, + "grad_norm": 0.5817456153792706, + "learning_rate": 4.007203930677438e-06, + "loss": 0.4983, + "step": 1729 + }, + { + "epoch": 0.74, + "grad_norm": 0.6380188720129754, + "learning_rate": 4.00570442676293e-06, + "loss": 0.4805, + "step": 1730 + }, + { + "epoch": 0.74, + "grad_norm": 0.5205063698001692, + "learning_rate": 4.0042040723238055e-06, + "loss": 0.4659, + "step": 1731 + }, + { + "epoch": 0.74, + "grad_norm": 0.5768671155441834, + "learning_rate": 4.002702868207563e-06, + "loss": 0.4873, + "step": 1732 + }, + { + "epoch": 0.74, + "grad_norm": 0.5494137373354541, + "learning_rate": 4.001200815262188e-06, + "loss": 0.4723, + "step": 1733 + }, + { + "epoch": 0.74, + "grad_norm": 0.5585419846591574, + "learning_rate": 3.999697914336143e-06, + "loss": 0.4873, + "step": 1734 + }, + { + "epoch": 0.74, + "grad_norm": 0.606023064302735, + "learning_rate": 3.9981941662783675e-06, + "loss": 0.5029, + "step": 1735 + }, + { + "epoch": 0.74, + "grad_norm": 0.5623066425478451, + "learning_rate": 3.996689571938282e-06, + "loss": 0.463, + "step": 1736 + }, + { + "epoch": 0.74, + "grad_norm": 0.5971959876411371, + "learning_rate": 3.995184132165783e-06, + "loss": 0.4753, + "step": 1737 + }, + { + "epoch": 0.74, + "grad_norm": 0.54900424681578, + "learning_rate": 3.993677847811247e-06, + "loss": 0.4872, + "step": 1738 + }, + { + "epoch": 0.74, + "grad_norm": 0.574046829299625, + "learning_rate": 3.992170719725524e-06, + "loss": 0.4602, + "step": 1739 + }, + { + "epoch": 0.74, + "grad_norm": 0.588055937602866, + "learning_rate": 3.990662748759946e-06, + "loss": 0.4915, + "step": 1740 + }, + { + "epoch": 0.74, + "grad_norm": 0.5684082620173647, + "learning_rate": 3.989153935766314e-06, + "loss": 0.4765, + "step": 1741 + }, + { + "epoch": 0.74, + "grad_norm": 0.5755931683742708, + "learning_rate": 3.987644281596913e-06, + "loss": 0.4909, + "step": 1742 + }, + { + "epoch": 0.74, + "grad_norm": 0.553216450579215, + "learning_rate": 3.986133787104496e-06, + "loss": 0.4723, + "step": 1743 + }, + { + "epoch": 0.75, + "grad_norm": 0.5366461565908377, + "learning_rate": 3.984622453142293e-06, + "loss": 0.4624, + "step": 1744 + }, + { + "epoch": 0.75, + "grad_norm": 0.5610507340686801, + "learning_rate": 3.983110280564009e-06, + "loss": 0.5002, + "step": 1745 + }, + { + "epoch": 0.75, + "grad_norm": 0.5676547948280201, + "learning_rate": 3.981597270223822e-06, + "loss": 0.4897, + "step": 1746 + }, + { + "epoch": 0.75, + "grad_norm": 0.5702435461535129, + "learning_rate": 3.980083422976386e-06, + "loss": 0.4706, + "step": 1747 + }, + { + "epoch": 0.75, + "grad_norm": 0.5682513989307584, + "learning_rate": 3.978568739676822e-06, + "loss": 0.4631, + "step": 1748 + }, + { + "epoch": 0.75, + "grad_norm": 0.5668309362429137, + "learning_rate": 3.977053221180729e-06, + "loss": 0.4878, + "step": 1749 + }, + { + "epoch": 0.75, + "grad_norm": 0.5630149523518507, + "learning_rate": 3.975536868344174e-06, + "loss": 0.4971, + "step": 1750 + }, + { + "epoch": 0.75, + "grad_norm": 0.5525662343852639, + "learning_rate": 3.974019682023695e-06, + "loss": 0.4765, + "step": 1751 + }, + { + "epoch": 0.75, + "grad_norm": 0.5456185795288315, + "learning_rate": 3.972501663076306e-06, + "loss": 0.4695, + "step": 1752 + }, + { + "epoch": 0.75, + "grad_norm": 0.543284167845247, + "learning_rate": 3.9709828123594855e-06, + "loss": 0.4546, + "step": 1753 + }, + { + "epoch": 0.75, + "grad_norm": 0.5783138022249001, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5026, + "step": 1754 + }, + { + "epoch": 0.75, + "grad_norm": 0.5821103906157934, + "learning_rate": 3.96794261904982e-06, + "loss": 0.4685, + "step": 1755 + }, + { + "epoch": 0.75, + "grad_norm": 0.5776445776955914, + "learning_rate": 3.9664212781742865e-06, + "loss": 0.4696, + "step": 1756 + }, + { + "epoch": 0.75, + "grad_norm": 0.5444444652574508, + "learning_rate": 3.964899108963937e-06, + "loss": 0.4715, + "step": 1757 + }, + { + "epoch": 0.75, + "grad_norm": 0.5791101056295087, + "learning_rate": 3.963376112278597e-06, + "loss": 0.4892, + "step": 1758 + }, + { + "epoch": 0.75, + "grad_norm": 0.5409942798323824, + "learning_rate": 3.96185228897856e-06, + "loss": 0.4679, + "step": 1759 + }, + { + "epoch": 0.75, + "grad_norm": 0.5313906519312775, + "learning_rate": 3.9603276399245864e-06, + "loss": 0.4364, + "step": 1760 + }, + { + "epoch": 0.75, + "grad_norm": 0.5590372661816775, + "learning_rate": 3.9588021659779e-06, + "loss": 0.4478, + "step": 1761 + }, + { + "epoch": 0.75, + "grad_norm": 0.5535499771886758, + "learning_rate": 3.957275868000192e-06, + "loss": 0.4771, + "step": 1762 + }, + { + "epoch": 0.75, + "grad_norm": 0.5343930689867736, + "learning_rate": 3.9557487468536225e-06, + "loss": 0.4735, + "step": 1763 + }, + { + "epoch": 0.75, + "grad_norm": 0.5610690419155818, + "learning_rate": 3.954220803400811e-06, + "loss": 0.4762, + "step": 1764 + }, + { + "epoch": 0.75, + "grad_norm": 0.5305051159735634, + "learning_rate": 3.9526920385048465e-06, + "loss": 0.4619, + "step": 1765 + }, + { + "epoch": 0.75, + "grad_norm": 0.5436426867711756, + "learning_rate": 3.951162453029278e-06, + "loss": 0.4793, + "step": 1766 + }, + { + "epoch": 0.75, + "grad_norm": 0.5326612510661454, + "learning_rate": 3.94963204783812e-06, + "loss": 0.4849, + "step": 1767 + }, + { + "epoch": 0.76, + "grad_norm": 0.5515320255957604, + "learning_rate": 3.948100823795851e-06, + "loss": 0.503, + "step": 1768 + }, + { + "epoch": 0.76, + "grad_norm": 0.5694215728858405, + "learning_rate": 3.946568781767409e-06, + "loss": 0.4945, + "step": 1769 + }, + { + "epoch": 0.76, + "grad_norm": 0.5641781422463842, + "learning_rate": 3.945035922618198e-06, + "loss": 0.478, + "step": 1770 + }, + { + "epoch": 0.76, + "grad_norm": 0.5588846997248246, + "learning_rate": 3.94350224721408e-06, + "loss": 0.4798, + "step": 1771 + }, + { + "epoch": 0.76, + "grad_norm": 0.5569535296640705, + "learning_rate": 3.9419677564213795e-06, + "loss": 0.4866, + "step": 1772 + }, + { + "epoch": 0.76, + "grad_norm": 0.6058189007310192, + "learning_rate": 3.9404324511068825e-06, + "loss": 0.4777, + "step": 1773 + }, + { + "epoch": 0.76, + "grad_norm": 0.5488550968313441, + "learning_rate": 3.938896332137834e-06, + "loss": 0.4885, + "step": 1774 + }, + { + "epoch": 0.76, + "grad_norm": 0.5717371621896641, + "learning_rate": 3.937359400381938e-06, + "loss": 0.481, + "step": 1775 + }, + { + "epoch": 0.76, + "eval_loss": 0.47748449444770813, + "eval_runtime": 6921.6137, + "eval_samples_per_second": 41.954, + "eval_steps_per_second": 2.098, + "step": 1775 + }, + { + "epoch": 0.76, + "grad_norm": 0.5658570817407027, + "learning_rate": 3.935821656707359e-06, + "loss": 0.4739, + "step": 1776 + }, + { + "epoch": 0.76, + "grad_norm": 0.6083719336101632, + "learning_rate": 3.93428310198272e-06, + "loss": 0.5087, + "step": 1777 + }, + { + "epoch": 0.76, + "grad_norm": 0.5358828780708986, + "learning_rate": 3.932743737077101e-06, + "loss": 0.4849, + "step": 1778 + }, + { + "epoch": 0.76, + "grad_norm": 0.550854913111367, + "learning_rate": 3.931203562860042e-06, + "loss": 0.5071, + "step": 1779 + }, + { + "epoch": 0.76, + "grad_norm": 0.5347707663990211, + "learning_rate": 3.929662580201536e-06, + "loss": 0.4794, + "step": 1780 + }, + { + "epoch": 0.76, + "grad_norm": 0.5551468944100317, + "learning_rate": 3.928120789972036e-06, + "loss": 0.4689, + "step": 1781 + }, + { + "epoch": 0.76, + "grad_norm": 0.5727517444264358, + "learning_rate": 3.926578193042451e-06, + "loss": 0.4952, + "step": 1782 + }, + { + "epoch": 0.76, + "grad_norm": 0.5623029835503726, + "learning_rate": 3.9250347902841456e-06, + "loss": 0.4723, + "step": 1783 + }, + { + "epoch": 0.76, + "grad_norm": 0.5452388086485114, + "learning_rate": 3.923490582568937e-06, + "loss": 0.4746, + "step": 1784 + }, + { + "epoch": 0.76, + "grad_norm": 0.525242855196112, + "learning_rate": 3.9219455707691004e-06, + "loss": 0.4802, + "step": 1785 + }, + { + "epoch": 0.76, + "grad_norm": 0.6002892698931297, + "learning_rate": 3.920399755757365e-06, + "loss": 0.4951, + "step": 1786 + }, + { + "epoch": 0.76, + "grad_norm": 0.5602495779757628, + "learning_rate": 3.9188531384069095e-06, + "loss": 0.4791, + "step": 1787 + }, + { + "epoch": 0.76, + "grad_norm": 0.5483232695317782, + "learning_rate": 3.917305719591372e-06, + "loss": 0.4699, + "step": 1788 + }, + { + "epoch": 0.76, + "grad_norm": 0.5742707937304822, + "learning_rate": 3.915757500184838e-06, + "loss": 0.4729, + "step": 1789 + }, + { + "epoch": 0.76, + "grad_norm": 0.5424718840301662, + "learning_rate": 3.91420848106185e-06, + "loss": 0.4983, + "step": 1790 + }, + { + "epoch": 0.77, + "grad_norm": 0.5297073714321522, + "learning_rate": 3.912658663097396e-06, + "loss": 0.4725, + "step": 1791 + }, + { + "epoch": 0.77, + "grad_norm": 0.5391456048094847, + "learning_rate": 3.911108047166924e-06, + "loss": 0.4588, + "step": 1792 + }, + { + "epoch": 0.77, + "grad_norm": 0.5422617623881943, + "learning_rate": 3.909556634146323e-06, + "loss": 0.461, + "step": 1793 + }, + { + "epoch": 0.77, + "grad_norm": 0.5559883587716514, + "learning_rate": 3.908004424911939e-06, + "loss": 0.4981, + "step": 1794 + }, + { + "epoch": 0.77, + "grad_norm": 0.5355343530496602, + "learning_rate": 3.906451420340566e-06, + "loss": 0.4845, + "step": 1795 + }, + { + "epoch": 0.77, + "grad_norm": 0.6731733861479123, + "learning_rate": 3.904897621309446e-06, + "loss": 0.4936, + "step": 1796 + }, + { + "epoch": 0.77, + "grad_norm": 0.5495562245224338, + "learning_rate": 3.9033430286962714e-06, + "loss": 0.4577, + "step": 1797 + }, + { + "epoch": 0.77, + "grad_norm": 0.5574105971150268, + "learning_rate": 3.901787643379183e-06, + "loss": 0.4693, + "step": 1798 + }, + { + "epoch": 0.77, + "grad_norm": 0.5755389775504788, + "learning_rate": 3.900231466236766e-06, + "loss": 0.4844, + "step": 1799 + }, + { + "epoch": 0.77, + "grad_norm": 0.6078323318223101, + "learning_rate": 3.898674498148058e-06, + "loss": 0.4728, + "step": 1800 + }, + { + "epoch": 0.77, + "grad_norm": 0.5907606344024237, + "learning_rate": 3.897116739992539e-06, + "loss": 0.485, + "step": 1801 + }, + { + "epoch": 0.77, + "grad_norm": 0.5509448118458101, + "learning_rate": 3.89555819265014e-06, + "loss": 0.4868, + "step": 1802 + }, + { + "epoch": 0.77, + "grad_norm": 0.5247691225819013, + "learning_rate": 3.893998857001231e-06, + "loss": 0.4919, + "step": 1803 + }, + { + "epoch": 0.77, + "grad_norm": 0.5708932504567875, + "learning_rate": 3.892438733926634e-06, + "loss": 0.4607, + "step": 1804 + }, + { + "epoch": 0.77, + "grad_norm": 0.8363516955156233, + "learning_rate": 3.890877824307611e-06, + "loss": 0.4721, + "step": 1805 + }, + { + "epoch": 0.77, + "grad_norm": 0.5591473348688127, + "learning_rate": 3.889316129025873e-06, + "loss": 0.4905, + "step": 1806 + }, + { + "epoch": 0.77, + "grad_norm": 0.5918209400494773, + "learning_rate": 3.887753648963569e-06, + "loss": 0.473, + "step": 1807 + }, + { + "epoch": 0.77, + "grad_norm": 0.5897196038146727, + "learning_rate": 3.886190385003297e-06, + "loss": 0.4528, + "step": 1808 + }, + { + "epoch": 0.77, + "grad_norm": 0.6070139742560858, + "learning_rate": 3.884626338028093e-06, + "loss": 0.4734, + "step": 1809 + }, + { + "epoch": 0.77, + "grad_norm": 0.6106551950471045, + "learning_rate": 3.883061508921439e-06, + "loss": 0.5053, + "step": 1810 + }, + { + "epoch": 0.77, + "grad_norm": 0.5919165573743996, + "learning_rate": 3.8814958985672564e-06, + "loss": 0.4792, + "step": 1811 + }, + { + "epoch": 0.77, + "grad_norm": 0.5683775432095641, + "learning_rate": 3.87992950784991e-06, + "loss": 0.4869, + "step": 1812 + }, + { + "epoch": 0.77, + "grad_norm": 0.5693276502530639, + "learning_rate": 3.878362337654203e-06, + "loss": 0.4919, + "step": 1813 + }, + { + "epoch": 0.78, + "grad_norm": 0.6456814342581492, + "learning_rate": 3.87679438886538e-06, + "loss": 0.5012, + "step": 1814 + }, + { + "epoch": 0.78, + "grad_norm": 0.6047567254595965, + "learning_rate": 3.875225662369125e-06, + "loss": 0.489, + "step": 1815 + }, + { + "epoch": 0.78, + "grad_norm": 0.6019100134305163, + "learning_rate": 3.8736561590515646e-06, + "loss": 0.4998, + "step": 1816 + }, + { + "epoch": 0.78, + "grad_norm": 0.5455946748258141, + "learning_rate": 3.872085879799258e-06, + "loss": 0.4617, + "step": 1817 + }, + { + "epoch": 0.78, + "grad_norm": 0.593910191752196, + "learning_rate": 3.870514825499208e-06, + "loss": 0.4898, + "step": 1818 + }, + { + "epoch": 0.78, + "grad_norm": 0.6125446125342663, + "learning_rate": 3.868942997038853e-06, + "loss": 0.4781, + "step": 1819 + }, + { + "epoch": 0.78, + "grad_norm": 0.5571000922345232, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.5001, + "step": 1820 + }, + { + "epoch": 0.78, + "grad_norm": 0.6338578928227893, + "learning_rate": 3.865797021189167e-06, + "loss": 0.5032, + "step": 1821 + }, + { + "epoch": 0.78, + "grad_norm": 0.5693059830561665, + "learning_rate": 3.864222875576898e-06, + "loss": 0.4717, + "step": 1822 + }, + { + "epoch": 0.78, + "grad_norm": 0.5713524469999574, + "learning_rate": 3.862647959358447e-06, + "loss": 0.469, + "step": 1823 + }, + { + "epoch": 0.78, + "grad_norm": 0.5469501972304562, + "learning_rate": 3.861072273423434e-06, + "loss": 0.5001, + "step": 1824 + }, + { + "epoch": 0.78, + "grad_norm": 0.5643553209363702, + "learning_rate": 3.859495818661914e-06, + "loss": 0.4752, + "step": 1825 + }, + { + "epoch": 0.78, + "grad_norm": 0.6079497862523476, + "learning_rate": 3.857918595964375e-06, + "loss": 0.4657, + "step": 1826 + }, + { + "epoch": 0.78, + "grad_norm": 0.5940401601609584, + "learning_rate": 3.8563406062217405e-06, + "loss": 0.4794, + "step": 1827 + }, + { + "epoch": 0.78, + "grad_norm": 0.5224134371483142, + "learning_rate": 3.8547618503253685e-06, + "loss": 0.4596, + "step": 1828 + }, + { + "epoch": 0.78, + "grad_norm": 0.5691746499773465, + "learning_rate": 3.8531823291670455e-06, + "loss": 0.4705, + "step": 1829 + }, + { + "epoch": 0.78, + "grad_norm": 0.5709823691070093, + "learning_rate": 3.8516020436389945e-06, + "loss": 0.4802, + "step": 1830 + }, + { + "epoch": 0.78, + "grad_norm": 0.5674686057693449, + "learning_rate": 3.850020994633869e-06, + "loss": 0.5046, + "step": 1831 + }, + { + "epoch": 0.78, + "grad_norm": 0.5942932229729947, + "learning_rate": 3.848439183044751e-06, + "loss": 0.465, + "step": 1832 + }, + { + "epoch": 0.78, + "grad_norm": 0.6007440298375468, + "learning_rate": 3.846856609765158e-06, + "loss": 0.485, + "step": 1833 + }, + { + "epoch": 0.78, + "grad_norm": 0.5905695400663254, + "learning_rate": 3.845273275689035e-06, + "loss": 0.4898, + "step": 1834 + }, + { + "epoch": 0.78, + "grad_norm": 0.5526340123188758, + "learning_rate": 3.843689181710756e-06, + "loss": 0.4895, + "step": 1835 + }, + { + "epoch": 0.78, + "grad_norm": 0.585057913171662, + "learning_rate": 3.842104328725127e-06, + "loss": 0.457, + "step": 1836 + }, + { + "epoch": 0.78, + "grad_norm": 0.5917091841007728, + "learning_rate": 3.8405187176273794e-06, + "loss": 0.4745, + "step": 1837 + }, + { + "epoch": 0.79, + "grad_norm": 0.5831734022479653, + "learning_rate": 3.838932349313176e-06, + "loss": 0.5003, + "step": 1838 + }, + { + "epoch": 0.79, + "grad_norm": 0.5475814395504766, + "learning_rate": 3.837345224678605e-06, + "loss": 0.4875, + "step": 1839 + }, + { + "epoch": 0.79, + "grad_norm": 0.5752248721709146, + "learning_rate": 3.835757344620183e-06, + "loss": 0.4636, + "step": 1840 + }, + { + "epoch": 0.79, + "grad_norm": 0.5901567903691399, + "learning_rate": 3.8341687100348536e-06, + "loss": 0.4924, + "step": 1841 + }, + { + "epoch": 0.79, + "grad_norm": 0.5434020555712842, + "learning_rate": 3.832579321819985e-06, + "loss": 0.4634, + "step": 1842 + }, + { + "epoch": 0.79, + "grad_norm": 0.5492021117737101, + "learning_rate": 3.830989180873373e-06, + "loss": 0.4777, + "step": 1843 + }, + { + "epoch": 0.79, + "grad_norm": 0.5306994578054051, + "learning_rate": 3.829398288093237e-06, + "loss": 0.472, + "step": 1844 + }, + { + "epoch": 0.79, + "grad_norm": 0.5921460420396056, + "learning_rate": 3.827806644378221e-06, + "loss": 0.4951, + "step": 1845 + }, + { + "epoch": 0.79, + "grad_norm": 0.5211955272058552, + "learning_rate": 3.826214250627397e-06, + "loss": 0.4513, + "step": 1846 + }, + { + "epoch": 0.79, + "eval_loss": 0.47622737288475037, + "eval_runtime": 6919.5773, + "eval_samples_per_second": 41.966, + "eval_steps_per_second": 2.098, + "step": 1846 + }, + { + "epoch": 0.79, + "grad_norm": 0.572627752021322, + "learning_rate": 3.824621107740255e-06, + "loss": 0.5055, + "step": 1847 + }, + { + "epoch": 0.79, + "grad_norm": 0.5855256130142836, + "learning_rate": 3.823027216616711e-06, + "loss": 0.4712, + "step": 1848 + }, + { + "epoch": 0.79, + "grad_norm": 0.5330067035136337, + "learning_rate": 3.821432578157105e-06, + "loss": 0.461, + "step": 1849 + }, + { + "epoch": 0.79, + "grad_norm": 0.5379073734644638, + "learning_rate": 3.819837193262197e-06, + "loss": 0.4646, + "step": 1850 + }, + { + "epoch": 0.79, + "grad_norm": 0.5550409399367109, + "learning_rate": 3.818241062833168e-06, + "loss": 0.5092, + "step": 1851 + }, + { + "epoch": 0.79, + "grad_norm": 0.5475883585070609, + "learning_rate": 3.816644187771624e-06, + "loss": 0.4845, + "step": 1852 + }, + { + "epoch": 0.79, + "grad_norm": 0.5304094349485199, + "learning_rate": 3.815046568979585e-06, + "loss": 0.5045, + "step": 1853 + }, + { + "epoch": 0.79, + "grad_norm": 0.5902094724693763, + "learning_rate": 3.8134482073594997e-06, + "loss": 0.4703, + "step": 1854 + }, + { + "epoch": 0.79, + "grad_norm": 0.5373657261604207, + "learning_rate": 3.811849103814229e-06, + "loss": 0.4438, + "step": 1855 + }, + { + "epoch": 0.79, + "grad_norm": 0.5500441237475201, + "learning_rate": 3.8102492592470562e-06, + "loss": 0.4808, + "step": 1856 + }, + { + "epoch": 0.79, + "grad_norm": 0.587067338502479, + "learning_rate": 3.808648674561683e-06, + "loss": 0.484, + "step": 1857 + }, + { + "epoch": 0.79, + "grad_norm": 0.5762212544281782, + "learning_rate": 3.8070473506622283e-06, + "loss": 0.4765, + "step": 1858 + }, + { + "epoch": 0.79, + "grad_norm": 0.5750217699914456, + "learning_rate": 3.80544528845323e-06, + "loss": 0.4695, + "step": 1859 + }, + { + "epoch": 0.79, + "grad_norm": 0.5928903124224453, + "learning_rate": 3.803842488839642e-06, + "loss": 0.4931, + "step": 1860 + }, + { + "epoch": 0.8, + "grad_norm": 0.5211947228388228, + "learning_rate": 3.8022389527268344e-06, + "loss": 0.4671, + "step": 1861 + }, + { + "epoch": 0.8, + "grad_norm": 0.5624138113341162, + "learning_rate": 3.8006346810205935e-06, + "loss": 0.462, + "step": 1862 + }, + { + "epoch": 0.8, + "grad_norm": 0.6055264031899258, + "learning_rate": 3.7990296746271227e-06, + "loss": 0.4951, + "step": 1863 + }, + { + "epoch": 0.8, + "grad_norm": 0.575871000906671, + "learning_rate": 3.797423934453038e-06, + "loss": 0.4613, + "step": 1864 + }, + { + "epoch": 0.8, + "grad_norm": 0.5436757648363332, + "learning_rate": 3.795817461405372e-06, + "loss": 0.495, + "step": 1865 + }, + { + "epoch": 0.8, + "grad_norm": 0.5489759393271353, + "learning_rate": 3.7942102563915693e-06, + "loss": 0.4761, + "step": 1866 + }, + { + "epoch": 0.8, + "grad_norm": 0.5799706124416135, + "learning_rate": 3.79260232031949e-06, + "loss": 0.4992, + "step": 1867 + }, + { + "epoch": 0.8, + "grad_norm": 0.6021574593638155, + "learning_rate": 3.7909936540974052e-06, + "loss": 0.4573, + "step": 1868 + }, + { + "epoch": 0.8, + "grad_norm": 0.5528129676449581, + "learning_rate": 3.7893842586340003e-06, + "loss": 0.515, + "step": 1869 + }, + { + "epoch": 0.8, + "grad_norm": 0.5695311954254395, + "learning_rate": 3.7877741348383703e-06, + "loss": 0.4728, + "step": 1870 + }, + { + "epoch": 0.8, + "grad_norm": 0.5592177344188098, + "learning_rate": 3.7861632836200245e-06, + "loss": 0.4673, + "step": 1871 + }, + { + "epoch": 0.8, + "grad_norm": 0.5611705084883242, + "learning_rate": 3.784551705888881e-06, + "loss": 0.4584, + "step": 1872 + }, + { + "epoch": 0.8, + "grad_norm": 0.5539834069231695, + "learning_rate": 3.7829394025552684e-06, + "loss": 0.4644, + "step": 1873 + }, + { + "epoch": 0.8, + "grad_norm": 0.537446301860318, + "learning_rate": 3.7813263745299257e-06, + "loss": 0.4927, + "step": 1874 + }, + { + "epoch": 0.8, + "grad_norm": 0.5596544860638828, + "learning_rate": 3.779712622724003e-06, + "loss": 0.4647, + "step": 1875 + }, + { + "epoch": 0.8, + "grad_norm": 0.544652883431319, + "learning_rate": 3.7780981480490554e-06, + "loss": 0.4955, + "step": 1876 + }, + { + "epoch": 0.8, + "grad_norm": 0.5512313969653201, + "learning_rate": 3.776482951417049e-06, + "loss": 0.4644, + "step": 1877 + }, + { + "epoch": 0.8, + "grad_norm": 0.5497506195632861, + "learning_rate": 3.774867033740357e-06, + "loss": 0.4789, + "step": 1878 + }, + { + "epoch": 0.8, + "grad_norm": 0.5742672691362793, + "learning_rate": 3.77325039593176e-06, + "loss": 0.4729, + "step": 1879 + }, + { + "epoch": 0.8, + "grad_norm": 0.5629948393220319, + "learning_rate": 3.7716330389044463e-06, + "loss": 0.4723, + "step": 1880 + }, + { + "epoch": 0.8, + "grad_norm": 0.5888605808332452, + "learning_rate": 3.7700149635720086e-06, + "loss": 0.4716, + "step": 1881 + }, + { + "epoch": 0.8, + "grad_norm": 0.5578428602453922, + "learning_rate": 3.768396170848445e-06, + "loss": 0.4958, + "step": 1882 + }, + { + "epoch": 0.8, + "grad_norm": 0.5466856701572237, + "learning_rate": 3.766776661648163e-06, + "loss": 0.4712, + "step": 1883 + }, + { + "epoch": 0.8, + "grad_norm": 0.5650833519106695, + "learning_rate": 3.76515643688597e-06, + "loss": 0.4812, + "step": 1884 + }, + { + "epoch": 0.81, + "grad_norm": 0.5662580501911987, + "learning_rate": 3.76353549747708e-06, + "loss": 0.477, + "step": 1885 + }, + { + "epoch": 0.81, + "grad_norm": 0.5515090242427748, + "learning_rate": 3.76191384433711e-06, + "loss": 0.4784, + "step": 1886 + }, + { + "epoch": 0.81, + "grad_norm": 0.5651933942612933, + "learning_rate": 3.76029147838208e-06, + "loss": 0.4493, + "step": 1887 + }, + { + "epoch": 0.81, + "grad_norm": 0.5534626182885516, + "learning_rate": 3.7586684005284146e-06, + "loss": 0.4823, + "step": 1888 + }, + { + "epoch": 0.81, + "grad_norm": 0.5308600240405152, + "learning_rate": 3.7570446116929372e-06, + "loss": 0.448, + "step": 1889 + }, + { + "epoch": 0.81, + "grad_norm": 0.5654195108385544, + "learning_rate": 3.7554201127928747e-06, + "loss": 0.4679, + "step": 1890 + }, + { + "epoch": 0.81, + "grad_norm": 0.5815086921921774, + "learning_rate": 3.7537949047458567e-06, + "loss": 0.4637, + "step": 1891 + }, + { + "epoch": 0.81, + "grad_norm": 0.5603546362158033, + "learning_rate": 3.7521689884699093e-06, + "loss": 0.4805, + "step": 1892 + }, + { + "epoch": 0.81, + "grad_norm": 0.5835928980888774, + "learning_rate": 3.750542364883462e-06, + "loss": 0.4665, + "step": 1893 + }, + { + "epoch": 0.81, + "grad_norm": 0.5706169202769774, + "learning_rate": 3.748915034905344e-06, + "loss": 0.4889, + "step": 1894 + }, + { + "epoch": 0.81, + "grad_norm": 0.5269442750245277, + "learning_rate": 3.74728699945478e-06, + "loss": 0.465, + "step": 1895 + }, + { + "epoch": 0.81, + "grad_norm": 0.5547808175518558, + "learning_rate": 3.745658259451397e-06, + "loss": 0.4639, + "step": 1896 + }, + { + "epoch": 0.81, + "grad_norm": 0.6267511707684223, + "learning_rate": 3.744028815815219e-06, + "loss": 0.4865, + "step": 1897 + }, + { + "epoch": 0.81, + "grad_norm": 0.5643243359785219, + "learning_rate": 3.742398669466665e-06, + "loss": 0.4897, + "step": 1898 + }, + { + "epoch": 0.81, + "grad_norm": 0.5600481323676141, + "learning_rate": 3.740767821326555e-06, + "loss": 0.4708, + "step": 1899 + }, + { + "epoch": 0.81, + "grad_norm": 0.5421854980261044, + "learning_rate": 3.739136272316102e-06, + "loss": 0.4797, + "step": 1900 + }, + { + "epoch": 0.81, + "grad_norm": 0.5504823302075539, + "learning_rate": 3.737504023356916e-06, + "loss": 0.4477, + "step": 1901 + }, + { + "epoch": 0.81, + "grad_norm": 0.5596996727298843, + "learning_rate": 3.735871075371004e-06, + "loss": 0.4751, + "step": 1902 + }, + { + "epoch": 0.81, + "grad_norm": 0.5490200023377907, + "learning_rate": 3.734237429280766e-06, + "loss": 0.4667, + "step": 1903 + }, + { + "epoch": 0.81, + "grad_norm": 0.6262743935742706, + "learning_rate": 3.7326030860089955e-06, + "loss": 0.4838, + "step": 1904 + }, + { + "epoch": 0.81, + "grad_norm": 0.5824379568686556, + "learning_rate": 3.7309680464788835e-06, + "loss": 0.4905, + "step": 1905 + }, + { + "epoch": 0.81, + "grad_norm": 0.53774534973763, + "learning_rate": 3.72933231161401e-06, + "loss": 0.4701, + "step": 1906 + }, + { + "epoch": 0.81, + "grad_norm": 0.5754065009216981, + "learning_rate": 3.72769588233835e-06, + "loss": 0.5086, + "step": 1907 + }, + { + "epoch": 0.82, + "grad_norm": 0.5438398948047919, + "learning_rate": 3.726058759576271e-06, + "loss": 0.5028, + "step": 1908 + }, + { + "epoch": 0.82, + "grad_norm": 0.5545263081214099, + "learning_rate": 3.724420944252531e-06, + "loss": 0.4742, + "step": 1909 + }, + { + "epoch": 0.82, + "grad_norm": 0.5598750002548621, + "learning_rate": 3.72278243729228e-06, + "loss": 0.4568, + "step": 1910 + }, + { + "epoch": 0.82, + "grad_norm": 0.5657656081137544, + "learning_rate": 3.7211432396210595e-06, + "loss": 0.4478, + "step": 1911 + }, + { + "epoch": 0.82, + "grad_norm": 0.5621097014773546, + "learning_rate": 3.7195033521647987e-06, + "loss": 0.4779, + "step": 1912 + }, + { + "epoch": 0.82, + "grad_norm": 0.5412943721977179, + "learning_rate": 3.7178627758498194e-06, + "loss": 0.4553, + "step": 1913 + }, + { + "epoch": 0.82, + "grad_norm": 0.577522286854261, + "learning_rate": 3.71622151160283e-06, + "loss": 0.4829, + "step": 1914 + }, + { + "epoch": 0.82, + "grad_norm": 0.543384848094331, + "learning_rate": 3.7145795603509282e-06, + "loss": 0.4582, + "step": 1915 + }, + { + "epoch": 0.82, + "grad_norm": 0.5778457930279592, + "learning_rate": 3.712936923021602e-06, + "loss": 0.4713, + "step": 1916 + }, + { + "epoch": 0.82, + "grad_norm": 0.5766320078694117, + "learning_rate": 3.7112936005427237e-06, + "loss": 0.4683, + "step": 1917 + }, + { + "epoch": 0.82, + "eval_loss": 0.4745747447013855, + "eval_runtime": 6921.2455, + "eval_samples_per_second": 41.956, + "eval_steps_per_second": 2.098, + "step": 1917 + }, + { + "epoch": 0.82, + "grad_norm": 0.5432751258589548, + "learning_rate": 3.7096495938425537e-06, + "loss": 0.4674, + "step": 1918 + }, + { + "epoch": 0.82, + "grad_norm": 0.569125410720449, + "learning_rate": 3.7080049038497405e-06, + "loss": 0.4793, + "step": 1919 + }, + { + "epoch": 0.82, + "grad_norm": 0.5681975559829856, + "learning_rate": 3.706359531493316e-06, + "loss": 0.4942, + "step": 1920 + }, + { + "epoch": 0.82, + "grad_norm": 0.5598180203564832, + "learning_rate": 3.704713477702699e-06, + "loss": 0.4869, + "step": 1921 + }, + { + "epoch": 0.82, + "grad_norm": 0.5358569958499817, + "learning_rate": 3.703066743407694e-06, + "loss": 0.493, + "step": 1922 + }, + { + "epoch": 0.82, + "grad_norm": 0.5694771092869512, + "learning_rate": 3.701419329538487e-06, + "loss": 0.4798, + "step": 1923 + }, + { + "epoch": 0.82, + "grad_norm": 0.5515301941280839, + "learning_rate": 3.699771237025652e-06, + "loss": 0.4732, + "step": 1924 + }, + { + "epoch": 0.82, + "grad_norm": 0.5836180114241711, + "learning_rate": 3.6981224668001427e-06, + "loss": 0.4868, + "step": 1925 + }, + { + "epoch": 0.82, + "grad_norm": 0.5724166590716755, + "learning_rate": 3.696473019793297e-06, + "loss": 0.4631, + "step": 1926 + }, + { + "epoch": 0.82, + "grad_norm": 0.5794404466148781, + "learning_rate": 3.694822896936836e-06, + "loss": 0.5093, + "step": 1927 + }, + { + "epoch": 0.82, + "grad_norm": 0.5788548411305892, + "learning_rate": 3.6931720991628613e-06, + "loss": 0.4849, + "step": 1928 + }, + { + "epoch": 0.82, + "grad_norm": 0.5388008666947813, + "learning_rate": 3.691520627403856e-06, + "loss": 0.4653, + "step": 1929 + }, + { + "epoch": 0.82, + "grad_norm": 0.548773537353004, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.4823, + "step": 1930 + }, + { + "epoch": 0.82, + "grad_norm": 0.5588102780031893, + "learning_rate": 3.68821566566259e-06, + "loss": 0.4644, + "step": 1931 + }, + { + "epoch": 0.83, + "grad_norm": 0.5614220676467329, + "learning_rate": 3.686562177547197e-06, + "loss": 0.4392, + "step": 1932 + }, + { + "epoch": 0.83, + "grad_norm": 0.57266859860218, + "learning_rate": 3.6849080191805087e-06, + "loss": 0.4784, + "step": 1933 + }, + { + "epoch": 0.83, + "grad_norm": 0.5752230473014237, + "learning_rate": 3.683253191496906e-06, + "loss": 0.4782, + "step": 1934 + }, + { + "epoch": 0.83, + "grad_norm": 0.5692017163618623, + "learning_rate": 3.681597695431149e-06, + "loss": 0.4845, + "step": 1935 + }, + { + "epoch": 0.83, + "grad_norm": 0.5999017797504378, + "learning_rate": 3.6799415319183753e-06, + "loss": 0.5121, + "step": 1936 + }, + { + "epoch": 0.83, + "grad_norm": 0.5542057486345733, + "learning_rate": 3.678284701894097e-06, + "loss": 0.4545, + "step": 1937 + }, + { + "epoch": 0.83, + "grad_norm": 0.5744953689894631, + "learning_rate": 3.6766272062942066e-06, + "loss": 0.4673, + "step": 1938 + }, + { + "epoch": 0.83, + "grad_norm": 0.5519474373087149, + "learning_rate": 3.6749690460549704e-06, + "loss": 0.4768, + "step": 1939 + }, + { + "epoch": 0.83, + "grad_norm": 0.5308709424185206, + "learning_rate": 3.6733102221130303e-06, + "loss": 0.4675, + "step": 1940 + }, + { + "epoch": 0.83, + "grad_norm": 0.5543162922631799, + "learning_rate": 3.6716507354054044e-06, + "loss": 0.4884, + "step": 1941 + }, + { + "epoch": 0.83, + "grad_norm": 0.5525721347222098, + "learning_rate": 3.669990586869482e-06, + "loss": 0.4633, + "step": 1942 + }, + { + "epoch": 0.83, + "grad_norm": 0.5676565038664707, + "learning_rate": 3.6683297774430287e-06, + "loss": 0.4915, + "step": 1943 + }, + { + "epoch": 0.83, + "grad_norm": 0.5354820116105071, + "learning_rate": 3.6666683080641846e-06, + "loss": 0.471, + "step": 1944 + }, + { + "epoch": 0.83, + "grad_norm": 0.5521553904563508, + "learning_rate": 3.6650061796714597e-06, + "loss": 0.4683, + "step": 1945 + }, + { + "epoch": 0.83, + "grad_norm": 0.5853163676085599, + "learning_rate": 3.6633433932037376e-06, + "loss": 0.5075, + "step": 1946 + }, + { + "epoch": 0.83, + "grad_norm": 0.5751517974852843, + "learning_rate": 3.661679949600275e-06, + "loss": 0.4778, + "step": 1947 + }, + { + "epoch": 0.83, + "grad_norm": 0.5440950912602847, + "learning_rate": 3.6600158498006955e-06, + "loss": 0.4706, + "step": 1948 + }, + { + "epoch": 0.83, + "grad_norm": 0.5476897010825572, + "learning_rate": 3.6583510947449983e-06, + "loss": 0.4612, + "step": 1949 + }, + { + "epoch": 0.83, + "grad_norm": 0.5562096216167524, + "learning_rate": 3.656685685373552e-06, + "loss": 0.4692, + "step": 1950 + }, + { + "epoch": 0.83, + "grad_norm": 0.551268497700442, + "learning_rate": 3.6550196226270894e-06, + "loss": 0.4598, + "step": 1951 + }, + { + "epoch": 0.83, + "grad_norm": 0.5391818232642948, + "learning_rate": 3.65335290744672e-06, + "loss": 0.4695, + "step": 1952 + }, + { + "epoch": 0.83, + "grad_norm": 0.5338996664057097, + "learning_rate": 3.6516855407739164e-06, + "loss": 0.4665, + "step": 1953 + }, + { + "epoch": 0.83, + "grad_norm": 0.5579648908867901, + "learning_rate": 3.6500175235505226e-06, + "loss": 0.4883, + "step": 1954 + }, + { + "epoch": 0.84, + "grad_norm": 0.5331291112438312, + "learning_rate": 3.6483488567187473e-06, + "loss": 0.4836, + "step": 1955 + }, + { + "epoch": 0.84, + "grad_norm": 0.556976603909284, + "learning_rate": 3.646679541221168e-06, + "loss": 0.4893, + "step": 1956 + }, + { + "epoch": 0.84, + "grad_norm": 0.5623081423655151, + "learning_rate": 3.6450095780007277e-06, + "loss": 0.4966, + "step": 1957 + }, + { + "epoch": 0.84, + "grad_norm": 0.5490180827544557, + "learning_rate": 3.643338968000736e-06, + "loss": 0.4592, + "step": 1958 + }, + { + "epoch": 0.84, + "grad_norm": 0.5673115477683868, + "learning_rate": 3.641667712164867e-06, + "loss": 0.488, + "step": 1959 + }, + { + "epoch": 0.84, + "grad_norm": 0.5550807543029536, + "learning_rate": 3.6399958114371597e-06, + "loss": 0.477, + "step": 1960 + }, + { + "epoch": 0.84, + "grad_norm": 0.5857922488282928, + "learning_rate": 3.6383232667620195e-06, + "loss": 0.4715, + "step": 1961 + }, + { + "epoch": 0.84, + "grad_norm": 1.31408407563173, + "learning_rate": 3.6366500790842113e-06, + "loss": 0.4798, + "step": 1962 + }, + { + "epoch": 0.84, + "grad_norm": 0.5616626137169414, + "learning_rate": 3.634976249348867e-06, + "loss": 0.4779, + "step": 1963 + }, + { + "epoch": 0.84, + "grad_norm": 0.5426687056510846, + "learning_rate": 3.633301778501481e-06, + "loss": 0.478, + "step": 1964 + }, + { + "epoch": 0.84, + "grad_norm": 0.5819270419939396, + "learning_rate": 3.631626667487906e-06, + "loss": 0.4819, + "step": 1965 + }, + { + "epoch": 0.84, + "grad_norm": 0.5530881380000352, + "learning_rate": 3.6299509172543616e-06, + "loss": 0.4939, + "step": 1966 + }, + { + "epoch": 0.84, + "grad_norm": 0.5705862965767243, + "learning_rate": 3.628274528747424e-06, + "loss": 0.4805, + "step": 1967 + }, + { + "epoch": 0.84, + "grad_norm": 0.6039400830717704, + "learning_rate": 3.6265975029140334e-06, + "loss": 0.4728, + "step": 1968 + }, + { + "epoch": 0.84, + "grad_norm": 0.5576346901828154, + "learning_rate": 3.624919840701488e-06, + "loss": 0.4876, + "step": 1969 + }, + { + "epoch": 0.84, + "grad_norm": 0.58645769749298, + "learning_rate": 3.623241543057445e-06, + "loss": 0.4852, + "step": 1970 + }, + { + "epoch": 0.84, + "grad_norm": 0.5694595348460465, + "learning_rate": 3.6215626109299218e-06, + "loss": 0.4943, + "step": 1971 + }, + { + "epoch": 0.84, + "grad_norm": 0.5505430973110726, + "learning_rate": 3.6198830452672944e-06, + "loss": 0.4823, + "step": 1972 + }, + { + "epoch": 0.84, + "grad_norm": 0.5666099153148945, + "learning_rate": 3.618202847018296e-06, + "loss": 0.4794, + "step": 1973 + }, + { + "epoch": 0.84, + "grad_norm": 0.5649501462357687, + "learning_rate": 3.616522017132017e-06, + "loss": 0.4729, + "step": 1974 + }, + { + "epoch": 0.84, + "grad_norm": 0.6007742923118407, + "learning_rate": 3.614840556557905e-06, + "loss": 0.4757, + "step": 1975 + }, + { + "epoch": 0.84, + "grad_norm": 0.5766917548962726, + "learning_rate": 3.613158466245763e-06, + "loss": 0.4645, + "step": 1976 + }, + { + "epoch": 0.84, + "grad_norm": 0.5540384171742403, + "learning_rate": 3.6114757471457514e-06, + "loss": 0.4525, + "step": 1977 + }, + { + "epoch": 0.85, + "grad_norm": 0.5488107248965611, + "learning_rate": 3.6097924002083838e-06, + "loss": 0.4632, + "step": 1978 + }, + { + "epoch": 0.85, + "grad_norm": 0.556985216041662, + "learning_rate": 3.60810842638453e-06, + "loss": 0.4871, + "step": 1979 + }, + { + "epoch": 0.85, + "grad_norm": 0.5302372510269003, + "learning_rate": 3.606423826625414e-06, + "loss": 0.4819, + "step": 1980 + }, + { + "epoch": 0.85, + "grad_norm": 0.5650443172227675, + "learning_rate": 3.604738601882612e-06, + "loss": 0.4668, + "step": 1981 + }, + { + "epoch": 0.85, + "grad_norm": 0.5687893746146396, + "learning_rate": 3.6030527531080533e-06, + "loss": 0.4668, + "step": 1982 + }, + { + "epoch": 0.85, + "grad_norm": 0.5483149007131946, + "learning_rate": 3.6013662812540217e-06, + "loss": 0.4633, + "step": 1983 + }, + { + "epoch": 0.85, + "grad_norm": 0.5666223715356314, + "learning_rate": 3.5996791872731508e-06, + "loss": 0.4996, + "step": 1984 + }, + { + "epoch": 0.85, + "grad_norm": 0.5610496348612013, + "learning_rate": 3.5979914721184263e-06, + "loss": 0.4638, + "step": 1985 + }, + { + "epoch": 0.85, + "grad_norm": 0.556813398196998, + "learning_rate": 3.5963031367431856e-06, + "loss": 0.489, + "step": 1986 + }, + { + "epoch": 0.85, + "grad_norm": 0.555985238033488, + "learning_rate": 3.594614182101115e-06, + "loss": 0.4644, + "step": 1987 + }, + { + "epoch": 0.85, + "grad_norm": 1.2268257169954715, + "learning_rate": 3.592924609146251e-06, + "loss": 0.4715, + "step": 1988 + }, + { + "epoch": 0.85, + "eval_loss": 0.4732716977596283, + "eval_runtime": 6918.9919, + "eval_samples_per_second": 41.97, + "eval_steps_per_second": 2.099, + "step": 1988 + }, + { + "epoch": 0.85, + "grad_norm": 0.5830835777523167, + "learning_rate": 3.5912344188329812e-06, + "loss": 0.4622, + "step": 1989 + }, + { + "epoch": 0.85, + "grad_norm": 0.5532809255407818, + "learning_rate": 3.5895436121160388e-06, + "loss": 0.5057, + "step": 1990 + }, + { + "epoch": 0.85, + "grad_norm": 0.5548636273482329, + "learning_rate": 3.5878521899505083e-06, + "loss": 0.4717, + "step": 1991 + }, + { + "epoch": 0.85, + "grad_norm": 0.5756081970951604, + "learning_rate": 3.5861601532918188e-06, + "loss": 0.4828, + "step": 1992 + }, + { + "epoch": 0.85, + "grad_norm": 0.561307751747662, + "learning_rate": 3.58446750309575e-06, + "loss": 0.471, + "step": 1993 + }, + { + "epoch": 0.85, + "grad_norm": 0.5716452195722197, + "learning_rate": 3.5827742403184246e-06, + "loss": 0.4799, + "step": 1994 + }, + { + "epoch": 0.85, + "grad_norm": 0.6374115187137489, + "learning_rate": 3.5810803659163136e-06, + "loss": 0.4685, + "step": 1995 + }, + { + "epoch": 0.85, + "grad_norm": 0.5584668047661877, + "learning_rate": 3.579385880846232e-06, + "loss": 0.4733, + "step": 1996 + }, + { + "epoch": 0.85, + "grad_norm": 0.5582439256078539, + "learning_rate": 3.577690786065343e-06, + "loss": 0.4657, + "step": 1997 + }, + { + "epoch": 0.85, + "grad_norm": 0.5551491606524519, + "learning_rate": 3.5759950825311497e-06, + "loss": 0.499, + "step": 1998 + }, + { + "epoch": 0.85, + "grad_norm": 0.6015108198668244, + "learning_rate": 3.5742987712015016e-06, + "loss": 0.4971, + "step": 1999 + }, + { + "epoch": 0.85, + "grad_norm": 0.5552048556458112, + "learning_rate": 3.5726018530345913e-06, + "loss": 0.4661, + "step": 2000 + }, + { + "epoch": 0.85, + "grad_norm": 0.5346633218936818, + "learning_rate": 3.5709043289889538e-06, + "loss": 0.4608, + "step": 2001 + }, + { + "epoch": 0.86, + "grad_norm": 0.5373071208933639, + "learning_rate": 3.5692062000234663e-06, + "loss": 0.4647, + "step": 2002 + }, + { + "epoch": 0.86, + "grad_norm": 0.5416519357021035, + "learning_rate": 3.5675074670973485e-06, + "loss": 0.4738, + "step": 2003 + }, + { + "epoch": 0.86, + "grad_norm": 0.569652388343622, + "learning_rate": 3.565808131170161e-06, + "loss": 0.4749, + "step": 2004 + }, + { + "epoch": 0.86, + "grad_norm": 0.540670625527253, + "learning_rate": 3.564108193201804e-06, + "loss": 0.4843, + "step": 2005 + }, + { + "epoch": 0.86, + "grad_norm": 0.5637489546305884, + "learning_rate": 3.562407654152518e-06, + "loss": 0.4718, + "step": 2006 + }, + { + "epoch": 0.86, + "grad_norm": 0.5563549147978012, + "learning_rate": 3.5607065149828845e-06, + "loss": 0.4862, + "step": 2007 + }, + { + "epoch": 0.86, + "grad_norm": 0.5371061058438552, + "learning_rate": 3.559004776653823e-06, + "loss": 0.4974, + "step": 2008 + }, + { + "epoch": 0.86, + "grad_norm": 0.544463201871563, + "learning_rate": 3.557302440126591e-06, + "loss": 0.4674, + "step": 2009 + }, + { + "epoch": 0.86, + "grad_norm": 0.5563948529476817, + "learning_rate": 3.5555995063627842e-06, + "loss": 0.4703, + "step": 2010 + }, + { + "epoch": 0.86, + "grad_norm": 0.5433738422335685, + "learning_rate": 3.5538959763243363e-06, + "loss": 0.4801, + "step": 2011 + }, + { + "epoch": 0.86, + "grad_norm": 0.5616100198019316, + "learning_rate": 3.552191850973517e-06, + "loss": 0.496, + "step": 2012 + }, + { + "epoch": 0.86, + "grad_norm": 0.5891584297064337, + "learning_rate": 3.550487131272933e-06, + "loss": 0.4766, + "step": 2013 + }, + { + "epoch": 0.86, + "grad_norm": 0.5684784781174675, + "learning_rate": 3.5487818181855253e-06, + "loss": 0.4633, + "step": 2014 + }, + { + "epoch": 0.86, + "grad_norm": 0.544328038939811, + "learning_rate": 3.5470759126745726e-06, + "loss": 0.4694, + "step": 2015 + }, + { + "epoch": 0.86, + "grad_norm": 0.5505644001026635, + "learning_rate": 3.545369415703685e-06, + "loss": 0.4834, + "step": 2016 + }, + { + "epoch": 0.86, + "grad_norm": 0.605202877335372, + "learning_rate": 3.54366232823681e-06, + "loss": 0.4879, + "step": 2017 + }, + { + "epoch": 0.86, + "grad_norm": 0.5543782398842882, + "learning_rate": 3.5419546512382264e-06, + "loss": 0.461, + "step": 2018 + }, + { + "epoch": 0.86, + "grad_norm": 0.564246989631432, + "learning_rate": 3.540246385672547e-06, + "loss": 0.4655, + "step": 2019 + }, + { + "epoch": 0.86, + "grad_norm": 0.5286400658391471, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.4352, + "step": 2020 + }, + { + "epoch": 0.86, + "grad_norm": 0.5577835291298394, + "learning_rate": 3.536828092700012e-06, + "loss": 0.4591, + "step": 2021 + }, + { + "epoch": 0.86, + "grad_norm": 0.5649431678973527, + "learning_rate": 3.5351180672240413e-06, + "loss": 0.4604, + "step": 2022 + }, + { + "epoch": 0.86, + "grad_norm": 0.5672528493306068, + "learning_rate": 3.5334074570427444e-06, + "loss": 0.5038, + "step": 2023 + }, + { + "epoch": 0.86, + "grad_norm": 0.5562216889012358, + "learning_rate": 3.5316962631223896e-06, + "loss": 0.4651, + "step": 2024 + }, + { + "epoch": 0.87, + "grad_norm": 0.5639239221337747, + "learning_rate": 3.5299844864295773e-06, + "loss": 0.438, + "step": 2025 + }, + { + "epoch": 0.87, + "grad_norm": 0.5501230990120523, + "learning_rate": 3.5282721279312343e-06, + "loss": 0.4518, + "step": 2026 + }, + { + "epoch": 0.87, + "grad_norm": 0.565754118751582, + "learning_rate": 3.5265591885946184e-06, + "loss": 0.5105, + "step": 2027 + }, + { + "epoch": 0.87, + "grad_norm": 0.5488876736412479, + "learning_rate": 3.5248456693873152e-06, + "loss": 0.4705, + "step": 2028 + }, + { + "epoch": 0.87, + "grad_norm": 0.567410215517672, + "learning_rate": 3.523131571277235e-06, + "loss": 0.4649, + "step": 2029 + }, + { + "epoch": 0.87, + "grad_norm": 0.56236312948077, + "learning_rate": 3.5214168952326205e-06, + "loss": 0.4625, + "step": 2030 + }, + { + "epoch": 0.87, + "grad_norm": 0.5713132266773925, + "learning_rate": 3.519701642222036e-06, + "loss": 0.4613, + "step": 2031 + }, + { + "epoch": 0.87, + "grad_norm": 0.5586906230581218, + "learning_rate": 3.5179858132143727e-06, + "loss": 0.4749, + "step": 2032 + }, + { + "epoch": 0.87, + "grad_norm": 0.5452698201212837, + "learning_rate": 3.5162694091788506e-06, + "loss": 0.4704, + "step": 2033 + }, + { + "epoch": 0.87, + "grad_norm": 0.5472112238723226, + "learning_rate": 3.5145524310850088e-06, + "loss": 0.4723, + "step": 2034 + }, + { + "epoch": 0.87, + "grad_norm": 0.5239386822886098, + "learning_rate": 3.5128348799027157e-06, + "loss": 0.4832, + "step": 2035 + }, + { + "epoch": 0.87, + "grad_norm": 0.5582354151966005, + "learning_rate": 3.5111167566021607e-06, + "loss": 0.4825, + "step": 2036 + }, + { + "epoch": 0.87, + "grad_norm": 0.5661871633973496, + "learning_rate": 3.509398062153857e-06, + "loss": 0.4808, + "step": 2037 + }, + { + "epoch": 0.87, + "grad_norm": 0.5450889485025441, + "learning_rate": 3.507678797528641e-06, + "loss": 0.4908, + "step": 2038 + }, + { + "epoch": 0.87, + "grad_norm": 0.5299540248896896, + "learning_rate": 3.5059589636976704e-06, + "loss": 0.4392, + "step": 2039 + }, + { + "epoch": 0.87, + "grad_norm": 0.5347817988173922, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.4673, + "step": 2040 + }, + { + "epoch": 0.87, + "grad_norm": 0.545451063247587, + "learning_rate": 3.5025175923047034e-06, + "loss": 0.4531, + "step": 2041 + }, + { + "epoch": 0.87, + "grad_norm": 0.657050095176287, + "learning_rate": 3.5007960566866296e-06, + "loss": 0.4599, + "step": 2042 + }, + { + "epoch": 0.87, + "grad_norm": 0.5948751733409405, + "learning_rate": 3.499073955750642e-06, + "loss": 0.4796, + "step": 2043 + }, + { + "epoch": 0.87, + "grad_norm": 0.5467310616780172, + "learning_rate": 3.497351290469503e-06, + "loss": 0.5012, + "step": 2044 + }, + { + "epoch": 0.87, + "grad_norm": 0.538398730069935, + "learning_rate": 3.4956280618162887e-06, + "loss": 0.4799, + "step": 2045 + }, + { + "epoch": 0.87, + "grad_norm": 0.5563044950019395, + "learning_rate": 3.4939042707643983e-06, + "loss": 0.4456, + "step": 2046 + }, + { + "epoch": 0.87, + "grad_norm": 0.5552862407440945, + "learning_rate": 3.492179918287547e-06, + "loss": 0.4715, + "step": 2047 + }, + { + "epoch": 0.87, + "grad_norm": 0.5443648119446477, + "learning_rate": 3.4904550053597646e-06, + "loss": 0.4737, + "step": 2048 + }, + { + "epoch": 0.88, + "grad_norm": 0.533397172596125, + "learning_rate": 3.488729532955401e-06, + "loss": 0.4327, + "step": 2049 + }, + { + "epoch": 0.88, + "grad_norm": 0.5290432865271623, + "learning_rate": 3.4870035020491216e-06, + "loss": 0.4568, + "step": 2050 + }, + { + "epoch": 0.88, + "grad_norm": 0.5653745834920318, + "learning_rate": 3.4852769136159047e-06, + "loss": 0.4719, + "step": 2051 + }, + { + "epoch": 0.88, + "grad_norm": 3.260354916091318, + "learning_rate": 3.4835497686310458e-06, + "loss": 0.4502, + "step": 2052 + }, + { + "epoch": 0.88, + "grad_norm": 0.5407916301520294, + "learning_rate": 3.4818220680701554e-06, + "loss": 0.493, + "step": 2053 + }, + { + "epoch": 0.88, + "grad_norm": 0.5530804605769145, + "learning_rate": 3.480093812909155e-06, + "loss": 0.4591, + "step": 2054 + }, + { + "epoch": 0.88, + "grad_norm": 0.5560882191153973, + "learning_rate": 3.4783650041242823e-06, + "loss": 0.4814, + "step": 2055 + }, + { + "epoch": 0.88, + "grad_norm": 0.5491339314657214, + "learning_rate": 3.4766356426920854e-06, + "loss": 0.459, + "step": 2056 + }, + { + "epoch": 0.88, + "grad_norm": 0.5446665246830417, + "learning_rate": 3.474905729589427e-06, + "loss": 0.4721, + "step": 2057 + }, + { + "epoch": 0.88, + "grad_norm": 0.5597955365837155, + "learning_rate": 3.4731752657934793e-06, + "loss": 0.4657, + "step": 2058 + }, + { + "epoch": 0.88, + "grad_norm": 0.5257446537011154, + "learning_rate": 3.471444252281726e-06, + "loss": 0.4791, + "step": 2059 + }, + { + "epoch": 0.88, + "eval_loss": 0.47191861271858215, + "eval_runtime": 6926.5022, + "eval_samples_per_second": 41.924, + "eval_steps_per_second": 2.096, + "step": 2059 + }, + { + "epoch": 0.88, + "grad_norm": 0.5550797881194796, + "learning_rate": 3.469712690031962e-06, + "loss": 0.4586, + "step": 2060 + }, + { + "epoch": 0.88, + "grad_norm": 0.5435199223357358, + "learning_rate": 3.467980580022293e-06, + "loss": 0.4572, + "step": 2061 + }, + { + "epoch": 0.88, + "grad_norm": 0.5445376220038961, + "learning_rate": 3.466247923231131e-06, + "loss": 0.4535, + "step": 2062 + }, + { + "epoch": 0.88, + "grad_norm": 0.5438364103684316, + "learning_rate": 3.4645147206371997e-06, + "loss": 0.469, + "step": 2063 + }, + { + "epoch": 0.88, + "grad_norm": 0.5622544652606651, + "learning_rate": 3.4627809732195306e-06, + "loss": 0.5053, + "step": 2064 + }, + { + "epoch": 0.88, + "grad_norm": 0.5787220304673627, + "learning_rate": 3.4610466819574617e-06, + "loss": 0.4916, + "step": 2065 + }, + { + "epoch": 0.88, + "grad_norm": 0.5665331755731509, + "learning_rate": 3.45931184783064e-06, + "loss": 0.461, + "step": 2066 + }, + { + "epoch": 0.88, + "grad_norm": 0.5465798109205211, + "learning_rate": 3.4575764718190174e-06, + "loss": 0.4681, + "step": 2067 + }, + { + "epoch": 0.88, + "grad_norm": 0.5519620649686449, + "learning_rate": 3.455840554902853e-06, + "loss": 0.4842, + "step": 2068 + }, + { + "epoch": 0.88, + "grad_norm": 0.5679253676438992, + "learning_rate": 3.4541040980627117e-06, + "loss": 0.4714, + "step": 2069 + }, + { + "epoch": 0.88, + "grad_norm": 0.5537751246266448, + "learning_rate": 3.4523671022794612e-06, + "loss": 0.4746, + "step": 2070 + }, + { + "epoch": 0.88, + "grad_norm": 0.5895813898958205, + "learning_rate": 3.450629568534277e-06, + "loss": 0.483, + "step": 2071 + }, + { + "epoch": 0.89, + "grad_norm": 0.5820282697400692, + "learning_rate": 3.448891497808636e-06, + "loss": 0.4948, + "step": 2072 + }, + { + "epoch": 0.89, + "grad_norm": 0.5849936796434134, + "learning_rate": 3.4471528910843193e-06, + "loss": 0.4604, + "step": 2073 + }, + { + "epoch": 0.89, + "grad_norm": 0.5994615032661528, + "learning_rate": 3.4454137493434107e-06, + "loss": 0.4893, + "step": 2074 + }, + { + "epoch": 0.89, + "grad_norm": 0.5648890671471385, + "learning_rate": 3.443674073568296e-06, + "loss": 0.4761, + "step": 2075 + }, + { + "epoch": 0.89, + "grad_norm": 0.5800587534232966, + "learning_rate": 3.441933864741663e-06, + "loss": 0.4747, + "step": 2076 + }, + { + "epoch": 0.89, + "grad_norm": 0.5515407287133164, + "learning_rate": 3.4401931238464996e-06, + "loss": 0.4339, + "step": 2077 + }, + { + "epoch": 0.89, + "grad_norm": 0.5814192658954972, + "learning_rate": 3.438451851866097e-06, + "loss": 0.4769, + "step": 2078 + }, + { + "epoch": 0.89, + "grad_norm": 0.5393565470119203, + "learning_rate": 3.4367100497840416e-06, + "loss": 0.4843, + "step": 2079 + }, + { + "epoch": 0.89, + "grad_norm": 0.5789119658212182, + "learning_rate": 3.4349677185842246e-06, + "loss": 0.4767, + "step": 2080 + }, + { + "epoch": 0.89, + "grad_norm": 0.5605931723414512, + "learning_rate": 3.433224859250832e-06, + "loss": 0.4735, + "step": 2081 + }, + { + "epoch": 0.89, + "grad_norm": 0.6040063691004713, + "learning_rate": 3.4314814727683506e-06, + "loss": 0.4968, + "step": 2082 + }, + { + "epoch": 0.89, + "grad_norm": 0.5148532844857273, + "learning_rate": 3.429737560121564e-06, + "loss": 0.4516, + "step": 2083 + }, + { + "epoch": 0.89, + "grad_norm": 0.5936934662433498, + "learning_rate": 3.427993122295552e-06, + "loss": 0.4827, + "step": 2084 + }, + { + "epoch": 0.89, + "grad_norm": 0.5666076166587783, + "learning_rate": 3.4262481602756937e-06, + "loss": 0.4535, + "step": 2085 + }, + { + "epoch": 0.89, + "grad_norm": 0.542895247264907, + "learning_rate": 3.4245026750476618e-06, + "loss": 0.4531, + "step": 2086 + }, + { + "epoch": 0.89, + "grad_norm": 0.541516478103584, + "learning_rate": 3.4227566675974256e-06, + "loss": 0.4892, + "step": 2087 + }, + { + "epoch": 0.89, + "grad_norm": 0.5605571680518421, + "learning_rate": 3.421010138911249e-06, + "loss": 0.4508, + "step": 2088 + }, + { + "epoch": 0.89, + "grad_norm": 0.5849102411377991, + "learning_rate": 3.4192630899756924e-06, + "loss": 0.4528, + "step": 2089 + }, + { + "epoch": 0.89, + "grad_norm": 0.5336283093255381, + "learning_rate": 3.4175155217776057e-06, + "loss": 0.4516, + "step": 2090 + }, + { + "epoch": 0.89, + "grad_norm": 0.5558368518531489, + "learning_rate": 3.4157674353041358e-06, + "loss": 0.4703, + "step": 2091 + }, + { + "epoch": 0.89, + "grad_norm": 0.5552349223460866, + "learning_rate": 3.4140188315427216e-06, + "loss": 0.497, + "step": 2092 + }, + { + "epoch": 0.89, + "grad_norm": 0.5511369795033969, + "learning_rate": 3.4122697114810934e-06, + "loss": 0.5119, + "step": 2093 + }, + { + "epoch": 0.89, + "grad_norm": 0.5583288164174555, + "learning_rate": 3.410520076107273e-06, + "loss": 0.4827, + "step": 2094 + }, + { + "epoch": 0.9, + "grad_norm": 0.567817131338669, + "learning_rate": 3.4087699264095746e-06, + "loss": 0.4634, + "step": 2095 + }, + { + "epoch": 0.9, + "grad_norm": 0.5815719817752771, + "learning_rate": 3.4070192633766025e-06, + "loss": 0.4886, + "step": 2096 + }, + { + "epoch": 0.9, + "grad_norm": 0.5479218943057999, + "learning_rate": 3.405268087997251e-06, + "loss": 0.4886, + "step": 2097 + }, + { + "epoch": 0.9, + "grad_norm": 0.5538188239242408, + "learning_rate": 3.4035164012607013e-06, + "loss": 0.4459, + "step": 2098 + }, + { + "epoch": 0.9, + "grad_norm": 0.54601055265423, + "learning_rate": 3.401764204156428e-06, + "loss": 0.4682, + "step": 2099 + }, + { + "epoch": 0.9, + "grad_norm": 0.5752200802837831, + "learning_rate": 3.4000114976741905e-06, + "loss": 0.4858, + "step": 2100 + }, + { + "epoch": 0.9, + "grad_norm": 0.5329434779873036, + "learning_rate": 3.3982582828040373e-06, + "loss": 0.4508, + "step": 2101 + }, + { + "epoch": 0.9, + "grad_norm": 0.5626679379467617, + "learning_rate": 3.3965045605363036e-06, + "loss": 0.4974, + "step": 2102 + }, + { + "epoch": 0.9, + "grad_norm": 0.5227078861625193, + "learning_rate": 3.3947503318616117e-06, + "loss": 0.447, + "step": 2103 + }, + { + "epoch": 0.9, + "grad_norm": 0.5318507506941723, + "learning_rate": 3.3929955977708686e-06, + "loss": 0.4611, + "step": 2104 + }, + { + "epoch": 0.9, + "grad_norm": 0.5652337604907314, + "learning_rate": 3.391240359255269e-06, + "loss": 0.4825, + "step": 2105 + }, + { + "epoch": 0.9, + "grad_norm": 0.5661117477966576, + "learning_rate": 3.3894846173062917e-06, + "loss": 0.4569, + "step": 2106 + }, + { + "epoch": 0.9, + "grad_norm": 0.5225946246169721, + "learning_rate": 3.3877283729156983e-06, + "loss": 0.4646, + "step": 2107 + }, + { + "epoch": 0.9, + "grad_norm": 0.522801955720487, + "learning_rate": 3.385971627075537e-06, + "loss": 0.4846, + "step": 2108 + }, + { + "epoch": 0.9, + "grad_norm": 0.5654497625396676, + "learning_rate": 3.3842143807781363e-06, + "loss": 0.4641, + "step": 2109 + }, + { + "epoch": 0.9, + "grad_norm": 0.553968034176698, + "learning_rate": 3.38245663501611e-06, + "loss": 0.4771, + "step": 2110 + }, + { + "epoch": 0.9, + "grad_norm": 0.5512083756758241, + "learning_rate": 3.3806983907823526e-06, + "loss": 0.4711, + "step": 2111 + }, + { + "epoch": 0.9, + "grad_norm": 0.5612387241279982, + "learning_rate": 3.378939649070039e-06, + "loss": 0.4929, + "step": 2112 + }, + { + "epoch": 0.9, + "grad_norm": 0.5634378274470825, + "learning_rate": 3.3771804108726294e-06, + "loss": 0.4668, + "step": 2113 + }, + { + "epoch": 0.9, + "grad_norm": 0.5691963273905253, + "learning_rate": 3.375420677183859e-06, + "loss": 0.4857, + "step": 2114 + }, + { + "epoch": 0.9, + "grad_norm": 0.556084686207028, + "learning_rate": 3.3736604489977465e-06, + "loss": 0.4756, + "step": 2115 + }, + { + "epoch": 0.9, + "grad_norm": 0.5453441170478283, + "learning_rate": 3.3718997273085883e-06, + "loss": 0.4633, + "step": 2116 + }, + { + "epoch": 0.9, + "grad_norm": 0.5336519493865814, + "learning_rate": 3.3701385131109617e-06, + "loss": 0.4587, + "step": 2117 + }, + { + "epoch": 0.9, + "grad_norm": 0.54183517888334, + "learning_rate": 3.368376807399719e-06, + "loss": 0.4897, + "step": 2118 + }, + { + "epoch": 0.91, + "grad_norm": 0.537940362868582, + "learning_rate": 3.3666146111699926e-06, + "loss": 0.4613, + "step": 2119 + }, + { + "epoch": 0.91, + "grad_norm": 0.5229756265690545, + "learning_rate": 3.3648519254171906e-06, + "loss": 0.4689, + "step": 2120 + }, + { + "epoch": 0.91, + "grad_norm": 0.6066336811051063, + "learning_rate": 3.363088751136999e-06, + "loss": 0.5001, + "step": 2121 + }, + { + "epoch": 0.91, + "grad_norm": 0.5518196727992455, + "learning_rate": 3.3613250893253794e-06, + "loss": 0.5078, + "step": 2122 + }, + { + "epoch": 0.91, + "grad_norm": 0.5602072507870511, + "learning_rate": 3.3595609409785668e-06, + "loss": 0.479, + "step": 2123 + }, + { + "epoch": 0.91, + "grad_norm": 0.5609819914853433, + "learning_rate": 3.357796307093074e-06, + "loss": 0.4798, + "step": 2124 + }, + { + "epoch": 0.91, + "grad_norm": 0.540124268427881, + "learning_rate": 3.3560311886656855e-06, + "loss": 0.4707, + "step": 2125 + }, + { + "epoch": 0.91, + "grad_norm": 0.5273549257492, + "learning_rate": 3.3542655866934613e-06, + "loss": 0.451, + "step": 2126 + }, + { + "epoch": 0.91, + "grad_norm": 0.5257257097173317, + "learning_rate": 3.352499502173734e-06, + "loss": 0.4422, + "step": 2127 + }, + { + "epoch": 0.91, + "grad_norm": 0.5242099348991289, + "learning_rate": 3.350732936104108e-06, + "loss": 0.4553, + "step": 2128 + }, + { + "epoch": 0.91, + "grad_norm": 0.5871633066769245, + "learning_rate": 3.3489658894824614e-06, + "loss": 0.4706, + "step": 2129 + }, + { + "epoch": 0.91, + "grad_norm": 0.5558589637343108, + "learning_rate": 3.3471983633069414e-06, + "loss": 0.4506, + "step": 2130 + }, + { + "epoch": 0.91, + "eval_loss": 0.4703803062438965, + "eval_runtime": 6929.62, + "eval_samples_per_second": 41.905, + "eval_steps_per_second": 2.095, + "step": 2130 + }, + { + "epoch": 0.91, + "grad_norm": 0.538163796065571, + "learning_rate": 3.3454303585759684e-06, + "loss": 0.4616, + "step": 2131 + }, + { + "epoch": 0.91, + "grad_norm": 0.6059394097926508, + "learning_rate": 3.3436618762882322e-06, + "loss": 0.4803, + "step": 2132 + }, + { + "epoch": 0.91, + "grad_norm": 1.0796137145313163, + "learning_rate": 3.3418929174426918e-06, + "loss": 0.4908, + "step": 2133 + }, + { + "epoch": 0.91, + "grad_norm": 0.5577675000716914, + "learning_rate": 3.3401234830385753e-06, + "loss": 0.457, + "step": 2134 + }, + { + "epoch": 0.91, + "grad_norm": 0.5830147544071842, + "learning_rate": 3.3383535740753813e-06, + "loss": 0.4715, + "step": 2135 + }, + { + "epoch": 0.91, + "grad_norm": 0.5860738051813335, + "learning_rate": 3.336583191552876e-06, + "loss": 0.4749, + "step": 2136 + }, + { + "epoch": 0.91, + "grad_norm": 0.5718370942498533, + "learning_rate": 3.334812336471089e-06, + "loss": 0.462, + "step": 2137 + }, + { + "epoch": 0.91, + "grad_norm": 0.5677796507381937, + "learning_rate": 3.3330410098303224e-06, + "loss": 0.4658, + "step": 2138 + }, + { + "epoch": 0.91, + "grad_norm": 0.5660363318045506, + "learning_rate": 3.3312692126311424e-06, + "loss": 0.4654, + "step": 2139 + }, + { + "epoch": 0.91, + "grad_norm": 0.5529225478270423, + "learning_rate": 3.32949694587438e-06, + "loss": 0.4853, + "step": 2140 + }, + { + "epoch": 0.91, + "grad_norm": 0.558224467702825, + "learning_rate": 3.3277242105611334e-06, + "loss": 0.4635, + "step": 2141 + }, + { + "epoch": 0.92, + "grad_norm": 0.5895844984799891, + "learning_rate": 3.3259510076927644e-06, + "loss": 0.4831, + "step": 2142 + }, + { + "epoch": 0.92, + "grad_norm": 0.5474505234053266, + "learning_rate": 3.324177338270898e-06, + "loss": 0.4723, + "step": 2143 + }, + { + "epoch": 0.92, + "grad_norm": 0.5565011263756835, + "learning_rate": 3.322403203297424e-06, + "loss": 0.4524, + "step": 2144 + }, + { + "epoch": 0.92, + "grad_norm": 0.653742264632734, + "learning_rate": 3.320628603774496e-06, + "loss": 0.4539, + "step": 2145 + }, + { + "epoch": 0.92, + "grad_norm": 0.5762670304100043, + "learning_rate": 3.3188535407045274e-06, + "loss": 0.4779, + "step": 2146 + }, + { + "epoch": 0.92, + "grad_norm": 0.549175111157888, + "learning_rate": 3.317078015090197e-06, + "loss": 0.5013, + "step": 2147 + }, + { + "epoch": 0.92, + "grad_norm": 0.5736471852199507, + "learning_rate": 3.315302027934441e-06, + "loss": 0.4816, + "step": 2148 + }, + { + "epoch": 0.92, + "grad_norm": 0.557613887379806, + "learning_rate": 3.313525580240459e-06, + "loss": 0.4756, + "step": 2149 + }, + { + "epoch": 0.92, + "grad_norm": 0.5468948120560473, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.4753, + "step": 2150 + }, + { + "epoch": 0.92, + "grad_norm": 0.5267251356037557, + "learning_rate": 3.309971307251911e-06, + "loss": 0.4672, + "step": 2151 + }, + { + "epoch": 0.92, + "grad_norm": 0.5661351158225999, + "learning_rate": 3.3081934839650404e-06, + "loss": 0.4607, + "step": 2152 + }, + { + "epoch": 0.92, + "grad_norm": 0.5777629397975975, + "learning_rate": 3.3064152041553356e-06, + "loss": 0.4451, + "step": 2153 + }, + { + "epoch": 0.92, + "grad_norm": 0.5700643158197273, + "learning_rate": 3.304636468827288e-06, + "loss": 0.4695, + "step": 2154 + }, + { + "epoch": 0.92, + "grad_norm": 0.5615418059663319, + "learning_rate": 3.3028572789856507e-06, + "loss": 0.469, + "step": 2155 + }, + { + "epoch": 0.92, + "grad_norm": 0.5871509589578386, + "learning_rate": 3.30107763563543e-06, + "loss": 0.4827, + "step": 2156 + }, + { + "epoch": 0.92, + "grad_norm": 0.5558568799307586, + "learning_rate": 3.299297539781891e-06, + "loss": 0.4716, + "step": 2157 + }, + { + "epoch": 0.92, + "grad_norm": 0.5545006329981151, + "learning_rate": 3.2975169924305524e-06, + "loss": 0.4463, + "step": 2158 + }, + { + "epoch": 0.92, + "grad_norm": 3.548575566071066, + "learning_rate": 3.29573599458719e-06, + "loss": 0.4889, + "step": 2159 + }, + { + "epoch": 0.92, + "grad_norm": 0.5213212611415827, + "learning_rate": 3.2939545472578314e-06, + "loss": 0.4867, + "step": 2160 + }, + { + "epoch": 0.92, + "grad_norm": 0.5843877680577335, + "learning_rate": 3.292172651448761e-06, + "loss": 0.4658, + "step": 2161 + }, + { + "epoch": 0.92, + "grad_norm": 0.5710485476550279, + "learning_rate": 3.290390308166515e-06, + "loss": 0.4722, + "step": 2162 + }, + { + "epoch": 0.92, + "grad_norm": 0.5539562494433544, + "learning_rate": 3.2886075184178817e-06, + "loss": 0.4885, + "step": 2163 + }, + { + "epoch": 0.92, + "grad_norm": 0.6485074098343331, + "learning_rate": 3.2868242832099034e-06, + "loss": 0.471, + "step": 2164 + }, + { + "epoch": 0.92, + "grad_norm": 0.6332549066175103, + "learning_rate": 3.285040603549872e-06, + "loss": 0.4821, + "step": 2165 + }, + { + "epoch": 0.93, + "grad_norm": 0.5894833989878483, + "learning_rate": 3.2832564804453327e-06, + "loss": 0.4571, + "step": 2166 + }, + { + "epoch": 0.93, + "grad_norm": 0.5558289841136125, + "learning_rate": 3.281471914904079e-06, + "loss": 0.4744, + "step": 2167 + }, + { + "epoch": 0.93, + "grad_norm": 0.5809032359736926, + "learning_rate": 3.2796869079341555e-06, + "loss": 0.4868, + "step": 2168 + }, + { + "epoch": 0.93, + "grad_norm": 0.5635833248240615, + "learning_rate": 3.2779014605438563e-06, + "loss": 0.4736, + "step": 2169 + }, + { + "epoch": 0.93, + "grad_norm": 0.5668615337342484, + "learning_rate": 3.276115573741724e-06, + "loss": 0.4737, + "step": 2170 + }, + { + "epoch": 0.93, + "grad_norm": 0.5702039104950186, + "learning_rate": 3.274329248536548e-06, + "loss": 0.459, + "step": 2171 + }, + { + "epoch": 0.93, + "grad_norm": 0.5220961360234252, + "learning_rate": 3.272542485937369e-06, + "loss": 0.4656, + "step": 2172 + }, + { + "epoch": 0.93, + "grad_norm": 0.5594033494520262, + "learning_rate": 3.270755286953471e-06, + "loss": 0.4814, + "step": 2173 + }, + { + "epoch": 0.93, + "grad_norm": 0.5741756929068141, + "learning_rate": 3.2689676525943854e-06, + "loss": 0.4815, + "step": 2174 + }, + { + "epoch": 0.93, + "grad_norm": 0.5464966733679589, + "learning_rate": 3.267179583869892e-06, + "loss": 0.4593, + "step": 2175 + }, + { + "epoch": 0.93, + "grad_norm": 0.5721831411264321, + "learning_rate": 3.265391081790012e-06, + "loss": 0.4761, + "step": 2176 + }, + { + "epoch": 0.93, + "grad_norm": 0.5456090495324294, + "learning_rate": 3.2636021473650143e-06, + "loss": 0.4875, + "step": 2177 + }, + { + "epoch": 0.93, + "grad_norm": 0.5463724345795404, + "learning_rate": 3.2618127816054117e-06, + "loss": 0.4726, + "step": 2178 + }, + { + "epoch": 0.93, + "grad_norm": 0.5367573084821134, + "learning_rate": 3.2600229855219595e-06, + "loss": 0.473, + "step": 2179 + }, + { + "epoch": 0.93, + "grad_norm": 0.5855822548291622, + "learning_rate": 3.2582327601256567e-06, + "loss": 0.4782, + "step": 2180 + }, + { + "epoch": 0.93, + "grad_norm": 0.5565384228650471, + "learning_rate": 3.256442106427745e-06, + "loss": 0.457, + "step": 2181 + }, + { + "epoch": 0.93, + "grad_norm": 0.5386204130175131, + "learning_rate": 3.254651025439707e-06, + "loss": 0.4684, + "step": 2182 + }, + { + "epoch": 0.93, + "grad_norm": 0.5334182218422864, + "learning_rate": 3.252859518173269e-06, + "loss": 0.4645, + "step": 2183 + }, + { + "epoch": 0.93, + "grad_norm": 0.5715830265673014, + "learning_rate": 3.251067585640395e-06, + "loss": 0.4772, + "step": 2184 + }, + { + "epoch": 0.93, + "grad_norm": 0.5741015269134933, + "learning_rate": 3.249275228853292e-06, + "loss": 0.4627, + "step": 2185 + }, + { + "epoch": 0.93, + "grad_norm": 0.5672462313761777, + "learning_rate": 3.247482448824405e-06, + "loss": 0.4903, + "step": 2186 + }, + { + "epoch": 0.93, + "grad_norm": 0.5373081330698016, + "learning_rate": 3.245689246566418e-06, + "loss": 0.4816, + "step": 2187 + }, + { + "epoch": 0.93, + "grad_norm": 0.5476617186268243, + "learning_rate": 3.243895623092254e-06, + "loss": 0.4886, + "step": 2188 + }, + { + "epoch": 0.94, + "grad_norm": 0.5356939267254818, + "learning_rate": 3.2421015794150755e-06, + "loss": 0.4765, + "step": 2189 + }, + { + "epoch": 0.94, + "grad_norm": 0.5494604506543436, + "learning_rate": 3.240307116548279e-06, + "loss": 0.5035, + "step": 2190 + }, + { + "epoch": 0.94, + "grad_norm": 0.5930523473140368, + "learning_rate": 3.2385122355055004e-06, + "loss": 0.4962, + "step": 2191 + }, + { + "epoch": 0.94, + "grad_norm": 0.602008763063395, + "learning_rate": 3.2367169373006114e-06, + "loss": 0.4892, + "step": 2192 + }, + { + "epoch": 0.94, + "grad_norm": 0.5651243722247239, + "learning_rate": 3.234921222947718e-06, + "loss": 0.4829, + "step": 2193 + }, + { + "epoch": 0.94, + "grad_norm": 0.5385560583261362, + "learning_rate": 3.2331250934611623e-06, + "loss": 0.482, + "step": 2194 + }, + { + "epoch": 0.94, + "grad_norm": 0.5547490494472022, + "learning_rate": 3.231328549855522e-06, + "loss": 0.4612, + "step": 2195 + }, + { + "epoch": 0.94, + "grad_norm": 0.5470853885746786, + "learning_rate": 3.2295315931456057e-06, + "loss": 0.4593, + "step": 2196 + }, + { + "epoch": 0.94, + "grad_norm": 0.5786280065830184, + "learning_rate": 3.227734224346458e-06, + "loss": 0.4769, + "step": 2197 + }, + { + "epoch": 0.94, + "grad_norm": 0.5344368582425154, + "learning_rate": 3.2259364444733567e-06, + "loss": 0.466, + "step": 2198 + }, + { + "epoch": 0.94, + "grad_norm": 0.5555074280873257, + "learning_rate": 3.2241382545418087e-06, + "loss": 0.4648, + "step": 2199 + }, + { + "epoch": 0.94, + "grad_norm": 0.5482269284810969, + "learning_rate": 3.222339655567556e-06, + "loss": 0.4769, + "step": 2200 + }, + { + "epoch": 0.94, + "grad_norm": 0.5667532379426644, + "learning_rate": 3.2205406485665693e-06, + "loss": 0.4695, + "step": 2201 + }, + { + "epoch": 0.94, + "eval_loss": 0.46910035610198975, + "eval_runtime": 6929.3666, + "eval_samples_per_second": 41.907, + "eval_steps_per_second": 2.095, + "step": 2201 + }, + { + "epoch": 0.94, + "grad_norm": 0.5516775609671802, + "learning_rate": 3.2187412345550493e-06, + "loss": 0.4626, + "step": 2202 + }, + { + "epoch": 0.94, + "grad_norm": 0.5274995673676252, + "learning_rate": 3.2169414145494306e-06, + "loss": 0.4542, + "step": 2203 + }, + { + "epoch": 0.94, + "grad_norm": 0.5594394034476171, + "learning_rate": 3.2151411895663713e-06, + "loss": 0.5077, + "step": 2204 + }, + { + "epoch": 0.94, + "grad_norm": 0.5592366706538346, + "learning_rate": 3.2133405606227636e-06, + "loss": 0.4836, + "step": 2205 + }, + { + "epoch": 0.94, + "grad_norm": 0.5598764473379652, + "learning_rate": 3.2115395287357247e-06, + "loss": 0.4808, + "step": 2206 + }, + { + "epoch": 0.94, + "grad_norm": 0.5593396084233587, + "learning_rate": 3.2097380949226004e-06, + "loss": 0.4692, + "step": 2207 + }, + { + "epoch": 0.94, + "grad_norm": 0.5338314791656622, + "learning_rate": 3.2079362602009633e-06, + "loss": 0.4823, + "step": 2208 + }, + { + "epoch": 0.94, + "grad_norm": 0.530750479189892, + "learning_rate": 3.2061340255886135e-06, + "loss": 0.4787, + "step": 2209 + }, + { + "epoch": 0.94, + "grad_norm": 0.5710624896170967, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.4536, + "step": 2210 + }, + { + "epoch": 0.94, + "grad_norm": 0.5731148940480776, + "learning_rate": 3.2025283607640985e-06, + "loss": 0.4813, + "step": 2211 + }, + { + "epoch": 0.95, + "grad_norm": 0.5946357271072944, + "learning_rate": 3.200724932588659e-06, + "loss": 0.4819, + "step": 2212 + }, + { + "epoch": 0.95, + "grad_norm": 0.5728672004460359, + "learning_rate": 3.1989211085959558e-06, + "loss": 0.4919, + "step": 2213 + }, + { + "epoch": 0.95, + "grad_norm": 0.5568909639263977, + "learning_rate": 3.197116889804913e-06, + "loss": 0.4864, + "step": 2214 + }, + { + "epoch": 0.95, + "grad_norm": 0.5378921515155873, + "learning_rate": 3.1953122772346757e-06, + "loss": 0.4708, + "step": 2215 + }, + { + "epoch": 0.95, + "grad_norm": 0.5540326517638553, + "learning_rate": 3.193507271904612e-06, + "loss": 0.4961, + "step": 2216 + }, + { + "epoch": 0.95, + "grad_norm": 0.5264231160255753, + "learning_rate": 3.191701874834312e-06, + "loss": 0.4335, + "step": 2217 + }, + { + "epoch": 0.95, + "grad_norm": 0.5506768071233391, + "learning_rate": 3.1898960870435875e-06, + "loss": 0.4793, + "step": 2218 + }, + { + "epoch": 0.95, + "grad_norm": 0.5399042085543564, + "learning_rate": 3.1880899095524698e-06, + "loss": 0.4538, + "step": 2219 + }, + { + "epoch": 0.95, + "grad_norm": 0.5470743327622264, + "learning_rate": 3.1862833433812137e-06, + "loss": 0.4624, + "step": 2220 + }, + { + "epoch": 0.95, + "grad_norm": 0.5366708239822617, + "learning_rate": 3.1844763895502876e-06, + "loss": 0.4735, + "step": 2221 + }, + { + "epoch": 0.95, + "grad_norm": 0.5593381531413127, + "learning_rate": 3.1826690490803846e-06, + "loss": 0.4577, + "step": 2222 + }, + { + "epoch": 0.95, + "grad_norm": 0.5380013310652985, + "learning_rate": 3.180861322992414e-06, + "loss": 0.4526, + "step": 2223 + }, + { + "epoch": 0.95, + "grad_norm": 0.5415307669206325, + "learning_rate": 3.179053212307502e-06, + "loss": 0.484, + "step": 2224 + }, + { + "epoch": 0.95, + "grad_norm": 0.5610218584434514, + "learning_rate": 3.1772447180469934e-06, + "loss": 0.4998, + "step": 2225 + }, + { + "epoch": 0.95, + "grad_norm": 0.5407354208309232, + "learning_rate": 3.1754358412324483e-06, + "loss": 0.4603, + "step": 2226 + }, + { + "epoch": 0.95, + "grad_norm": 0.557457231879274, + "learning_rate": 3.173626582885645e-06, + "loss": 0.4657, + "step": 2227 + }, + { + "epoch": 0.95, + "grad_norm": 0.5712441602450918, + "learning_rate": 3.1718169440285763e-06, + "loss": 0.4893, + "step": 2228 + }, + { + "epoch": 0.95, + "grad_norm": 0.5433914547191268, + "learning_rate": 3.1700069256834478e-06, + "loss": 0.503, + "step": 2229 + }, + { + "epoch": 0.95, + "grad_norm": 0.529494115478655, + "learning_rate": 3.1681965288726825e-06, + "loss": 0.4474, + "step": 2230 + }, + { + "epoch": 0.95, + "grad_norm": 0.5572186853202381, + "learning_rate": 3.166385754618917e-06, + "loss": 0.4572, + "step": 2231 + }, + { + "epoch": 0.95, + "grad_norm": 0.5258044718918907, + "learning_rate": 3.1645746039449987e-06, + "loss": 0.4461, + "step": 2232 + }, + { + "epoch": 0.95, + "grad_norm": 0.5420554964039433, + "learning_rate": 3.16276307787399e-06, + "loss": 0.4553, + "step": 2233 + }, + { + "epoch": 0.95, + "grad_norm": 0.5405232645917257, + "learning_rate": 3.1609511774291646e-06, + "loss": 0.4557, + "step": 2234 + }, + { + "epoch": 0.95, + "grad_norm": 0.5847047282231195, + "learning_rate": 3.1591389036340064e-06, + "loss": 0.4527, + "step": 2235 + }, + { + "epoch": 0.96, + "grad_norm": 0.5423932667911106, + "learning_rate": 3.157326257512212e-06, + "loss": 0.457, + "step": 2236 + }, + { + "epoch": 0.96, + "grad_norm": 0.5692656034737968, + "learning_rate": 3.1555132400876877e-06, + "loss": 0.4943, + "step": 2237 + }, + { + "epoch": 0.96, + "grad_norm": 0.5413038156860533, + "learning_rate": 3.15369985238455e-06, + "loss": 0.4516, + "step": 2238 + }, + { + "epoch": 0.96, + "grad_norm": 0.5381862150816308, + "learning_rate": 3.151886095427123e-06, + "loss": 0.4753, + "step": 2239 + }, + { + "epoch": 0.96, + "grad_norm": 0.5727293376987612, + "learning_rate": 3.1500719702399406e-06, + "loss": 0.5059, + "step": 2240 + }, + { + "epoch": 0.96, + "grad_norm": 0.5805006268410494, + "learning_rate": 3.1482574778477447e-06, + "loss": 0.4524, + "step": 2241 + }, + { + "epoch": 0.96, + "grad_norm": 0.528343362595312, + "learning_rate": 3.146442619275486e-06, + "loss": 0.471, + "step": 2242 + }, + { + "epoch": 0.96, + "grad_norm": 0.5677192912627839, + "learning_rate": 3.1446273955483173e-06, + "loss": 0.4462, + "step": 2243 + }, + { + "epoch": 0.96, + "grad_norm": 0.569474282068865, + "learning_rate": 3.142811807691603e-06, + "loss": 0.4663, + "step": 2244 + }, + { + "epoch": 0.96, + "grad_norm": 0.5669828732246334, + "learning_rate": 3.1409958567309114e-06, + "loss": 0.4694, + "step": 2245 + }, + { + "epoch": 0.96, + "grad_norm": 0.529668789550494, + "learning_rate": 3.1391795436920136e-06, + "loss": 0.4679, + "step": 2246 + }, + { + "epoch": 0.96, + "grad_norm": 0.5397155007947799, + "learning_rate": 3.1373628696008883e-06, + "loss": 0.4856, + "step": 2247 + }, + { + "epoch": 0.96, + "grad_norm": 0.5495612328617239, + "learning_rate": 3.1355458354837183e-06, + "loss": 0.4909, + "step": 2248 + }, + { + "epoch": 0.96, + "grad_norm": 0.5196306467160406, + "learning_rate": 3.133728442366885e-06, + "loss": 0.4525, + "step": 2249 + }, + { + "epoch": 0.96, + "grad_norm": 0.5278171503733031, + "learning_rate": 3.1319106912769797e-06, + "loss": 0.4458, + "step": 2250 + }, + { + "epoch": 0.96, + "grad_norm": 0.5621344379861554, + "learning_rate": 3.13009258324079e-06, + "loss": 0.4288, + "step": 2251 + }, + { + "epoch": 0.96, + "grad_norm": 0.5340342660151414, + "learning_rate": 3.128274119285309e-06, + "loss": 0.449, + "step": 2252 + }, + { + "epoch": 0.96, + "grad_norm": 0.5771753795948189, + "learning_rate": 3.1264553004377285e-06, + "loss": 0.493, + "step": 2253 + }, + { + "epoch": 0.96, + "grad_norm": 0.6109163801950218, + "learning_rate": 3.1246361277254405e-06, + "loss": 0.5092, + "step": 2254 + }, + { + "epoch": 0.96, + "grad_norm": 0.5464493547347894, + "learning_rate": 3.122816602176039e-06, + "loss": 0.4831, + "step": 2255 + }, + { + "epoch": 0.96, + "grad_norm": 0.5388061503869606, + "learning_rate": 3.1209967248173167e-06, + "loss": 0.474, + "step": 2256 + }, + { + "epoch": 0.96, + "grad_norm": 0.5671598890046339, + "learning_rate": 3.119176496677263e-06, + "loss": 0.4911, + "step": 2257 + }, + { + "epoch": 0.96, + "grad_norm": 0.5456085685153534, + "learning_rate": 3.1173559187840683e-06, + "loss": 0.4614, + "step": 2258 + }, + { + "epoch": 0.97, + "grad_norm": 0.54513522335301, + "learning_rate": 3.115534992166119e-06, + "loss": 0.4692, + "step": 2259 + }, + { + "epoch": 0.97, + "grad_norm": 0.5262225869939646, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.4568, + "step": 2260 + }, + { + "epoch": 0.97, + "grad_norm": 0.5473252038967218, + "learning_rate": 3.111892096870487e-06, + "loss": 0.4611, + "step": 2261 + }, + { + "epoch": 0.97, + "grad_norm": 0.5599766804442933, + "learning_rate": 3.1100701302505586e-06, + "loss": 0.4753, + "step": 2262 + }, + { + "epoch": 0.97, + "grad_norm": 0.5756313356809942, + "learning_rate": 3.1082478190213872e-06, + "loss": 0.4845, + "step": 2263 + }, + { + "epoch": 0.97, + "grad_norm": 0.5480923324133978, + "learning_rate": 3.106425164212338e-06, + "loss": 0.487, + "step": 2264 + }, + { + "epoch": 0.97, + "grad_norm": 0.5315541819290027, + "learning_rate": 3.1046021668529684e-06, + "loss": 0.4702, + "step": 2265 + }, + { + "epoch": 0.97, + "grad_norm": 0.5476930314697923, + "learning_rate": 3.1027788279730343e-06, + "loss": 0.4817, + "step": 2266 + }, + { + "epoch": 0.97, + "grad_norm": 0.544515736247213, + "learning_rate": 3.1009551486024814e-06, + "loss": 0.4587, + "step": 2267 + }, + { + "epoch": 0.97, + "grad_norm": 0.5703546973497986, + "learning_rate": 3.099131129771448e-06, + "loss": 0.4617, + "step": 2268 + }, + { + "epoch": 0.97, + "grad_norm": 0.5424534272905377, + "learning_rate": 3.0973067725102636e-06, + "loss": 0.4484, + "step": 2269 + }, + { + "epoch": 0.97, + "grad_norm": 0.5733254545058311, + "learning_rate": 3.0954820778494516e-06, + "loss": 0.49, + "step": 2270 + }, + { + "epoch": 0.97, + "grad_norm": 0.5844361084620427, + "learning_rate": 3.093657046819722e-06, + "loss": 0.4734, + "step": 2271 + }, + { + "epoch": 0.97, + "grad_norm": 0.5574279308944012, + "learning_rate": 3.0918316804519784e-06, + "loss": 0.4574, + "step": 2272 + }, + { + "epoch": 0.97, + "eval_loss": 0.467759370803833, + "eval_runtime": 6932.3606, + "eval_samples_per_second": 41.889, + "eval_steps_per_second": 2.095, + "step": 2272 + }, + { + "epoch": 0.97, + "grad_norm": 0.546522412221111, + "learning_rate": 3.0900059797773114e-06, + "loss": 0.4886, + "step": 2273 + }, + { + "epoch": 0.97, + "grad_norm": 0.553571296978123, + "learning_rate": 3.0881799458270005e-06, + "loss": 0.4543, + "step": 2274 + }, + { + "epoch": 0.97, + "grad_norm": 0.5237411938748142, + "learning_rate": 3.0863535796325173e-06, + "loss": 0.4556, + "step": 2275 + }, + { + "epoch": 0.97, + "grad_norm": 0.544211874410188, + "learning_rate": 3.0845268822255155e-06, + "loss": 0.4775, + "step": 2276 + }, + { + "epoch": 0.97, + "grad_norm": 0.5661742324216515, + "learning_rate": 3.0826998546378385e-06, + "loss": 0.4602, + "step": 2277 + }, + { + "epoch": 0.97, + "grad_norm": 0.5241341056853029, + "learning_rate": 3.080872497901518e-06, + "loss": 0.4673, + "step": 2278 + }, + { + "epoch": 0.97, + "grad_norm": 0.5483140951807146, + "learning_rate": 3.079044813048768e-06, + "loss": 0.4986, + "step": 2279 + }, + { + "epoch": 0.97, + "grad_norm": 0.5515056093607098, + "learning_rate": 3.0772168011119894e-06, + "loss": 0.4472, + "step": 2280 + }, + { + "epoch": 0.97, + "grad_norm": 0.5594300479805302, + "learning_rate": 3.0753884631237706e-06, + "loss": 0.4762, + "step": 2281 + }, + { + "epoch": 0.97, + "grad_norm": 0.5326582325370033, + "learning_rate": 3.073559800116879e-06, + "loss": 0.4652, + "step": 2282 + }, + { + "epoch": 0.98, + "grad_norm": 0.5354171305776954, + "learning_rate": 3.0717308131242695e-06, + "loss": 0.4585, + "step": 2283 + }, + { + "epoch": 0.98, + "grad_norm": 0.5414502214146208, + "learning_rate": 3.069901503179079e-06, + "loss": 0.4721, + "step": 2284 + }, + { + "epoch": 0.98, + "grad_norm": 0.5586551837722931, + "learning_rate": 3.068071871314626e-06, + "loss": 0.4283, + "step": 2285 + }, + { + "epoch": 0.98, + "grad_norm": 0.5627424065729489, + "learning_rate": 3.0662419185644117e-06, + "loss": 0.4811, + "step": 2286 + }, + { + "epoch": 0.98, + "grad_norm": 0.5553189229312547, + "learning_rate": 3.0644116459621177e-06, + "loss": 0.4749, + "step": 2287 + }, + { + "epoch": 0.98, + "grad_norm": 0.5758016314417642, + "learning_rate": 3.0625810545416066e-06, + "loss": 0.4696, + "step": 2288 + }, + { + "epoch": 0.98, + "grad_norm": 0.5625491392976549, + "learning_rate": 3.060750145336924e-06, + "loss": 0.4641, + "step": 2289 + }, + { + "epoch": 0.98, + "grad_norm": 0.5325354662459477, + "learning_rate": 3.0589189193822894e-06, + "loss": 0.4501, + "step": 2290 + }, + { + "epoch": 0.98, + "grad_norm": 0.5498616147688943, + "learning_rate": 3.057087377712106e-06, + "loss": 0.4697, + "step": 2291 + }, + { + "epoch": 0.98, + "grad_norm": 0.5402471923474919, + "learning_rate": 3.0552555213609526e-06, + "loss": 0.4746, + "step": 2292 + }, + { + "epoch": 0.98, + "grad_norm": 0.5742344531078118, + "learning_rate": 3.0534233513635863e-06, + "loss": 0.4597, + "step": 2293 + }, + { + "epoch": 0.98, + "grad_norm": 0.5660060095582498, + "learning_rate": 3.0515908687549427e-06, + "loss": 0.4712, + "step": 2294 + }, + { + "epoch": 0.98, + "grad_norm": 0.5449862197515469, + "learning_rate": 3.0497580745701334e-06, + "loss": 0.4658, + "step": 2295 + }, + { + "epoch": 0.98, + "grad_norm": 0.5637723063866732, + "learning_rate": 3.047924969844444e-06, + "loss": 0.4863, + "step": 2296 + }, + { + "epoch": 0.98, + "grad_norm": 0.5495824769869212, + "learning_rate": 3.046091555613339e-06, + "loss": 0.4703, + "step": 2297 + }, + { + "epoch": 0.98, + "grad_norm": 0.5382416875131586, + "learning_rate": 3.0442578329124545e-06, + "loss": 0.4584, + "step": 2298 + }, + { + "epoch": 0.98, + "grad_norm": 0.5463125942933166, + "learning_rate": 3.042423802777602e-06, + "loss": 0.4772, + "step": 2299 + }, + { + "epoch": 0.98, + "grad_norm": 0.5597328954873347, + "learning_rate": 3.0405894662447682e-06, + "loss": 0.4639, + "step": 2300 + }, + { + "epoch": 0.98, + "grad_norm": 0.5559561549207039, + "learning_rate": 3.038754824350111e-06, + "loss": 0.4847, + "step": 2301 + }, + { + "epoch": 0.98, + "grad_norm": 0.548445944370762, + "learning_rate": 3.0369198781299615e-06, + "loss": 0.47, + "step": 2302 + }, + { + "epoch": 0.98, + "grad_norm": 0.5292092956017875, + "learning_rate": 3.0350846286208223e-06, + "loss": 0.4515, + "step": 2303 + }, + { + "epoch": 0.98, + "grad_norm": 0.5674894268853128, + "learning_rate": 3.0332490768593676e-06, + "loss": 0.4685, + "step": 2304 + }, + { + "epoch": 0.98, + "grad_norm": 0.5336634810588987, + "learning_rate": 3.0314132238824416e-06, + "loss": 0.4581, + "step": 2305 + }, + { + "epoch": 0.99, + "grad_norm": 0.5533286565854473, + "learning_rate": 3.029577070727061e-06, + "loss": 0.4477, + "step": 2306 + }, + { + "epoch": 0.99, + "grad_norm": 0.5680828232808299, + "learning_rate": 3.027740618430409e-06, + "loss": 0.4584, + "step": 2307 + }, + { + "epoch": 0.99, + "grad_norm": 0.5469367387429259, + "learning_rate": 3.0259038680298403e-06, + "loss": 0.4566, + "step": 2308 + }, + { + "epoch": 0.99, + "grad_norm": 0.575161918224578, + "learning_rate": 3.0240668205628757e-06, + "loss": 0.4918, + "step": 2309 + }, + { + "epoch": 0.99, + "grad_norm": 0.5576124636437852, + "learning_rate": 3.0222294770672054e-06, + "loss": 0.4358, + "step": 2310 + }, + { + "epoch": 0.99, + "grad_norm": 0.5802410529155669, + "learning_rate": 3.0203918385806874e-06, + "loss": 0.4712, + "step": 2311 + }, + { + "epoch": 0.99, + "grad_norm": 0.5305957990981474, + "learning_rate": 3.018553906141343e-06, + "loss": 0.4656, + "step": 2312 + }, + { + "epoch": 0.99, + "grad_norm": 0.5503984759313564, + "learning_rate": 3.0167156807873637e-06, + "loss": 0.4711, + "step": 2313 + }, + { + "epoch": 0.99, + "grad_norm": 0.5629590646395772, + "learning_rate": 3.014877163557105e-06, + "loss": 0.4604, + "step": 2314 + }, + { + "epoch": 0.99, + "grad_norm": 0.5731070115431295, + "learning_rate": 3.013038355489086e-06, + "loss": 0.4543, + "step": 2315 + }, + { + "epoch": 0.99, + "grad_norm": 0.5625771430390166, + "learning_rate": 3.0111992576219905e-06, + "loss": 0.4564, + "step": 2316 + }, + { + "epoch": 0.99, + "grad_norm": 0.5811573131231536, + "learning_rate": 3.009359870994668e-06, + "loss": 0.4792, + "step": 2317 + }, + { + "epoch": 0.99, + "grad_norm": 0.5768867352155639, + "learning_rate": 3.0075201966461286e-06, + "loss": 0.4802, + "step": 2318 + }, + { + "epoch": 0.99, + "grad_norm": 0.5833346195049011, + "learning_rate": 3.0056802356155455e-06, + "loss": 0.4669, + "step": 2319 + }, + { + "epoch": 0.99, + "grad_norm": 0.5506865485168906, + "learning_rate": 3.0038399889422553e-06, + "loss": 0.4483, + "step": 2320 + }, + { + "epoch": 0.99, + "grad_norm": 0.549893343788274, + "learning_rate": 3.001999457665754e-06, + "loss": 0.4608, + "step": 2321 + }, + { + "epoch": 0.99, + "grad_norm": 0.5742059606229357, + "learning_rate": 3.0001586428257006e-06, + "loss": 0.4632, + "step": 2322 + }, + { + "epoch": 0.99, + "grad_norm": 0.5710353252614344, + "learning_rate": 2.9983175454619114e-06, + "loss": 0.4906, + "step": 2323 + }, + { + "epoch": 0.99, + "grad_norm": 0.5661140686401654, + "learning_rate": 2.9964761666143638e-06, + "loss": 0.4691, + "step": 2324 + }, + { + "epoch": 0.99, + "grad_norm": 0.5832412124730281, + "learning_rate": 2.9946345073231964e-06, + "loss": 0.4715, + "step": 2325 + }, + { + "epoch": 0.99, + "grad_norm": 0.5322313638153089, + "learning_rate": 2.9927925686287006e-06, + "loss": 0.4397, + "step": 2326 + }, + { + "epoch": 0.99, + "grad_norm": 0.5532684507212673, + "learning_rate": 2.9909503515713324e-06, + "loss": 0.4526, + "step": 2327 + }, + { + "epoch": 0.99, + "grad_norm": 0.5380218677936013, + "learning_rate": 2.9891078571917004e-06, + "loss": 0.4627, + "step": 2328 + }, + { + "epoch": 1.0, + "grad_norm": 0.575331323008352, + "learning_rate": 2.987265086530571e-06, + "loss": 0.4582, + "step": 2329 + }, + { + "epoch": 1.0, + "grad_norm": 0.5720361448465936, + "learning_rate": 2.985422040628867e-06, + "loss": 0.4663, + "step": 2330 + }, + { + "epoch": 1.0, + "grad_norm": 0.54998684465839, + "learning_rate": 2.983578720527667e-06, + "loss": 0.455, + "step": 2331 + }, + { + "epoch": 1.0, + "grad_norm": 0.5552738272333081, + "learning_rate": 2.981735127268202e-06, + "loss": 0.4705, + "step": 2332 + }, + { + "epoch": 1.0, + "grad_norm": 0.552744557893692, + "learning_rate": 2.9798912618918617e-06, + "loss": 0.4633, + "step": 2333 + }, + { + "epoch": 1.0, + "grad_norm": 0.5445751179840383, + "learning_rate": 2.9780471254401868e-06, + "loss": 0.4736, + "step": 2334 + }, + { + "epoch": 1.0, + "grad_norm": 0.5333392734486967, + "learning_rate": 2.976202718954869e-06, + "loss": 0.4777, + "step": 2335 + }, + { + "epoch": 1.0, + "grad_norm": 0.572472547878964, + "learning_rate": 2.9743580434777586e-06, + "loss": 0.487, + "step": 2336 + }, + { + "epoch": 1.0, + "grad_norm": 0.5344393742783472, + "learning_rate": 2.972513100050851e-06, + "loss": 0.4707, + "step": 2337 + }, + { + "epoch": 1.0, + "grad_norm": 0.5630355400902859, + "learning_rate": 2.970667889716298e-06, + "loss": 0.4366, + "step": 2338 + }, + { + "epoch": 1.0, + "grad_norm": 0.5643967515499763, + "learning_rate": 2.9688224135164e-06, + "loss": 0.466, + "step": 2339 + }, + { + "epoch": 1.0, + "grad_norm": 0.5624350508625989, + "learning_rate": 2.9669766724936074e-06, + "loss": 0.4352, + "step": 2340 + }, + { + "epoch": 1.0, + "grad_norm": 0.5863321023005329, + "learning_rate": 2.9651306676905213e-06, + "loss": 0.4963, + "step": 2341 + }, + { + "epoch": 1.0, + "grad_norm": 0.5263776129880875, + "learning_rate": 2.9632844001498908e-06, + "loss": 0.4565, + "step": 2342 + }, + { + "epoch": 1.0, + "grad_norm": 0.5610974245603717, + "learning_rate": 2.9614378709146136e-06, + "loss": 0.4661, + "step": 2343 + }, + { + "epoch": 1.0, + "eval_loss": 0.4664944112300873, + "eval_runtime": 6923.3541, + "eval_samples_per_second": 41.943, + "eval_steps_per_second": 2.097, + "step": 2343 + }, + { + "epoch": 1.0, + "grad_norm": 0.5460765503187325, + "learning_rate": 2.9595910810277367e-06, + "loss": 0.4629, + "step": 2344 + }, + { + "epoch": 1.0, + "grad_norm": 0.5468673482394827, + "learning_rate": 2.957744031532451e-06, + "loss": 0.4583, + "step": 2345 + }, + { + "epoch": 1.0, + "grad_norm": 0.5470321763441149, + "learning_rate": 2.9558967234720976e-06, + "loss": 0.5088, + "step": 2346 + }, + { + "epoch": 1.0, + "grad_norm": 0.5599348363396319, + "learning_rate": 2.9540491578901625e-06, + "loss": 0.4596, + "step": 2347 + }, + { + "epoch": 1.0, + "grad_norm": 0.554159234404833, + "learning_rate": 2.9522013358302754e-06, + "loss": 0.4577, + "step": 2348 + }, + { + "epoch": 1.0, + "grad_norm": 0.6005976463431216, + "learning_rate": 2.9503532583362126e-06, + "loss": 0.468, + "step": 2349 + }, + { + "epoch": 1.0, + "grad_norm": 0.5407554418974608, + "learning_rate": 2.948504926451896e-06, + "loss": 0.4715, + "step": 2350 + }, + { + "epoch": 1.0, + "grad_norm": 0.5744854947285752, + "learning_rate": 2.9466563412213873e-06, + "loss": 0.4941, + "step": 2351 + }, + { + "epoch": 1.0, + "grad_norm": 0.5637411990253361, + "learning_rate": 2.9448075036888944e-06, + "loss": 0.4573, + "step": 2352 + }, + { + "epoch": 1.01, + "grad_norm": 0.5656040322788783, + "learning_rate": 2.942958414898768e-06, + "loss": 0.4586, + "step": 2353 + }, + { + "epoch": 1.01, + "grad_norm": 0.5735314101382244, + "learning_rate": 2.941109075895499e-06, + "loss": 0.4493, + "step": 2354 + }, + { + "epoch": 1.01, + "grad_norm": 0.5467877193091774, + "learning_rate": 2.9392594877237194e-06, + "loss": 0.5006, + "step": 2355 + }, + { + "epoch": 1.01, + "grad_norm": 0.5362633073872797, + "learning_rate": 2.937409651428205e-06, + "loss": 0.4874, + "step": 2356 + }, + { + "epoch": 1.01, + "grad_norm": 0.5325968068801722, + "learning_rate": 2.935559568053867e-06, + "loss": 0.4914, + "step": 2357 + }, + { + "epoch": 1.01, + "grad_norm": 0.5378549381823456, + "learning_rate": 2.93370923864576e-06, + "loss": 0.4607, + "step": 2358 + }, + { + "epoch": 1.01, + "grad_norm": 0.5271154873814673, + "learning_rate": 2.9318586642490766e-06, + "loss": 0.44, + "step": 2359 + }, + { + "epoch": 1.01, + "grad_norm": 0.5924972443494668, + "learning_rate": 2.930007845909146e-06, + "loss": 0.5007, + "step": 2360 + }, + { + "epoch": 1.01, + "grad_norm": 0.5759884987421106, + "learning_rate": 2.9281567846714393e-06, + "loss": 0.4847, + "step": 2361 + }, + { + "epoch": 1.01, + "grad_norm": 0.5452440313358291, + "learning_rate": 2.92630548158156e-06, + "loss": 0.4728, + "step": 2362 + }, + { + "epoch": 1.01, + "grad_norm": 0.5491210743756382, + "learning_rate": 2.924453937685251e-06, + "loss": 0.4814, + "step": 2363 + }, + { + "epoch": 1.01, + "grad_norm": 0.5497417413168237, + "learning_rate": 2.9226021540283914e-06, + "loss": 0.479, + "step": 2364 + }, + { + "epoch": 1.01, + "grad_norm": 0.5440302176322239, + "learning_rate": 2.9207501316569936e-06, + "loss": 0.469, + "step": 2365 + }, + { + "epoch": 1.01, + "grad_norm": 0.5307306470354942, + "learning_rate": 2.918897871617207e-06, + "loss": 0.4494, + "step": 2366 + }, + { + "epoch": 1.01, + "grad_norm": 0.5533694018060741, + "learning_rate": 2.9170453749553158e-06, + "loss": 0.4528, + "step": 2367 + }, + { + "epoch": 1.01, + "grad_norm": 0.5821093073073446, + "learning_rate": 2.9151926427177345e-06, + "loss": 0.4847, + "step": 2368 + }, + { + "epoch": 1.01, + "grad_norm": 0.5810561155735557, + "learning_rate": 2.913339675951014e-06, + "loss": 0.4577, + "step": 2369 + }, + { + "epoch": 1.01, + "grad_norm": 0.5637656659230893, + "learning_rate": 2.911486475701835e-06, + "loss": 0.4705, + "step": 2370 + }, + { + "epoch": 1.01, + "grad_norm": 0.5646507829525883, + "learning_rate": 2.909633043017013e-06, + "loss": 0.4655, + "step": 2371 + }, + { + "epoch": 1.01, + "grad_norm": 0.5560085235238604, + "learning_rate": 2.9077793789434925e-06, + "loss": 0.4645, + "step": 2372 + }, + { + "epoch": 1.01, + "grad_norm": 0.5804591657223265, + "learning_rate": 2.905925484528349e-06, + "loss": 0.4473, + "step": 2373 + }, + { + "epoch": 1.0, + "grad_norm": 0.5183741960616122, + "learning_rate": 2.9040713608187896e-06, + "loss": 0.4561, + "step": 2374 + }, + { + "epoch": 1.0, + "grad_norm": 0.6496654700785708, + "learning_rate": 2.9022170088621497e-06, + "loss": 0.4481, + "step": 2375 + }, + { + "epoch": 1.0, + "grad_norm": 0.6258547954302262, + "learning_rate": 2.900362429705893e-06, + "loss": 0.4322, + "step": 2376 + }, + { + "epoch": 1.0, + "grad_norm": 0.7675149067928225, + "learning_rate": 2.8985076243976133e-06, + "loss": 0.3944, + "step": 2377 + }, + { + "epoch": 1.0, + "grad_norm": 0.6600899912780784, + "learning_rate": 2.896652593985031e-06, + "loss": 0.4245, + "step": 2378 + }, + { + "epoch": 1.0, + "grad_norm": 0.7306824483448988, + "learning_rate": 2.8947973395159934e-06, + "loss": 0.4426, + "step": 2379 + }, + { + "epoch": 1.0, + "grad_norm": 0.6516484002884269, + "learning_rate": 2.892941862038475e-06, + "loss": 0.4297, + "step": 2380 + }, + { + "epoch": 1.0, + "grad_norm": 0.5912395370707995, + "learning_rate": 2.8910861626005774e-06, + "loss": 0.4273, + "step": 2381 + }, + { + "epoch": 1.0, + "grad_norm": 0.6132721088402558, + "learning_rate": 2.889230242250525e-06, + "loss": 0.3981, + "step": 2382 + }, + { + "epoch": 1.0, + "grad_norm": 0.6491777150140556, + "learning_rate": 2.887374102036668e-06, + "loss": 0.4167, + "step": 2383 + }, + { + "epoch": 1.0, + "grad_norm": 0.6143918069720361, + "learning_rate": 2.8855177430074817e-06, + "loss": 0.4215, + "step": 2384 + }, + { + "epoch": 1.0, + "grad_norm": 0.5796986965059269, + "learning_rate": 2.883661166211564e-06, + "loss": 0.4328, + "step": 2385 + }, + { + "epoch": 1.01, + "grad_norm": 0.6385596001339221, + "learning_rate": 2.881804372697637e-06, + "loss": 0.4189, + "step": 2386 + }, + { + "epoch": 1.01, + "grad_norm": 0.6774158174574826, + "learning_rate": 2.879947363514543e-06, + "loss": 0.4257, + "step": 2387 + }, + { + "epoch": 1.01, + "grad_norm": 0.6439324568387629, + "learning_rate": 2.878090139711249e-06, + "loss": 0.4203, + "step": 2388 + }, + { + "epoch": 1.01, + "grad_norm": 0.5928966275845924, + "learning_rate": 2.8762327023368408e-06, + "loss": 0.4118, + "step": 2389 + }, + { + "epoch": 1.01, + "grad_norm": 0.6017304771309222, + "learning_rate": 2.8743750524405254e-06, + "loss": 0.4698, + "step": 2390 + }, + { + "epoch": 1.01, + "grad_norm": 0.5709073058681067, + "learning_rate": 2.872517191071631e-06, + "loss": 0.4357, + "step": 2391 + }, + { + "epoch": 1.01, + "grad_norm": 0.5680125069824045, + "learning_rate": 2.870659119279605e-06, + "loss": 0.4331, + "step": 2392 + }, + { + "epoch": 1.01, + "grad_norm": 0.5694888697631014, + "learning_rate": 2.8688008381140126e-06, + "loss": 0.4225, + "step": 2393 + }, + { + "epoch": 1.01, + "grad_norm": 0.533441546679153, + "learning_rate": 2.866942348624538e-06, + "loss": 0.4106, + "step": 2394 + }, + { + "epoch": 1.01, + "grad_norm": 0.6115603135836928, + "learning_rate": 2.8650836518609814e-06, + "loss": 0.4167, + "step": 2395 + }, + { + "epoch": 1.01, + "grad_norm": 0.6206156844939555, + "learning_rate": 2.863224748873264e-06, + "loss": 0.4153, + "step": 2396 + }, + { + "epoch": 1.01, + "grad_norm": 0.5748859460500882, + "learning_rate": 2.8613656407114197e-06, + "loss": 0.4134, + "step": 2397 + }, + { + "epoch": 1.01, + "grad_norm": 0.5420087579510725, + "learning_rate": 2.8595063284255997e-06, + "loss": 0.42, + "step": 2398 + }, + { + "epoch": 1.01, + "grad_norm": 0.6393458020851205, + "learning_rate": 2.85764681306607e-06, + "loss": 0.4448, + "step": 2399 + }, + { + "epoch": 1.01, + "grad_norm": 0.5570280986330635, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.4192, + "step": 2400 + }, + { + "epoch": 1.01, + "grad_norm": 0.5742816886506813, + "learning_rate": 2.853927177327524e-06, + "loss": 0.4063, + "step": 2401 + }, + { + "epoch": 1.01, + "grad_norm": 0.5631462441989407, + "learning_rate": 2.85206705904961e-06, + "loss": 0.4187, + "step": 2402 + }, + { + "epoch": 1.01, + "grad_norm": 0.5759070659432811, + "learning_rate": 2.850206741900195e-06, + "loss": 0.4294, + "step": 2403 + }, + { + "epoch": 1.01, + "grad_norm": 0.5954714993206741, + "learning_rate": 2.8483462269301117e-06, + "loss": 0.4414, + "step": 2404 + }, + { + "epoch": 1.01, + "grad_norm": 0.5847923954668218, + "learning_rate": 2.8464855151903065e-06, + "loss": 0.4153, + "step": 2405 + }, + { + "epoch": 1.01, + "grad_norm": 0.6128854329193607, + "learning_rate": 2.844624607731836e-06, + "loss": 0.4528, + "step": 2406 + }, + { + "epoch": 1.01, + "grad_norm": 0.5703811176973029, + "learning_rate": 2.842763505605867e-06, + "loss": 0.4222, + "step": 2407 + }, + { + "epoch": 1.01, + "grad_norm": 0.5552066352368414, + "learning_rate": 2.8409022098636797e-06, + "loss": 0.436, + "step": 2408 + }, + { + "epoch": 1.02, + "grad_norm": 0.6157208158352602, + "learning_rate": 2.839040721556658e-06, + "loss": 0.4302, + "step": 2409 + }, + { + "epoch": 1.02, + "grad_norm": 0.5647862801761142, + "learning_rate": 2.837179041736299e-06, + "loss": 0.4083, + "step": 2410 + }, + { + "epoch": 1.02, + "grad_norm": 0.5547355674329157, + "learning_rate": 2.835317171454206e-06, + "loss": 0.4162, + "step": 2411 + }, + { + "epoch": 1.02, + "grad_norm": 0.5766014610774604, + "learning_rate": 2.8334551117620908e-06, + "loss": 0.4226, + "step": 2412 + }, + { + "epoch": 1.02, + "grad_norm": 0.6015597806488204, + "learning_rate": 2.8315928637117713e-06, + "loss": 0.4291, + "step": 2413 + }, + { + "epoch": 1.02, + "grad_norm": 0.5788471843088463, + "learning_rate": 2.829730428355173e-06, + "loss": 0.4251, + "step": 2414 + }, + { + "epoch": 1.02, + "eval_loss": 0.4677947163581848, + "eval_runtime": 6939.1205, + "eval_samples_per_second": 41.848, + "eval_steps_per_second": 2.092, + "step": 2414 + }, + { + "epoch": 1.02, + "grad_norm": 0.586414114042501, + "learning_rate": 2.8278678067443255e-06, + "loss": 0.4145, + "step": 2415 + }, + { + "epoch": 1.02, + "grad_norm": 0.5691685054792884, + "learning_rate": 2.826004999931365e-06, + "loss": 0.4376, + "step": 2416 + }, + { + "epoch": 1.02, + "grad_norm": 0.5972775147112437, + "learning_rate": 2.8241420089685327e-06, + "loss": 0.4306, + "step": 2417 + }, + { + "epoch": 1.02, + "grad_norm": 0.5620380464963803, + "learning_rate": 2.8222788349081724e-06, + "loss": 0.4096, + "step": 2418 + }, + { + "epoch": 1.02, + "grad_norm": 0.5784135735472834, + "learning_rate": 2.820415478802733e-06, + "loss": 0.4322, + "step": 2419 + }, + { + "epoch": 1.02, + "grad_norm": 0.5885392171954246, + "learning_rate": 2.8185519417047624e-06, + "loss": 0.4039, + "step": 2420 + }, + { + "epoch": 1.02, + "grad_norm": 0.5536074618340685, + "learning_rate": 2.8166882246669158e-06, + "loss": 0.4291, + "step": 2421 + }, + { + "epoch": 1.02, + "grad_norm": 0.581909431594722, + "learning_rate": 2.814824328741948e-06, + "loss": 0.4144, + "step": 2422 + }, + { + "epoch": 1.02, + "grad_norm": 0.5675861206607421, + "learning_rate": 2.8129602549827133e-06, + "loss": 0.419, + "step": 2423 + }, + { + "epoch": 1.02, + "grad_norm": 0.5632432133699884, + "learning_rate": 2.811096004442168e-06, + "loss": 0.4252, + "step": 2424 + }, + { + "epoch": 1.02, + "grad_norm": 0.5650569070218456, + "learning_rate": 2.80923157817337e-06, + "loss": 0.4136, + "step": 2425 + }, + { + "epoch": 1.02, + "grad_norm": 0.572161063598354, + "learning_rate": 2.8073669772294714e-06, + "loss": 0.4208, + "step": 2426 + }, + { + "epoch": 1.02, + "grad_norm": 0.5719901179312831, + "learning_rate": 2.805502202663728e-06, + "loss": 0.4375, + "step": 2427 + }, + { + "epoch": 1.02, + "grad_norm": 0.5612496619770833, + "learning_rate": 2.8036372555294916e-06, + "loss": 0.4047, + "step": 2428 + }, + { + "epoch": 1.02, + "grad_norm": 0.5900526811179282, + "learning_rate": 2.8017721368802105e-06, + "loss": 0.4295, + "step": 2429 + }, + { + "epoch": 1.02, + "grad_norm": 0.5789988982356299, + "learning_rate": 2.799906847769433e-06, + "loss": 0.4138, + "step": 2430 + }, + { + "epoch": 1.02, + "grad_norm": 0.5438179046584847, + "learning_rate": 2.7980413892507995e-06, + "loss": 0.408, + "step": 2431 + }, + { + "epoch": 1.02, + "grad_norm": 0.5857502120912527, + "learning_rate": 2.79617576237805e-06, + "loss": 0.406, + "step": 2432 + }, + { + "epoch": 1.03, + "grad_norm": 0.5798751246080152, + "learning_rate": 2.7943099682050174e-06, + "loss": 0.4173, + "step": 2433 + }, + { + "epoch": 1.03, + "grad_norm": 0.5830921819980437, + "learning_rate": 2.7924440077856284e-06, + "loss": 0.4353, + "step": 2434 + }, + { + "epoch": 1.03, + "grad_norm": 0.5550145268591252, + "learning_rate": 2.790577882173906e-06, + "loss": 0.4033, + "step": 2435 + }, + { + "epoch": 1.03, + "grad_norm": 0.5665020917284002, + "learning_rate": 2.788711592423966e-06, + "loss": 0.4282, + "step": 2436 + }, + { + "epoch": 1.03, + "grad_norm": 0.5619717205524801, + "learning_rate": 2.786845139590014e-06, + "loss": 0.4161, + "step": 2437 + }, + { + "epoch": 1.03, + "grad_norm": 0.5647215964187846, + "learning_rate": 2.7849785247263515e-06, + "loss": 0.419, + "step": 2438 + }, + { + "epoch": 1.03, + "grad_norm": 0.6047710310723298, + "learning_rate": 2.7831117488873703e-06, + "loss": 0.4489, + "step": 2439 + }, + { + "epoch": 1.03, + "grad_norm": 0.5729169302378637, + "learning_rate": 2.781244813127552e-06, + "loss": 0.4258, + "step": 2440 + }, + { + "epoch": 1.03, + "grad_norm": 0.5726895367017322, + "learning_rate": 2.779377718501469e-06, + "loss": 0.4163, + "step": 2441 + }, + { + "epoch": 1.03, + "grad_norm": 0.5898478621812457, + "learning_rate": 2.7775104660637847e-06, + "loss": 0.4383, + "step": 2442 + }, + { + "epoch": 1.03, + "grad_norm": 0.5776286383338888, + "learning_rate": 2.77564305686925e-06, + "loss": 0.4172, + "step": 2443 + }, + { + "epoch": 1.03, + "grad_norm": 0.5708128845728648, + "learning_rate": 2.7737754919727057e-06, + "loss": 0.433, + "step": 2444 + }, + { + "epoch": 1.03, + "grad_norm": 0.5838647193488192, + "learning_rate": 2.7719077724290793e-06, + "loss": 0.4319, + "step": 2445 + }, + { + "epoch": 1.03, + "grad_norm": 0.5839908538746356, + "learning_rate": 2.7700398992933865e-06, + "loss": 0.3981, + "step": 2446 + }, + { + "epoch": 1.03, + "grad_norm": 0.5793472385138957, + "learning_rate": 2.76817187362073e-06, + "loss": 0.4075, + "step": 2447 + }, + { + "epoch": 1.03, + "grad_norm": 0.5727590490167658, + "learning_rate": 2.7663036964662967e-06, + "loss": 0.4312, + "step": 2448 + }, + { + "epoch": 1.03, + "grad_norm": 0.5920736043473027, + "learning_rate": 2.764435368885362e-06, + "loss": 0.4138, + "step": 2449 + }, + { + "epoch": 1.03, + "grad_norm": 0.5723166701159088, + "learning_rate": 2.762566891933285e-06, + "loss": 0.4073, + "step": 2450 + }, + { + "epoch": 1.03, + "grad_norm": 0.5624668781788753, + "learning_rate": 2.7606982666655074e-06, + "loss": 0.429, + "step": 2451 + }, + { + "epoch": 1.03, + "grad_norm": 0.5704524501724009, + "learning_rate": 2.758829494137557e-06, + "loss": 0.4196, + "step": 2452 + }, + { + "epoch": 1.03, + "grad_norm": 0.5700116449229535, + "learning_rate": 2.7569605754050455e-06, + "loss": 0.4317, + "step": 2453 + }, + { + "epoch": 1.03, + "grad_norm": 0.599236712281217, + "learning_rate": 2.7550915115236636e-06, + "loss": 0.4087, + "step": 2454 + }, + { + "epoch": 1.03, + "grad_norm": 0.5731708478648513, + "learning_rate": 2.7532223035491877e-06, + "loss": 0.4149, + "step": 2455 + }, + { + "epoch": 1.04, + "grad_norm": 0.5748213937213151, + "learning_rate": 2.7513529525374725e-06, + "loss": 0.4041, + "step": 2456 + }, + { + "epoch": 1.04, + "grad_norm": 0.5736837321137316, + "learning_rate": 2.749483459544457e-06, + "loss": 0.4282, + "step": 2457 + }, + { + "epoch": 1.04, + "grad_norm": 0.5637414256487769, + "learning_rate": 2.7476138256261575e-06, + "loss": 0.3905, + "step": 2458 + }, + { + "epoch": 1.04, + "grad_norm": 0.6013990347615658, + "learning_rate": 2.74574405183867e-06, + "loss": 0.4183, + "step": 2459 + }, + { + "epoch": 1.04, + "grad_norm": 0.5581978111233568, + "learning_rate": 2.743874139238171e-06, + "loss": 0.4325, + "step": 2460 + }, + { + "epoch": 1.04, + "grad_norm": 0.5658199630731812, + "learning_rate": 2.7420040888809153e-06, + "loss": 0.4035, + "step": 2461 + }, + { + "epoch": 1.04, + "grad_norm": 0.5691729761752431, + "learning_rate": 2.740133901823234e-06, + "loss": 0.4171, + "step": 2462 + }, + { + "epoch": 1.04, + "grad_norm": 0.5571943454995789, + "learning_rate": 2.7382635791215368e-06, + "loss": 0.4092, + "step": 2463 + }, + { + "epoch": 1.04, + "grad_norm": 0.5584570649456075, + "learning_rate": 2.7363931218323103e-06, + "loss": 0.4192, + "step": 2464 + }, + { + "epoch": 1.04, + "grad_norm": 0.579128238271553, + "learning_rate": 2.7345225310121155e-06, + "loss": 0.4404, + "step": 2465 + }, + { + "epoch": 1.04, + "grad_norm": 0.5929460640663197, + "learning_rate": 2.7326518077175897e-06, + "loss": 0.4197, + "step": 2466 + }, + { + "epoch": 1.04, + "grad_norm": 0.5689782317669538, + "learning_rate": 2.7307809530054456e-06, + "loss": 0.4316, + "step": 2467 + }, + { + "epoch": 1.04, + "grad_norm": 0.5854607172774103, + "learning_rate": 2.7289099679324686e-06, + "loss": 0.4431, + "step": 2468 + }, + { + "epoch": 1.04, + "grad_norm": 0.5694064345867531, + "learning_rate": 2.7270388535555207e-06, + "loss": 0.4229, + "step": 2469 + }, + { + "epoch": 1.04, + "grad_norm": 0.5694897278258253, + "learning_rate": 2.725167610931534e-06, + "loss": 0.4122, + "step": 2470 + }, + { + "epoch": 1.04, + "grad_norm": 0.5799667414945328, + "learning_rate": 2.7232962411175128e-06, + "loss": 0.4227, + "step": 2471 + }, + { + "epoch": 1.04, + "grad_norm": 0.5880421037268969, + "learning_rate": 2.721424745170537e-06, + "loss": 0.4239, + "step": 2472 + }, + { + "epoch": 1.04, + "grad_norm": 0.5645195680029945, + "learning_rate": 2.719553124147753e-06, + "loss": 0.4131, + "step": 2473 + }, + { + "epoch": 1.04, + "grad_norm": 0.5549310632780857, + "learning_rate": 2.717681379106381e-06, + "loss": 0.4221, + "step": 2474 + }, + { + "epoch": 1.04, + "grad_norm": 0.5724363209257333, + "learning_rate": 2.715809511103711e-06, + "loss": 0.4331, + "step": 2475 + }, + { + "epoch": 1.04, + "grad_norm": 0.5646087777604074, + "learning_rate": 2.7139375211971e-06, + "loss": 0.4285, + "step": 2476 + }, + { + "epoch": 1.04, + "grad_norm": 0.5589589680565078, + "learning_rate": 2.712065410443977e-06, + "loss": 0.4225, + "step": 2477 + }, + { + "epoch": 1.04, + "grad_norm": 0.5975042188527351, + "learning_rate": 2.710193179901838e-06, + "loss": 0.4475, + "step": 2478 + }, + { + "epoch": 1.04, + "grad_norm": 0.5632223523751451, + "learning_rate": 2.7083208306282455e-06, + "loss": 0.4195, + "step": 2479 + }, + { + "epoch": 1.05, + "grad_norm": 0.6271794519728584, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.4383, + "step": 2480 + }, + { + "epoch": 1.05, + "grad_norm": 0.6193844438411981, + "learning_rate": 2.7045757801172918e-06, + "loss": 0.4405, + "step": 2481 + }, + { + "epoch": 1.05, + "grad_norm": 0.5762357913780729, + "learning_rate": 2.70270308099539e-06, + "loss": 0.4256, + "step": 2482 + }, + { + "epoch": 1.05, + "grad_norm": 0.5686615359961465, + "learning_rate": 2.7008302673729556e-06, + "loss": 0.4228, + "step": 2483 + }, + { + "epoch": 1.05, + "grad_norm": 0.5846035613411025, + "learning_rate": 2.6989573403078793e-06, + "loss": 0.4424, + "step": 2484 + }, + { + "epoch": 1.05, + "grad_norm": 0.57106132811842, + "learning_rate": 2.69708430085812e-06, + "loss": 0.4249, + "step": 2485 + }, + { + "epoch": 1.05, + "eval_loss": 0.4666215181350708, + "eval_runtime": 6941.4114, + "eval_samples_per_second": 41.834, + "eval_steps_per_second": 2.092, + "step": 2485 + }, + { + "epoch": 1.05, + "grad_norm": 0.5599313713923679, + "learning_rate": 2.6952111500816972e-06, + "loss": 0.4247, + "step": 2486 + }, + { + "epoch": 1.05, + "grad_norm": 0.5633818405700421, + "learning_rate": 2.6933378890366945e-06, + "loss": 0.4292, + "step": 2487 + }, + { + "epoch": 1.05, + "grad_norm": 0.5687785418378354, + "learning_rate": 2.6914645187812573e-06, + "loss": 0.4238, + "step": 2488 + }, + { + "epoch": 1.05, + "grad_norm": 0.5882193651407125, + "learning_rate": 2.6895910403735938e-06, + "loss": 0.4105, + "step": 2489 + }, + { + "epoch": 1.05, + "grad_norm": 0.5745904096533959, + "learning_rate": 2.687717454871971e-06, + "loss": 0.4307, + "step": 2490 + }, + { + "epoch": 1.05, + "grad_norm": 0.5600088359405079, + "learning_rate": 2.6858437633347197e-06, + "loss": 0.4149, + "step": 2491 + }, + { + "epoch": 1.05, + "grad_norm": 0.5900737321936005, + "learning_rate": 2.6839699668202275e-06, + "loss": 0.4426, + "step": 2492 + }, + { + "epoch": 1.05, + "grad_norm": 0.5993093534443086, + "learning_rate": 2.682096066386943e-06, + "loss": 0.4079, + "step": 2493 + }, + { + "epoch": 1.05, + "grad_norm": 0.573184519185299, + "learning_rate": 2.680222063093372e-06, + "loss": 0.4052, + "step": 2494 + }, + { + "epoch": 1.05, + "grad_norm": 0.5618782823660212, + "learning_rate": 2.678347957998081e-06, + "loss": 0.4364, + "step": 2495 + }, + { + "epoch": 1.05, + "grad_norm": 0.584859311676964, + "learning_rate": 2.6764737521596917e-06, + "loss": 0.4052, + "step": 2496 + }, + { + "epoch": 1.05, + "grad_norm": 0.5477234902368038, + "learning_rate": 2.6745994466368846e-06, + "loss": 0.4302, + "step": 2497 + }, + { + "epoch": 1.05, + "grad_norm": 0.6063996852367219, + "learning_rate": 2.672725042488393e-06, + "loss": 0.4181, + "step": 2498 + }, + { + "epoch": 1.05, + "grad_norm": 0.5760512448701784, + "learning_rate": 2.6708505407730106e-06, + "loss": 0.4134, + "step": 2499 + }, + { + "epoch": 1.05, + "grad_norm": 0.5519722703829206, + "learning_rate": 2.6689759425495833e-06, + "loss": 0.4222, + "step": 2500 + }, + { + "epoch": 1.05, + "grad_norm": 0.5683672052752755, + "learning_rate": 2.6671012488770104e-06, + "loss": 0.4268, + "step": 2501 + }, + { + "epoch": 1.05, + "grad_norm": 0.5877420100279226, + "learning_rate": 2.6652264608142487e-06, + "loss": 0.4178, + "step": 2502 + }, + { + "epoch": 1.06, + "grad_norm": 0.607133857685082, + "learning_rate": 2.663351579420307e-06, + "loss": 0.417, + "step": 2503 + }, + { + "epoch": 1.06, + "grad_norm": 0.5772639249289018, + "learning_rate": 2.661476605754244e-06, + "loss": 0.4377, + "step": 2504 + }, + { + "epoch": 1.06, + "grad_norm": 0.6163851599426933, + "learning_rate": 2.659601540875174e-06, + "loss": 0.4059, + "step": 2505 + }, + { + "epoch": 1.06, + "grad_norm": 0.6150547368429744, + "learning_rate": 2.6577263858422623e-06, + "loss": 0.4473, + "step": 2506 + }, + { + "epoch": 1.06, + "grad_norm": 0.6277681834624624, + "learning_rate": 2.6558511417147225e-06, + "loss": 0.4312, + "step": 2507 + }, + { + "epoch": 1.06, + "grad_norm": 0.5576110114818135, + "learning_rate": 2.653975809551823e-06, + "loss": 0.415, + "step": 2508 + }, + { + "epoch": 1.06, + "grad_norm": 0.5603219178351719, + "learning_rate": 2.6521003904128772e-06, + "loss": 0.4021, + "step": 2509 + }, + { + "epoch": 1.06, + "grad_norm": 0.5842308669890593, + "learning_rate": 2.650224885357251e-06, + "loss": 0.4377, + "step": 2510 + }, + { + "epoch": 1.06, + "grad_norm": 0.6027928815424282, + "learning_rate": 2.648349295444358e-06, + "loss": 0.4326, + "step": 2511 + }, + { + "epoch": 1.06, + "grad_norm": 0.6056130362871082, + "learning_rate": 2.646473621733658e-06, + "loss": 0.4253, + "step": 2512 + }, + { + "epoch": 1.06, + "grad_norm": 0.6243077090025687, + "learning_rate": 2.6445978652846605e-06, + "loss": 0.4478, + "step": 2513 + }, + { + "epoch": 1.06, + "grad_norm": 0.6116645505267284, + "learning_rate": 2.6427220271569206e-06, + "loss": 0.4576, + "step": 2514 + }, + { + "epoch": 1.06, + "grad_norm": 0.5491346480936342, + "learning_rate": 2.640846108410039e-06, + "loss": 0.3972, + "step": 2515 + }, + { + "epoch": 1.06, + "grad_norm": 0.5681342786794281, + "learning_rate": 2.6389701101036635e-06, + "loss": 0.4351, + "step": 2516 + }, + { + "epoch": 1.06, + "grad_norm": 0.5866245904284294, + "learning_rate": 2.6370940332974864e-06, + "loss": 0.4201, + "step": 2517 + }, + { + "epoch": 1.06, + "grad_norm": 0.5501785457503201, + "learning_rate": 2.6352178790512425e-06, + "loss": 0.3985, + "step": 2518 + }, + { + "epoch": 1.06, + "grad_norm": 0.5783704626903385, + "learning_rate": 2.6333416484247126e-06, + "loss": 0.4582, + "step": 2519 + }, + { + "epoch": 1.06, + "grad_norm": 0.5601581473403204, + "learning_rate": 2.6314653424777194e-06, + "loss": 0.4502, + "step": 2520 + }, + { + "epoch": 1.06, + "grad_norm": 0.5776661184654673, + "learning_rate": 2.6295889622701287e-06, + "loss": 0.3946, + "step": 2521 + }, + { + "epoch": 1.06, + "grad_norm": 0.6070561840129438, + "learning_rate": 2.6277125088618496e-06, + "loss": 0.4253, + "step": 2522 + }, + { + "epoch": 1.06, + "grad_norm": 0.5860572342161754, + "learning_rate": 2.6258359833128284e-06, + "loss": 0.4134, + "step": 2523 + }, + { + "epoch": 1.06, + "grad_norm": 0.5814356096505094, + "learning_rate": 2.623959386683056e-06, + "loss": 0.4144, + "step": 2524 + }, + { + "epoch": 1.06, + "grad_norm": 0.5645239452943359, + "learning_rate": 2.6220827200325628e-06, + "loss": 0.4072, + "step": 2525 + }, + { + "epoch": 1.07, + "grad_norm": 0.5555072889719695, + "learning_rate": 2.620205984421418e-06, + "loss": 0.4115, + "step": 2526 + }, + { + "epoch": 1.07, + "grad_norm": 0.5857812167489317, + "learning_rate": 2.618329180909728e-06, + "loss": 0.3889, + "step": 2527 + }, + { + "epoch": 1.07, + "grad_norm": 0.5540712238808715, + "learning_rate": 2.6164523105576436e-06, + "loss": 0.4143, + "step": 2528 + }, + { + "epoch": 1.07, + "grad_norm": 0.5726575598093935, + "learning_rate": 2.614575374425345e-06, + "loss": 0.4062, + "step": 2529 + }, + { + "epoch": 1.07, + "grad_norm": 0.580473478944349, + "learning_rate": 2.612698373573056e-06, + "loss": 0.4155, + "step": 2530 + }, + { + "epoch": 1.07, + "grad_norm": 0.5921217133427389, + "learning_rate": 2.6108213090610352e-06, + "loss": 0.4462, + "step": 2531 + }, + { + "epoch": 1.07, + "grad_norm": 0.5801087342197929, + "learning_rate": 2.608944181949575e-06, + "loss": 0.4099, + "step": 2532 + }, + { + "epoch": 1.07, + "grad_norm": 0.5749901636296175, + "learning_rate": 2.607066993299007e-06, + "loss": 0.4093, + "step": 2533 + }, + { + "epoch": 1.07, + "grad_norm": 0.6164831721085331, + "learning_rate": 2.6051897441696926e-06, + "loss": 0.4283, + "step": 2534 + }, + { + "epoch": 1.07, + "grad_norm": 0.5874464536341129, + "learning_rate": 2.603312435622033e-06, + "loss": 0.4446, + "step": 2535 + }, + { + "epoch": 1.07, + "grad_norm": 0.5662147896859149, + "learning_rate": 2.6014350687164598e-06, + "loss": 0.4194, + "step": 2536 + }, + { + "epoch": 1.07, + "grad_norm": 0.5495971494931703, + "learning_rate": 2.5995576445134364e-06, + "loss": 0.4057, + "step": 2537 + }, + { + "epoch": 1.07, + "grad_norm": 0.5722070713750189, + "learning_rate": 2.5976801640734604e-06, + "loss": 0.4355, + "step": 2538 + }, + { + "epoch": 1.07, + "grad_norm": 0.5594258522910831, + "learning_rate": 2.595802628457063e-06, + "loss": 0.4238, + "step": 2539 + }, + { + "epoch": 1.07, + "grad_norm": 0.5700766032916892, + "learning_rate": 2.593925038724802e-06, + "loss": 0.4286, + "step": 2540 + }, + { + "epoch": 1.07, + "grad_norm": 0.5778449462446359, + "learning_rate": 2.5920473959372695e-06, + "loss": 0.4337, + "step": 2541 + }, + { + "epoch": 1.07, + "grad_norm": 0.584060965567766, + "learning_rate": 2.5901697011550857e-06, + "loss": 0.4259, + "step": 2542 + }, + { + "epoch": 1.07, + "grad_norm": 0.563752954121171, + "learning_rate": 2.5882919554389007e-06, + "loss": 0.4309, + "step": 2543 + }, + { + "epoch": 1.07, + "grad_norm": 0.5899104452933682, + "learning_rate": 2.586414159849394e-06, + "loss": 0.4461, + "step": 2544 + }, + { + "epoch": 1.07, + "grad_norm": 0.5820330641642761, + "learning_rate": 2.5845363154472725e-06, + "loss": 0.4301, + "step": 2545 + }, + { + "epoch": 1.07, + "grad_norm": 0.5446163156937063, + "learning_rate": 2.5826584232932707e-06, + "loss": 0.3957, + "step": 2546 + }, + { + "epoch": 1.07, + "grad_norm": 0.5945376021090512, + "learning_rate": 2.5807804844481506e-06, + "loss": 0.4091, + "step": 2547 + }, + { + "epoch": 1.07, + "grad_norm": 0.5638002168999431, + "learning_rate": 2.5789024999727e-06, + "loss": 0.414, + "step": 2548 + }, + { + "epoch": 1.07, + "grad_norm": 0.5951090126233547, + "learning_rate": 2.577024470927732e-06, + "loss": 0.4237, + "step": 2549 + }, + { + "epoch": 1.08, + "grad_norm": 0.5684255566290534, + "learning_rate": 2.575146398374087e-06, + "loss": 0.4431, + "step": 2550 + }, + { + "epoch": 1.08, + "grad_norm": 0.5595853495309726, + "learning_rate": 2.5732682833726274e-06, + "loss": 0.4221, + "step": 2551 + }, + { + "epoch": 1.08, + "grad_norm": 0.5742831043384634, + "learning_rate": 2.5713901269842405e-06, + "loss": 0.4285, + "step": 2552 + }, + { + "epoch": 1.08, + "grad_norm": 0.5640089992024232, + "learning_rate": 2.569511930269839e-06, + "loss": 0.4196, + "step": 2553 + }, + { + "epoch": 1.08, + "grad_norm": 0.5572383872005084, + "learning_rate": 2.5676336942903547e-06, + "loss": 0.4015, + "step": 2554 + }, + { + "epoch": 1.08, + "grad_norm": 0.6180725350224047, + "learning_rate": 2.565755420106744e-06, + "loss": 0.4346, + "step": 2555 + }, + { + "epoch": 1.08, + "grad_norm": 0.586931012696472, + "learning_rate": 2.5638771087799857e-06, + "loss": 0.4377, + "step": 2556 + }, + { + "epoch": 1.08, + "eval_loss": 0.46661555767059326, + "eval_runtime": 6940.8186, + "eval_samples_per_second": 41.838, + "eval_steps_per_second": 2.092, + "step": 2556 + }, + { + "epoch": 1.08, + "grad_norm": 0.60890234512916, + "learning_rate": 2.5619987613710757e-06, + "loss": 0.4203, + "step": 2557 + }, + { + "epoch": 1.08, + "grad_norm": 0.6130809794076856, + "learning_rate": 2.5601203789410344e-06, + "loss": 0.3926, + "step": 2558 + }, + { + "epoch": 1.08, + "grad_norm": 0.5758560877978278, + "learning_rate": 2.5582419625509004e-06, + "loss": 0.4193, + "step": 2559 + }, + { + "epoch": 1.08, + "grad_norm": 0.5611835821629346, + "learning_rate": 2.5563635132617305e-06, + "loss": 0.4106, + "step": 2560 + }, + { + "epoch": 1.08, + "grad_norm": 0.5500032992402379, + "learning_rate": 2.5544850321346026e-06, + "loss": 0.4103, + "step": 2561 + }, + { + "epoch": 1.08, + "grad_norm": 0.5991303836828948, + "learning_rate": 2.55260652023061e-06, + "loss": 0.4176, + "step": 2562 + }, + { + "epoch": 1.08, + "grad_norm": 0.5495267980558447, + "learning_rate": 2.550727978610864e-06, + "loss": 0.412, + "step": 2563 + }, + { + "epoch": 1.08, + "grad_norm": 0.5903181299884372, + "learning_rate": 2.5488494083364946e-06, + "loss": 0.4186, + "step": 2564 + }, + { + "epoch": 1.08, + "grad_norm": 0.5775725024291222, + "learning_rate": 2.5469708104686452e-06, + "loss": 0.4017, + "step": 2565 + }, + { + "epoch": 1.08, + "grad_norm": 0.5794156104604055, + "learning_rate": 2.5450921860684765e-06, + "loss": 0.3933, + "step": 2566 + }, + { + "epoch": 1.08, + "grad_norm": 0.5929127352748987, + "learning_rate": 2.543213536197164e-06, + "loss": 0.4474, + "step": 2567 + }, + { + "epoch": 1.08, + "grad_norm": 0.6307527970701059, + "learning_rate": 2.5413348619158966e-06, + "loss": 0.4367, + "step": 2568 + }, + { + "epoch": 1.08, + "grad_norm": 0.5792027225417186, + "learning_rate": 2.5394561642858785e-06, + "loss": 0.4371, + "step": 2569 + }, + { + "epoch": 1.08, + "grad_norm": 0.5781330553234431, + "learning_rate": 2.5375774443683263e-06, + "loss": 0.416, + "step": 2570 + }, + { + "epoch": 1.08, + "grad_norm": 0.574771297321989, + "learning_rate": 2.5356987032244686e-06, + "loss": 0.4075, + "step": 2571 + }, + { + "epoch": 1.08, + "grad_norm": 0.5558908214262773, + "learning_rate": 2.5338199419155473e-06, + "loss": 0.41, + "step": 2572 + }, + { + "epoch": 1.09, + "grad_norm": 0.570022965299663, + "learning_rate": 2.5319411615028144e-06, + "loss": 0.4197, + "step": 2573 + }, + { + "epoch": 1.09, + "grad_norm": 0.5953309626927854, + "learning_rate": 2.530062363047534e-06, + "loss": 0.4489, + "step": 2574 + }, + { + "epoch": 1.09, + "grad_norm": 0.5678767865025237, + "learning_rate": 2.5281835476109796e-06, + "loss": 0.4328, + "step": 2575 + }, + { + "epoch": 1.09, + "grad_norm": 0.5830901884956002, + "learning_rate": 2.5263047162544335e-06, + "loss": 0.4117, + "step": 2576 + }, + { + "epoch": 1.09, + "grad_norm": 0.565926311345139, + "learning_rate": 2.5244258700391888e-06, + "loss": 0.4232, + "step": 2577 + }, + { + "epoch": 1.09, + "grad_norm": 0.5813850996625575, + "learning_rate": 2.522547010026546e-06, + "loss": 0.4351, + "step": 2578 + }, + { + "epoch": 1.09, + "grad_norm": 0.5766636305766427, + "learning_rate": 2.5206681372778126e-06, + "loss": 0.4281, + "step": 2579 + }, + { + "epoch": 1.09, + "grad_norm": 0.547136463994455, + "learning_rate": 2.518789252854305e-06, + "loss": 0.399, + "step": 2580 + }, + { + "epoch": 1.09, + "grad_norm": 0.5673733087045142, + "learning_rate": 2.5169103578173455e-06, + "loss": 0.4429, + "step": 2581 + }, + { + "epoch": 1.09, + "grad_norm": 0.577885776445999, + "learning_rate": 2.5150314532282615e-06, + "loss": 0.4373, + "step": 2582 + }, + { + "epoch": 1.09, + "grad_norm": 0.5681163890434137, + "learning_rate": 2.5131525401483863e-06, + "loss": 0.4314, + "step": 2583 + }, + { + "epoch": 1.09, + "grad_norm": 0.5914855836747088, + "learning_rate": 2.51127361963906e-06, + "loss": 0.4156, + "step": 2584 + }, + { + "epoch": 1.09, + "grad_norm": 0.5695378199879212, + "learning_rate": 2.5093946927616227e-06, + "loss": 0.4137, + "step": 2585 + }, + { + "epoch": 1.09, + "grad_norm": 0.615520565415579, + "learning_rate": 2.507515760577423e-06, + "loss": 0.4202, + "step": 2586 + }, + { + "epoch": 1.09, + "grad_norm": 0.5962868782142557, + "learning_rate": 2.505636824147808e-06, + "loss": 0.4433, + "step": 2587 + }, + { + "epoch": 1.09, + "grad_norm": 0.5642173652784211, + "learning_rate": 2.50375788453413e-06, + "loss": 0.4296, + "step": 2588 + }, + { + "epoch": 1.09, + "grad_norm": 0.5643198205173939, + "learning_rate": 2.501878942797743e-06, + "loss": 0.4163, + "step": 2589 + }, + { + "epoch": 1.09, + "grad_norm": 0.5730693457148456, + "learning_rate": 2.5e-06, + "loss": 0.4184, + "step": 2590 + }, + { + "epoch": 1.09, + "grad_norm": 0.5648030559058376, + "learning_rate": 2.498121057202258e-06, + "loss": 0.4286, + "step": 2591 + }, + { + "epoch": 1.09, + "grad_norm": 0.7310458154947486, + "learning_rate": 2.4962421154658706e-06, + "loss": 0.4552, + "step": 2592 + }, + { + "epoch": 1.09, + "grad_norm": 0.8642590895577826, + "learning_rate": 2.4943631758521924e-06, + "loss": 0.43, + "step": 2593 + }, + { + "epoch": 1.09, + "grad_norm": 0.5957297069878567, + "learning_rate": 2.492484239422578e-06, + "loss": 0.4174, + "step": 2594 + }, + { + "epoch": 1.09, + "grad_norm": 0.5288913445348521, + "learning_rate": 2.4906053072383773e-06, + "loss": 0.4197, + "step": 2595 + }, + { + "epoch": 1.09, + "grad_norm": 0.5669908964180154, + "learning_rate": 2.4887263803609415e-06, + "loss": 0.4228, + "step": 2596 + }, + { + "epoch": 1.1, + "grad_norm": 0.5619854120471871, + "learning_rate": 2.486847459851614e-06, + "loss": 0.4204, + "step": 2597 + }, + { + "epoch": 1.1, + "grad_norm": 0.573591559908782, + "learning_rate": 2.4849685467717397e-06, + "loss": 0.4043, + "step": 2598 + }, + { + "epoch": 1.1, + "grad_norm": 0.5660399643338767, + "learning_rate": 2.4830896421826554e-06, + "loss": 0.4151, + "step": 2599 + }, + { + "epoch": 1.1, + "grad_norm": 0.6061575332078436, + "learning_rate": 2.4812107471456958e-06, + "loss": 0.4261, + "step": 2600 + }, + { + "epoch": 1.1, + "grad_norm": 0.558833085401998, + "learning_rate": 2.479331862722188e-06, + "loss": 0.4253, + "step": 2601 + }, + { + "epoch": 1.1, + "grad_norm": 0.5661726057886762, + "learning_rate": 2.477452989973455e-06, + "loss": 0.4282, + "step": 2602 + }, + { + "epoch": 1.1, + "grad_norm": 0.5765250370061781, + "learning_rate": 2.475574129960812e-06, + "loss": 0.3895, + "step": 2603 + }, + { + "epoch": 1.1, + "grad_norm": 0.6094913200037948, + "learning_rate": 2.4736952837455665e-06, + "loss": 0.4438, + "step": 2604 + }, + { + "epoch": 1.1, + "grad_norm": 0.5958403528694752, + "learning_rate": 2.4718164523890212e-06, + "loss": 0.4276, + "step": 2605 + }, + { + "epoch": 1.1, + "grad_norm": 0.5711855859912395, + "learning_rate": 2.4699376369524665e-06, + "loss": 0.422, + "step": 2606 + }, + { + "epoch": 1.1, + "grad_norm": 0.5827746728273364, + "learning_rate": 2.4680588384971864e-06, + "loss": 0.4094, + "step": 2607 + }, + { + "epoch": 1.1, + "grad_norm": 0.5652093041093819, + "learning_rate": 2.4661800580844535e-06, + "loss": 0.4404, + "step": 2608 + }, + { + "epoch": 1.1, + "grad_norm": 0.5815716319807065, + "learning_rate": 2.4643012967755327e-06, + "loss": 0.4188, + "step": 2609 + }, + { + "epoch": 1.1, + "grad_norm": 0.5667592036448388, + "learning_rate": 2.4624225556316745e-06, + "loss": 0.4103, + "step": 2610 + }, + { + "epoch": 1.1, + "grad_norm": 0.6003054597720718, + "learning_rate": 2.4605438357141223e-06, + "loss": 0.4422, + "step": 2611 + }, + { + "epoch": 1.1, + "grad_norm": 0.5718504015079162, + "learning_rate": 2.458665138084104e-06, + "loss": 0.4057, + "step": 2612 + }, + { + "epoch": 1.1, + "grad_norm": 0.5894624642862655, + "learning_rate": 2.4567864638028374e-06, + "loss": 0.4164, + "step": 2613 + }, + { + "epoch": 1.1, + "grad_norm": 0.6181729940652014, + "learning_rate": 2.4549078139315243e-06, + "loss": 0.4354, + "step": 2614 + }, + { + "epoch": 1.1, + "grad_norm": 0.5831842560896243, + "learning_rate": 2.453029189531356e-06, + "loss": 0.4059, + "step": 2615 + }, + { + "epoch": 1.1, + "grad_norm": 0.5711907705253331, + "learning_rate": 2.451150591663506e-06, + "loss": 0.4161, + "step": 2616 + }, + { + "epoch": 1.1, + "grad_norm": 0.5840971955664861, + "learning_rate": 2.449272021389136e-06, + "loss": 0.4266, + "step": 2617 + }, + { + "epoch": 1.1, + "grad_norm": 0.6151549174106447, + "learning_rate": 2.4473934797693908e-06, + "loss": 0.4451, + "step": 2618 + }, + { + "epoch": 1.1, + "grad_norm": 0.5912926857180598, + "learning_rate": 2.4455149678653982e-06, + "loss": 0.424, + "step": 2619 + }, + { + "epoch": 1.11, + "grad_norm": 0.5690720352177238, + "learning_rate": 2.44363648673827e-06, + "loss": 0.4094, + "step": 2620 + }, + { + "epoch": 1.11, + "grad_norm": 0.5880654368148057, + "learning_rate": 2.4417580374491e-06, + "loss": 0.4374, + "step": 2621 + }, + { + "epoch": 1.11, + "grad_norm": 0.592818300351086, + "learning_rate": 2.439879621058966e-06, + "loss": 0.4392, + "step": 2622 + }, + { + "epoch": 1.11, + "grad_norm": 0.5904613946521675, + "learning_rate": 2.438001238628925e-06, + "loss": 0.421, + "step": 2623 + }, + { + "epoch": 1.11, + "grad_norm": 0.5780768348236618, + "learning_rate": 2.436122891220016e-06, + "loss": 0.412, + "step": 2624 + }, + { + "epoch": 1.11, + "grad_norm": 0.6165111418892802, + "learning_rate": 2.4342445798932563e-06, + "loss": 0.4225, + "step": 2625 + }, + { + "epoch": 1.11, + "grad_norm": 0.5864395429245397, + "learning_rate": 2.4323663057096466e-06, + "loss": 0.4158, + "step": 2626 + }, + { + "epoch": 1.11, + "grad_norm": 0.5790868452034496, + "learning_rate": 2.4304880697301615e-06, + "loss": 0.4018, + "step": 2627 + }, + { + "epoch": 1.11, + "eval_loss": 0.46582120656967163, + "eval_runtime": 6934.7187, + "eval_samples_per_second": 41.875, + "eval_steps_per_second": 2.094, + "step": 2627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5854060000762511, + "learning_rate": 2.42860987301576e-06, + "loss": 0.429, + "step": 2628 + }, + { + "epoch": 1.11, + "grad_norm": 0.5793144094485164, + "learning_rate": 2.4267317166273734e-06, + "loss": 0.4008, + "step": 2629 + }, + { + "epoch": 1.11, + "grad_norm": 0.557725397610062, + "learning_rate": 2.4248536016259137e-06, + "loss": 0.4339, + "step": 2630 + }, + { + "epoch": 1.11, + "grad_norm": 0.5707416944794899, + "learning_rate": 2.422975529072269e-06, + "loss": 0.4144, + "step": 2631 + }, + { + "epoch": 1.11, + "grad_norm": 0.5781786267589856, + "learning_rate": 2.4210975000273005e-06, + "loss": 0.4475, + "step": 2632 + }, + { + "epoch": 1.11, + "grad_norm": 0.5688305465034205, + "learning_rate": 2.41921951555185e-06, + "loss": 0.4145, + "step": 2633 + }, + { + "epoch": 1.11, + "grad_norm": 0.583143118932043, + "learning_rate": 2.4173415767067297e-06, + "loss": 0.4286, + "step": 2634 + }, + { + "epoch": 1.11, + "grad_norm": 0.5507249017882659, + "learning_rate": 2.4154636845527284e-06, + "loss": 0.4137, + "step": 2635 + }, + { + "epoch": 1.11, + "grad_norm": 0.6128066967756481, + "learning_rate": 2.4135858401506066e-06, + "loss": 0.423, + "step": 2636 + }, + { + "epoch": 1.11, + "grad_norm": 0.5984198722869004, + "learning_rate": 2.4117080445611e-06, + "loss": 0.4315, + "step": 2637 + }, + { + "epoch": 1.11, + "grad_norm": 0.566032787321914, + "learning_rate": 2.4098302988449147e-06, + "loss": 0.4614, + "step": 2638 + }, + { + "epoch": 1.11, + "grad_norm": 0.5507111424965526, + "learning_rate": 2.4079526040627318e-06, + "loss": 0.4359, + "step": 2639 + }, + { + "epoch": 1.11, + "grad_norm": 0.5688047320342015, + "learning_rate": 2.4060749612751987e-06, + "loss": 0.4466, + "step": 2640 + }, + { + "epoch": 1.11, + "grad_norm": 0.580032240572495, + "learning_rate": 2.404197371542938e-06, + "loss": 0.4032, + "step": 2641 + }, + { + "epoch": 1.11, + "grad_norm": 0.5820043004603282, + "learning_rate": 2.40231983592654e-06, + "loss": 0.42, + "step": 2642 + }, + { + "epoch": 1.12, + "grad_norm": 0.5788132559135092, + "learning_rate": 2.400442355486564e-06, + "loss": 0.4183, + "step": 2643 + }, + { + "epoch": 1.12, + "grad_norm": 0.5859683700448465, + "learning_rate": 2.398564931283541e-06, + "loss": 0.4123, + "step": 2644 + }, + { + "epoch": 1.12, + "grad_norm": 0.5862017007386782, + "learning_rate": 2.396687564377967e-06, + "loss": 0.4285, + "step": 2645 + }, + { + "epoch": 1.12, + "grad_norm": 0.5925607615927038, + "learning_rate": 2.394810255830308e-06, + "loss": 0.4283, + "step": 2646 + }, + { + "epoch": 1.12, + "grad_norm": 0.5703213873210229, + "learning_rate": 2.3929330067009944e-06, + "loss": 0.4331, + "step": 2647 + }, + { + "epoch": 1.12, + "grad_norm": 0.5429662869911331, + "learning_rate": 2.391055818050426e-06, + "loss": 0.4009, + "step": 2648 + }, + { + "epoch": 1.12, + "grad_norm": 0.5805429721405676, + "learning_rate": 2.389178690938965e-06, + "loss": 0.4191, + "step": 2649 + }, + { + "epoch": 1.12, + "grad_norm": 0.5669011883051497, + "learning_rate": 2.3873016264269446e-06, + "loss": 0.4112, + "step": 2650 + }, + { + "epoch": 1.12, + "grad_norm": 0.567460249220951, + "learning_rate": 2.3854246255746555e-06, + "loss": 0.4315, + "step": 2651 + }, + { + "epoch": 1.12, + "grad_norm": 0.5623656342926924, + "learning_rate": 2.3835476894423577e-06, + "loss": 0.4059, + "step": 2652 + }, + { + "epoch": 1.12, + "grad_norm": 0.5887115656599653, + "learning_rate": 2.3816708190902722e-06, + "loss": 0.4292, + "step": 2653 + }, + { + "epoch": 1.12, + "grad_norm": 0.5801851574567665, + "learning_rate": 2.3797940155785837e-06, + "loss": 0.4377, + "step": 2654 + }, + { + "epoch": 1.12, + "grad_norm": 0.5957884506649995, + "learning_rate": 2.3779172799674377e-06, + "loss": 0.4291, + "step": 2655 + }, + { + "epoch": 1.12, + "grad_norm": 0.5922308503207093, + "learning_rate": 2.376040613316944e-06, + "loss": 0.4219, + "step": 2656 + }, + { + "epoch": 1.12, + "grad_norm": 0.5657227192474519, + "learning_rate": 2.374164016687173e-06, + "loss": 0.4338, + "step": 2657 + }, + { + "epoch": 1.12, + "grad_norm": 0.6033671454063749, + "learning_rate": 2.3722874911381517e-06, + "loss": 0.4554, + "step": 2658 + }, + { + "epoch": 1.12, + "grad_norm": 0.5691097763031018, + "learning_rate": 2.3704110377298717e-06, + "loss": 0.4105, + "step": 2659 + }, + { + "epoch": 1.12, + "grad_norm": 0.5617846566458164, + "learning_rate": 2.368534657522281e-06, + "loss": 0.4439, + "step": 2660 + }, + { + "epoch": 1.12, + "grad_norm": 0.5551937150400084, + "learning_rate": 2.366658351575288e-06, + "loss": 0.4266, + "step": 2661 + }, + { + "epoch": 1.12, + "grad_norm": 0.5726467432147245, + "learning_rate": 2.364782120948758e-06, + "loss": 0.417, + "step": 2662 + }, + { + "epoch": 1.12, + "grad_norm": 0.6092370186148129, + "learning_rate": 2.362905966702515e-06, + "loss": 0.4201, + "step": 2663 + }, + { + "epoch": 1.12, + "grad_norm": 0.5647441929287327, + "learning_rate": 2.3610298898963373e-06, + "loss": 0.4024, + "step": 2664 + }, + { + "epoch": 1.12, + "grad_norm": 0.5612482266868575, + "learning_rate": 2.359153891589962e-06, + "loss": 0.4272, + "step": 2665 + }, + { + "epoch": 1.12, + "grad_norm": 0.5404969509411939, + "learning_rate": 2.35727797284308e-06, + "loss": 0.4097, + "step": 2666 + }, + { + "epoch": 1.13, + "grad_norm": 0.5847499166914444, + "learning_rate": 2.3554021347153403e-06, + "loss": 0.4522, + "step": 2667 + }, + { + "epoch": 1.13, + "grad_norm": 0.5671574895969574, + "learning_rate": 2.3535263782663425e-06, + "loss": 0.4028, + "step": 2668 + }, + { + "epoch": 1.13, + "grad_norm": 0.552780149490217, + "learning_rate": 2.351650704555643e-06, + "loss": 0.4086, + "step": 2669 + }, + { + "epoch": 1.13, + "grad_norm": 0.5803614826769898, + "learning_rate": 2.3497751146427494e-06, + "loss": 0.4155, + "step": 2670 + }, + { + "epoch": 1.13, + "grad_norm": 0.5652376067469718, + "learning_rate": 2.3478996095871228e-06, + "loss": 0.4084, + "step": 2671 + }, + { + "epoch": 1.13, + "grad_norm": 0.5845008843251347, + "learning_rate": 2.3460241904481778e-06, + "loss": 0.4027, + "step": 2672 + }, + { + "epoch": 1.13, + "grad_norm": 0.5799385879129909, + "learning_rate": 2.3441488582852774e-06, + "loss": 0.4268, + "step": 2673 + }, + { + "epoch": 1.13, + "grad_norm": 0.5738670567740277, + "learning_rate": 2.342273614157739e-06, + "loss": 0.4464, + "step": 2674 + }, + { + "epoch": 1.13, + "grad_norm": 0.544552930903809, + "learning_rate": 2.3403984591248265e-06, + "loss": 0.3964, + "step": 2675 + }, + { + "epoch": 1.13, + "grad_norm": 0.6124679370289692, + "learning_rate": 2.3385233942457574e-06, + "loss": 0.4202, + "step": 2676 + }, + { + "epoch": 1.13, + "grad_norm": 0.5610056639464951, + "learning_rate": 2.336648420579694e-06, + "loss": 0.4094, + "step": 2677 + }, + { + "epoch": 1.13, + "grad_norm": 0.5615215146670192, + "learning_rate": 2.3347735391857517e-06, + "loss": 0.3789, + "step": 2678 + }, + { + "epoch": 1.13, + "grad_norm": 0.5472440549240616, + "learning_rate": 2.33289875112299e-06, + "loss": 0.3896, + "step": 2679 + }, + { + "epoch": 1.13, + "grad_norm": 0.5874793154794317, + "learning_rate": 2.3310240574504184e-06, + "loss": 0.4205, + "step": 2680 + }, + { + "epoch": 1.13, + "grad_norm": 0.5738763492928816, + "learning_rate": 2.3291494592269902e-06, + "loss": 0.4152, + "step": 2681 + }, + { + "epoch": 1.13, + "grad_norm": 0.5752705880264377, + "learning_rate": 2.327274957511607e-06, + "loss": 0.4032, + "step": 2682 + }, + { + "epoch": 1.13, + "grad_norm": 0.6094503057397006, + "learning_rate": 2.3254005533631162e-06, + "loss": 0.4271, + "step": 2683 + }, + { + "epoch": 1.13, + "grad_norm": 0.6066369045019371, + "learning_rate": 2.3235262478403082e-06, + "loss": 0.4241, + "step": 2684 + }, + { + "epoch": 1.13, + "grad_norm": 0.5939355951481937, + "learning_rate": 2.3216520420019194e-06, + "loss": 0.3942, + "step": 2685 + }, + { + "epoch": 1.13, + "grad_norm": 0.5616286358779028, + "learning_rate": 2.3197779369066287e-06, + "loss": 0.4052, + "step": 2686 + }, + { + "epoch": 1.13, + "grad_norm": 0.5714073547567622, + "learning_rate": 2.3179039336130588e-06, + "loss": 0.4156, + "step": 2687 + }, + { + "epoch": 1.13, + "grad_norm": 0.5870213755012056, + "learning_rate": 2.3160300331797734e-06, + "loss": 0.419, + "step": 2688 + }, + { + "epoch": 1.13, + "grad_norm": 0.5872084599723584, + "learning_rate": 2.3141562366652816e-06, + "loss": 0.4094, + "step": 2689 + }, + { + "epoch": 1.14, + "grad_norm": 0.5712310144109413, + "learning_rate": 2.3122825451280294e-06, + "loss": 0.4185, + "step": 2690 + }, + { + "epoch": 1.14, + "grad_norm": 0.5974668557328553, + "learning_rate": 2.3104089596264075e-06, + "loss": 0.4271, + "step": 2691 + }, + { + "epoch": 1.14, + "grad_norm": 0.5925710616458487, + "learning_rate": 2.3085354812187436e-06, + "loss": 0.4445, + "step": 2692 + }, + { + "epoch": 1.14, + "grad_norm": 0.574277995467648, + "learning_rate": 2.306662110963307e-06, + "loss": 0.4042, + "step": 2693 + }, + { + "epoch": 1.14, + "grad_norm": 0.5763082849196323, + "learning_rate": 2.3047888499183036e-06, + "loss": 0.417, + "step": 2694 + }, + { + "epoch": 1.14, + "grad_norm": 0.5593667983969396, + "learning_rate": 2.30291569914188e-06, + "loss": 0.412, + "step": 2695 + }, + { + "epoch": 1.14, + "grad_norm": 0.5768991359362865, + "learning_rate": 2.301042659692121e-06, + "loss": 0.4041, + "step": 2696 + }, + { + "epoch": 1.14, + "grad_norm": 0.592377033641057, + "learning_rate": 2.299169732627045e-06, + "loss": 0.3905, + "step": 2697 + }, + { + "epoch": 1.14, + "grad_norm": 0.5978581423236226, + "learning_rate": 2.2972969190046104e-06, + "loss": 0.3998, + "step": 2698 + }, + { + "epoch": 1.14, + "eval_loss": 0.4642565846443176, + "eval_runtime": 6929.6676, + "eval_samples_per_second": 41.905, + "eval_steps_per_second": 2.095, + "step": 2698 + }, + { + "epoch": 1.14, + "grad_norm": 0.5739162693130145, + "learning_rate": 2.2954242198827082e-06, + "loss": 0.4196, + "step": 2699 + }, + { + "epoch": 1.14, + "grad_norm": 0.5844044219912439, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.433, + "step": 2700 + }, + { + "epoch": 1.14, + "grad_norm": 0.5576878153425602, + "learning_rate": 2.2916791693717553e-06, + "loss": 0.4116, + "step": 2701 + }, + { + "epoch": 1.14, + "grad_norm": 0.5975344250607371, + "learning_rate": 2.2898068200981633e-06, + "loss": 0.4239, + "step": 2702 + }, + { + "epoch": 1.14, + "grad_norm": 0.5486064916195111, + "learning_rate": 2.287934589556024e-06, + "loss": 0.3899, + "step": 2703 + }, + { + "epoch": 1.14, + "grad_norm": 0.5954206100798385, + "learning_rate": 2.2860624788029013e-06, + "loss": 0.3948, + "step": 2704 + }, + { + "epoch": 1.14, + "grad_norm": 0.582033730964023, + "learning_rate": 2.2841904888962903e-06, + "loss": 0.4024, + "step": 2705 + }, + { + "epoch": 1.14, + "grad_norm": 0.61366156641062, + "learning_rate": 2.2823186208936205e-06, + "loss": 0.4492, + "step": 2706 + }, + { + "epoch": 1.14, + "grad_norm": 0.5837374648931264, + "learning_rate": 2.280446875852248e-06, + "loss": 0.4394, + "step": 2707 + }, + { + "epoch": 1.14, + "grad_norm": 0.6159725968971995, + "learning_rate": 2.2785752548294637e-06, + "loss": 0.4345, + "step": 2708 + }, + { + "epoch": 1.14, + "grad_norm": 0.561818864484124, + "learning_rate": 2.2767037588824877e-06, + "loss": 0.4053, + "step": 2709 + }, + { + "epoch": 1.14, + "grad_norm": 0.5693511275506383, + "learning_rate": 2.2748323890684664e-06, + "loss": 0.4167, + "step": 2710 + }, + { + "epoch": 1.14, + "grad_norm": 0.5676875307416003, + "learning_rate": 2.2729611464444797e-06, + "loss": 0.4087, + "step": 2711 + }, + { + "epoch": 1.14, + "grad_norm": 0.5865113412123945, + "learning_rate": 2.2710900320675314e-06, + "loss": 0.4044, + "step": 2712 + }, + { + "epoch": 1.14, + "grad_norm": 0.5877911286647893, + "learning_rate": 2.2692190469945557e-06, + "loss": 0.4416, + "step": 2713 + }, + { + "epoch": 1.15, + "grad_norm": 0.5405283913293546, + "learning_rate": 2.267348192282411e-06, + "loss": 0.3994, + "step": 2714 + }, + { + "epoch": 1.15, + "grad_norm": 0.5721166898641502, + "learning_rate": 2.2654774689878862e-06, + "loss": 0.4269, + "step": 2715 + }, + { + "epoch": 1.15, + "grad_norm": 0.5664221347108425, + "learning_rate": 2.2636068781676905e-06, + "loss": 0.4172, + "step": 2716 + }, + { + "epoch": 1.15, + "grad_norm": 0.5609241860387546, + "learning_rate": 2.261736420878464e-06, + "loss": 0.4323, + "step": 2717 + }, + { + "epoch": 1.15, + "grad_norm": 0.5846699602141291, + "learning_rate": 2.2598660981767667e-06, + "loss": 0.4173, + "step": 2718 + }, + { + "epoch": 1.15, + "grad_norm": 0.5487147727790713, + "learning_rate": 2.257995911119086e-06, + "loss": 0.4091, + "step": 2719 + }, + { + "epoch": 1.15, + "grad_norm": 0.5642527354564277, + "learning_rate": 2.2561258607618296e-06, + "loss": 0.4208, + "step": 2720 + }, + { + "epoch": 1.15, + "grad_norm": 0.5596940402216087, + "learning_rate": 2.25425594816133e-06, + "loss": 0.4177, + "step": 2721 + }, + { + "epoch": 1.15, + "grad_norm": 0.5737740293350555, + "learning_rate": 2.2523861743738433e-06, + "loss": 0.4079, + "step": 2722 + }, + { + "epoch": 1.15, + "grad_norm": 0.5542429347678349, + "learning_rate": 2.2505165404555434e-06, + "loss": 0.4177, + "step": 2723 + }, + { + "epoch": 1.15, + "grad_norm": 0.58808254423625, + "learning_rate": 2.248647047462528e-06, + "loss": 0.4558, + "step": 2724 + }, + { + "epoch": 1.15, + "grad_norm": 0.6083740436546075, + "learning_rate": 2.246777696450813e-06, + "loss": 0.4367, + "step": 2725 + }, + { + "epoch": 1.15, + "grad_norm": 0.577052191765705, + "learning_rate": 2.244908488476337e-06, + "loss": 0.4133, + "step": 2726 + }, + { + "epoch": 1.15, + "grad_norm": 0.59097397240932, + "learning_rate": 2.2430394245949553e-06, + "loss": 0.4327, + "step": 2727 + }, + { + "epoch": 1.15, + "grad_norm": 0.5524670352667417, + "learning_rate": 2.2411705058624437e-06, + "loss": 0.4364, + "step": 2728 + }, + { + "epoch": 1.15, + "grad_norm": 0.5889869749199854, + "learning_rate": 2.2393017333344935e-06, + "loss": 0.4389, + "step": 2729 + }, + { + "epoch": 1.15, + "grad_norm": 0.5781616268517972, + "learning_rate": 2.2374331080667168e-06, + "loss": 0.4146, + "step": 2730 + }, + { + "epoch": 1.15, + "grad_norm": 0.5516338673454813, + "learning_rate": 2.235564631114639e-06, + "loss": 0.42, + "step": 2731 + }, + { + "epoch": 1.15, + "grad_norm": 0.5737980509227198, + "learning_rate": 2.2336963035337037e-06, + "loss": 0.4049, + "step": 2732 + }, + { + "epoch": 1.15, + "grad_norm": 0.6099792863000669, + "learning_rate": 2.2318281263792714e-06, + "loss": 0.4329, + "step": 2733 + }, + { + "epoch": 1.15, + "grad_norm": 0.6215436378061963, + "learning_rate": 2.229960100706614e-06, + "loss": 0.4064, + "step": 2734 + }, + { + "epoch": 1.15, + "grad_norm": 0.6070304940895107, + "learning_rate": 2.2280922275709216e-06, + "loss": 0.4172, + "step": 2735 + }, + { + "epoch": 1.15, + "grad_norm": 0.6137903976399841, + "learning_rate": 2.2262245080272947e-06, + "loss": 0.4191, + "step": 2736 + }, + { + "epoch": 1.16, + "grad_norm": 0.589617423215821, + "learning_rate": 2.2243569431307506e-06, + "loss": 0.4236, + "step": 2737 + }, + { + "epoch": 1.16, + "grad_norm": 0.5714692395337472, + "learning_rate": 2.2224895339362153e-06, + "loss": 0.4174, + "step": 2738 + }, + { + "epoch": 1.16, + "grad_norm": 0.566616099477279, + "learning_rate": 2.2206222814985316e-06, + "loss": 0.4172, + "step": 2739 + }, + { + "epoch": 1.16, + "grad_norm": 0.5918370633556324, + "learning_rate": 2.2187551868724487e-06, + "loss": 0.4022, + "step": 2740 + }, + { + "epoch": 1.16, + "grad_norm": 0.5621764463135852, + "learning_rate": 2.2168882511126306e-06, + "loss": 0.4033, + "step": 2741 + }, + { + "epoch": 1.16, + "grad_norm": 0.5602317292982957, + "learning_rate": 2.215021475273649e-06, + "loss": 0.3948, + "step": 2742 + }, + { + "epoch": 1.16, + "grad_norm": 0.5821498763067597, + "learning_rate": 2.213154860409987e-06, + "loss": 0.4342, + "step": 2743 + }, + { + "epoch": 1.16, + "grad_norm": 0.56289957110064, + "learning_rate": 2.211288407576035e-06, + "loss": 0.4224, + "step": 2744 + }, + { + "epoch": 1.16, + "grad_norm": 0.5891254592215949, + "learning_rate": 2.209422117826094e-06, + "loss": 0.4217, + "step": 2745 + }, + { + "epoch": 1.16, + "grad_norm": 0.5729813741965974, + "learning_rate": 2.207555992214372e-06, + "loss": 0.4369, + "step": 2746 + }, + { + "epoch": 1.16, + "grad_norm": 0.5673765067747608, + "learning_rate": 2.2056900317949835e-06, + "loss": 0.4242, + "step": 2747 + }, + { + "epoch": 1.16, + "grad_norm": 0.5690722337898687, + "learning_rate": 2.203824237621951e-06, + "loss": 0.4284, + "step": 2748 + }, + { + "epoch": 1.16, + "grad_norm": 0.5690375147524728, + "learning_rate": 2.2019586107492005e-06, + "loss": 0.4238, + "step": 2749 + }, + { + "epoch": 1.16, + "grad_norm": 0.5507462792620564, + "learning_rate": 2.200093152230568e-06, + "loss": 0.4135, + "step": 2750 + }, + { + "epoch": 1.16, + "grad_norm": 0.5719332339424014, + "learning_rate": 2.1982278631197895e-06, + "loss": 0.4174, + "step": 2751 + }, + { + "epoch": 1.16, + "grad_norm": 0.5874278219304234, + "learning_rate": 2.1963627444705097e-06, + "loss": 0.4053, + "step": 2752 + }, + { + "epoch": 1.16, + "grad_norm": 0.568889174310937, + "learning_rate": 2.1944977973362728e-06, + "loss": 0.4123, + "step": 2753 + }, + { + "epoch": 1.16, + "grad_norm": 0.6038767993949933, + "learning_rate": 2.19263302277053e-06, + "loss": 0.4349, + "step": 2754 + }, + { + "epoch": 1.16, + "grad_norm": 0.5872897799798638, + "learning_rate": 2.190768421826631e-06, + "loss": 0.4083, + "step": 2755 + }, + { + "epoch": 1.16, + "grad_norm": 0.5905095099111151, + "learning_rate": 2.1889039955578327e-06, + "loss": 0.4146, + "step": 2756 + }, + { + "epoch": 1.16, + "grad_norm": 0.5527103292721002, + "learning_rate": 2.1870397450172876e-06, + "loss": 0.4191, + "step": 2757 + }, + { + "epoch": 1.16, + "grad_norm": 0.5818092591622248, + "learning_rate": 2.1851756712580526e-06, + "loss": 0.3905, + "step": 2758 + }, + { + "epoch": 1.16, + "grad_norm": 0.5560986512545047, + "learning_rate": 2.1833117753330847e-06, + "loss": 0.4212, + "step": 2759 + }, + { + "epoch": 1.17, + "grad_norm": 0.583827443059487, + "learning_rate": 2.1814480582952376e-06, + "loss": 0.439, + "step": 2760 + }, + { + "epoch": 1.17, + "grad_norm": 0.6047213755513012, + "learning_rate": 2.1795845211972684e-06, + "loss": 0.4098, + "step": 2761 + }, + { + "epoch": 1.17, + "grad_norm": 0.5719149813855363, + "learning_rate": 2.1777211650918276e-06, + "loss": 0.4436, + "step": 2762 + }, + { + "epoch": 1.17, + "grad_norm": 0.5636987195875225, + "learning_rate": 2.1758579910314677e-06, + "loss": 0.4212, + "step": 2763 + }, + { + "epoch": 1.17, + "grad_norm": 0.5821057683244233, + "learning_rate": 2.1739950000686354e-06, + "loss": 0.4157, + "step": 2764 + }, + { + "epoch": 1.17, + "grad_norm": 0.5763014032864897, + "learning_rate": 2.1721321932556753e-06, + "loss": 0.4167, + "step": 2765 + }, + { + "epoch": 1.17, + "grad_norm": 0.5741607793695896, + "learning_rate": 2.1702695716448276e-06, + "loss": 0.4118, + "step": 2766 + }, + { + "epoch": 1.17, + "grad_norm": 0.583977923198581, + "learning_rate": 2.16840713628823e-06, + "loss": 0.4362, + "step": 2767 + }, + { + "epoch": 1.17, + "grad_norm": 0.567095712657335, + "learning_rate": 2.16654488823791e-06, + "loss": 0.3994, + "step": 2768 + }, + { + "epoch": 1.17, + "grad_norm": 0.5882945712372565, + "learning_rate": 2.164682828545795e-06, + "loss": 0.415, + "step": 2769 + }, + { + "epoch": 1.17, + "eval_loss": 0.4636525809764862, + "eval_runtime": 6931.4457, + "eval_samples_per_second": 41.894, + "eval_steps_per_second": 2.095, + "step": 2769 + }, + { + "epoch": 1.17, + "grad_norm": 0.5800810184022894, + "learning_rate": 2.1628209582637024e-06, + "loss": 0.423, + "step": 2770 + }, + { + "epoch": 1.17, + "grad_norm": 0.5877561785586146, + "learning_rate": 2.160959278443342e-06, + "loss": 0.4191, + "step": 2771 + }, + { + "epoch": 1.17, + "grad_norm": 0.5450021019526348, + "learning_rate": 2.1590977901363215e-06, + "loss": 0.4121, + "step": 2772 + }, + { + "epoch": 1.17, + "grad_norm": 0.586036868971513, + "learning_rate": 2.157236494394133e-06, + "loss": 0.4231, + "step": 2773 + }, + { + "epoch": 1.17, + "grad_norm": 0.6082884207803223, + "learning_rate": 2.155375392268165e-06, + "loss": 0.4347, + "step": 2774 + }, + { + "epoch": 1.17, + "grad_norm": 0.5818956219426625, + "learning_rate": 2.1535144848096943e-06, + "loss": 0.4267, + "step": 2775 + }, + { + "epoch": 1.17, + "grad_norm": 0.5882181261304475, + "learning_rate": 2.1516537730698895e-06, + "loss": 0.4122, + "step": 2776 + }, + { + "epoch": 1.17, + "grad_norm": 0.5855723087856304, + "learning_rate": 2.1497932580998055e-06, + "loss": 0.4196, + "step": 2777 + }, + { + "epoch": 1.17, + "grad_norm": 0.5621462555991732, + "learning_rate": 2.147932940950391e-06, + "loss": 0.4313, + "step": 2778 + }, + { + "epoch": 1.17, + "grad_norm": 0.5619630139053288, + "learning_rate": 2.1460728226724768e-06, + "loss": 0.4171, + "step": 2779 + }, + { + "epoch": 1.17, + "grad_norm": 0.5803428278169429, + "learning_rate": 2.1442129043167877e-06, + "loss": 0.4137, + "step": 2780 + }, + { + "epoch": 1.17, + "grad_norm": 0.6003743965203625, + "learning_rate": 2.1423531869339307e-06, + "loss": 0.4094, + "step": 2781 + }, + { + "epoch": 1.17, + "grad_norm": 0.5838530366206433, + "learning_rate": 2.140493671574402e-06, + "loss": 0.4256, + "step": 2782 + }, + { + "epoch": 1.17, + "grad_norm": 0.5750052865571543, + "learning_rate": 2.138634359288581e-06, + "loss": 0.4182, + "step": 2783 + }, + { + "epoch": 1.18, + "grad_norm": 0.555105523968193, + "learning_rate": 2.1367752511267366e-06, + "loss": 0.4174, + "step": 2784 + }, + { + "epoch": 1.18, + "grad_norm": 0.569042426766486, + "learning_rate": 2.134916348139019e-06, + "loss": 0.4137, + "step": 2785 + }, + { + "epoch": 1.18, + "grad_norm": 0.5907654743125312, + "learning_rate": 2.133057651375463e-06, + "loss": 0.4148, + "step": 2786 + }, + { + "epoch": 1.18, + "grad_norm": 0.5717943994371465, + "learning_rate": 2.1311991618859883e-06, + "loss": 0.4307, + "step": 2787 + }, + { + "epoch": 1.18, + "grad_norm": 0.5562054796755491, + "learning_rate": 2.129340880720395e-06, + "loss": 0.412, + "step": 2788 + }, + { + "epoch": 1.18, + "grad_norm": 0.5745208858282651, + "learning_rate": 2.1274828089283696e-06, + "loss": 0.4315, + "step": 2789 + }, + { + "epoch": 1.18, + "grad_norm": 0.6261475696552913, + "learning_rate": 2.125624947559475e-06, + "loss": 0.4141, + "step": 2790 + }, + { + "epoch": 1.18, + "grad_norm": 0.587239695175737, + "learning_rate": 2.123767297663161e-06, + "loss": 0.4384, + "step": 2791 + }, + { + "epoch": 1.18, + "grad_norm": 0.5583403908079667, + "learning_rate": 2.1219098602887524e-06, + "loss": 0.4215, + "step": 2792 + }, + { + "epoch": 1.18, + "grad_norm": 0.5697359070123751, + "learning_rate": 2.1200526364854583e-06, + "loss": 0.4195, + "step": 2793 + }, + { + "epoch": 1.18, + "grad_norm": 0.5746434077589057, + "learning_rate": 2.118195627302364e-06, + "loss": 0.4397, + "step": 2794 + }, + { + "epoch": 1.18, + "grad_norm": 0.5770878978662568, + "learning_rate": 2.116338833788437e-06, + "loss": 0.4142, + "step": 2795 + }, + { + "epoch": 1.18, + "grad_norm": 0.615557087883441, + "learning_rate": 2.114482256992519e-06, + "loss": 0.4124, + "step": 2796 + }, + { + "epoch": 1.18, + "grad_norm": 0.5665043322437245, + "learning_rate": 2.112625897963333e-06, + "loss": 0.423, + "step": 2797 + }, + { + "epoch": 1.18, + "grad_norm": 0.5598727728785423, + "learning_rate": 2.1107697577494764e-06, + "loss": 0.4014, + "step": 2798 + }, + { + "epoch": 1.18, + "grad_norm": 0.567119767667523, + "learning_rate": 2.1089138373994226e-06, + "loss": 0.4072, + "step": 2799 + }, + { + "epoch": 1.18, + "grad_norm": 0.5672415415140308, + "learning_rate": 2.1070581379615253e-06, + "loss": 0.4336, + "step": 2800 + }, + { + "epoch": 1.18, + "grad_norm": 0.5503253498723211, + "learning_rate": 2.1052026604840066e-06, + "loss": 0.4181, + "step": 2801 + }, + { + "epoch": 1.18, + "grad_norm": 0.5837370512357433, + "learning_rate": 2.10334740601497e-06, + "loss": 0.4014, + "step": 2802 + }, + { + "epoch": 1.18, + "grad_norm": 0.5485492339607526, + "learning_rate": 2.101492375602387e-06, + "loss": 0.3878, + "step": 2803 + }, + { + "epoch": 1.18, + "grad_norm": 0.5901125098812506, + "learning_rate": 2.099637570294108e-06, + "loss": 0.4315, + "step": 2804 + }, + { + "epoch": 1.18, + "grad_norm": 0.5790878839442004, + "learning_rate": 2.0977829911378507e-06, + "loss": 0.4082, + "step": 2805 + }, + { + "epoch": 1.18, + "grad_norm": 0.5486030886397184, + "learning_rate": 2.0959286391812116e-06, + "loss": 0.4131, + "step": 2806 + }, + { + "epoch": 1.19, + "grad_norm": 0.5731467050022245, + "learning_rate": 2.0940745154716516e-06, + "loss": 0.4213, + "step": 2807 + }, + { + "epoch": 1.19, + "grad_norm": 0.563249940690925, + "learning_rate": 2.0922206210565088e-06, + "loss": 0.4275, + "step": 2808 + }, + { + "epoch": 1.19, + "grad_norm": 0.5588267746939021, + "learning_rate": 2.090366956982988e-06, + "loss": 0.4053, + "step": 2809 + }, + { + "epoch": 1.19, + "grad_norm": 0.6276548597898437, + "learning_rate": 2.088513524298165e-06, + "loss": 0.4044, + "step": 2810 + }, + { + "epoch": 1.19, + "grad_norm": 0.5654177414506385, + "learning_rate": 2.086660324048987e-06, + "loss": 0.3936, + "step": 2811 + }, + { + "epoch": 1.19, + "grad_norm": 0.5912299181232848, + "learning_rate": 2.084807357282266e-06, + "loss": 0.4449, + "step": 2812 + }, + { + "epoch": 1.19, + "grad_norm": 0.5934179796428065, + "learning_rate": 2.0829546250446846e-06, + "loss": 0.4316, + "step": 2813 + }, + { + "epoch": 1.19, + "grad_norm": 0.660477044543163, + "learning_rate": 2.0811021283827928e-06, + "loss": 0.4134, + "step": 2814 + }, + { + "epoch": 1.19, + "grad_norm": 0.5785577080236974, + "learning_rate": 2.0792498683430072e-06, + "loss": 0.4289, + "step": 2815 + }, + { + "epoch": 1.19, + "grad_norm": 0.5359940863049542, + "learning_rate": 2.077397845971609e-06, + "loss": 0.3887, + "step": 2816 + }, + { + "epoch": 1.19, + "grad_norm": 0.5579305558402582, + "learning_rate": 2.07554606231475e-06, + "loss": 0.4046, + "step": 2817 + }, + { + "epoch": 1.19, + "grad_norm": 0.5768033221733195, + "learning_rate": 2.0736945184184406e-06, + "loss": 0.4306, + "step": 2818 + }, + { + "epoch": 1.19, + "grad_norm": 0.5574558414654748, + "learning_rate": 2.0718432153285615e-06, + "loss": 0.4266, + "step": 2819 + }, + { + "epoch": 1.19, + "grad_norm": 0.5523510875719263, + "learning_rate": 2.0699921540908542e-06, + "loss": 0.4209, + "step": 2820 + }, + { + "epoch": 1.19, + "grad_norm": 0.586454598962645, + "learning_rate": 2.068141335750925e-06, + "loss": 0.4038, + "step": 2821 + }, + { + "epoch": 1.19, + "grad_norm": 0.5767355936333549, + "learning_rate": 2.0662907613542405e-06, + "loss": 0.4219, + "step": 2822 + }, + { + "epoch": 1.19, + "grad_norm": 0.5748539680981339, + "learning_rate": 2.064440431946133e-06, + "loss": 0.382, + "step": 2823 + }, + { + "epoch": 1.19, + "grad_norm": 0.5642219467594125, + "learning_rate": 2.062590348571796e-06, + "loss": 0.4051, + "step": 2824 + }, + { + "epoch": 1.19, + "grad_norm": 0.5825964555184913, + "learning_rate": 2.0607405122762806e-06, + "loss": 0.4096, + "step": 2825 + }, + { + "epoch": 1.19, + "grad_norm": 0.5629400713760981, + "learning_rate": 2.058890924104502e-06, + "loss": 0.4, + "step": 2826 + }, + { + "epoch": 1.19, + "grad_norm": 0.5463214353524036, + "learning_rate": 2.057041585101232e-06, + "loss": 0.3901, + "step": 2827 + }, + { + "epoch": 1.19, + "grad_norm": 0.5728170268667269, + "learning_rate": 2.0551924963111064e-06, + "loss": 0.4252, + "step": 2828 + }, + { + "epoch": 1.19, + "grad_norm": 0.5543782845949665, + "learning_rate": 2.053343658778613e-06, + "loss": 0.4214, + "step": 2829 + }, + { + "epoch": 1.19, + "grad_norm": 0.58082955878978, + "learning_rate": 2.0514950735481053e-06, + "loss": 0.4137, + "step": 2830 + }, + { + "epoch": 1.2, + "grad_norm": 0.5531229406229621, + "learning_rate": 2.049646741663788e-06, + "loss": 0.4131, + "step": 2831 + }, + { + "epoch": 1.2, + "grad_norm": 0.5541337155553405, + "learning_rate": 2.0477986641697263e-06, + "loss": 0.4068, + "step": 2832 + }, + { + "epoch": 1.2, + "grad_norm": 0.5512293318719139, + "learning_rate": 2.0459508421098383e-06, + "loss": 0.4193, + "step": 2833 + }, + { + "epoch": 1.2, + "grad_norm": 0.6481077588847556, + "learning_rate": 2.0441032765279036e-06, + "loss": 0.4195, + "step": 2834 + }, + { + "epoch": 1.2, + "grad_norm": 0.5553462347610719, + "learning_rate": 2.0422559684675498e-06, + "loss": 0.4131, + "step": 2835 + }, + { + "epoch": 1.2, + "grad_norm": 0.5854214572008192, + "learning_rate": 2.040408918972264e-06, + "loss": 0.4223, + "step": 2836 + }, + { + "epoch": 1.2, + "grad_norm": 0.5524192472655216, + "learning_rate": 2.038562129085387e-06, + "loss": 0.3845, + "step": 2837 + }, + { + "epoch": 1.2, + "grad_norm": 0.56829946589284, + "learning_rate": 2.0367155998501092e-06, + "loss": 0.4013, + "step": 2838 + }, + { + "epoch": 1.2, + "grad_norm": 0.5878475779332522, + "learning_rate": 2.03486933230948e-06, + "loss": 0.4206, + "step": 2839 + }, + { + "epoch": 1.2, + "grad_norm": 0.5637907040986664, + "learning_rate": 2.033023327506393e-06, + "loss": 0.4052, + "step": 2840 + }, + { + "epoch": 1.2, + "eval_loss": 0.4627860486507416, + "eval_runtime": 6936.5234, + "eval_samples_per_second": 41.864, + "eval_steps_per_second": 2.093, + "step": 2840 + }, + { + "epoch": 1.2, + "grad_norm": 0.5782227933222694, + "learning_rate": 2.0311775864836007e-06, + "loss": 0.4184, + "step": 2841 + }, + { + "epoch": 1.2, + "grad_norm": 0.5726678150106956, + "learning_rate": 2.0293321102837023e-06, + "loss": 0.4379, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 0.5653275322857352, + "learning_rate": 2.0274868999491496e-06, + "loss": 0.411, + "step": 2843 + }, + { + "epoch": 1.2, + "grad_norm": 0.6143518059743222, + "learning_rate": 2.0256419565222423e-06, + "loss": 0.4202, + "step": 2844 + }, + { + "epoch": 1.2, + "grad_norm": 0.5533020572133284, + "learning_rate": 2.023797281045132e-06, + "loss": 0.4015, + "step": 2845 + }, + { + "epoch": 1.2, + "grad_norm": 0.5551626428007986, + "learning_rate": 2.0219528745598145e-06, + "loss": 0.4148, + "step": 2846 + }, + { + "epoch": 1.2, + "grad_norm": 0.5889239778826716, + "learning_rate": 2.020108738108139e-06, + "loss": 0.4052, + "step": 2847 + }, + { + "epoch": 1.2, + "grad_norm": 0.6024562268514936, + "learning_rate": 2.0182648727317986e-06, + "loss": 0.4287, + "step": 2848 + }, + { + "epoch": 1.2, + "grad_norm": 0.5746780507322024, + "learning_rate": 2.0164212794723336e-06, + "loss": 0.4165, + "step": 2849 + }, + { + "epoch": 1.2, + "grad_norm": 0.5908540121904199, + "learning_rate": 2.014577959371134e-06, + "loss": 0.4391, + "step": 2850 + }, + { + "epoch": 1.2, + "grad_norm": 0.5820354589271028, + "learning_rate": 2.012734913469429e-06, + "loss": 0.4151, + "step": 2851 + }, + { + "epoch": 1.2, + "grad_norm": 0.5473344454525257, + "learning_rate": 2.0108921428083e-06, + "loss": 0.4279, + "step": 2852 + }, + { + "epoch": 1.2, + "grad_norm": 0.5832936596852468, + "learning_rate": 2.009049648428668e-06, + "loss": 0.4288, + "step": 2853 + }, + { + "epoch": 1.21, + "grad_norm": 0.6084388529390307, + "learning_rate": 2.0072074313713e-06, + "loss": 0.4461, + "step": 2854 + }, + { + "epoch": 1.21, + "grad_norm": 0.5740745870244656, + "learning_rate": 2.0053654926768044e-06, + "loss": 0.4021, + "step": 2855 + }, + { + "epoch": 1.21, + "grad_norm": 0.5515506740065009, + "learning_rate": 2.003523833385637e-06, + "loss": 0.3928, + "step": 2856 + }, + { + "epoch": 1.21, + "grad_norm": 0.5624075725007869, + "learning_rate": 2.0016824545380895e-06, + "loss": 0.4191, + "step": 2857 + }, + { + "epoch": 1.21, + "grad_norm": 0.5863935111984939, + "learning_rate": 1.9998413571743006e-06, + "loss": 0.4224, + "step": 2858 + }, + { + "epoch": 1.21, + "grad_norm": 0.5721820837476251, + "learning_rate": 1.9980005423342462e-06, + "loss": 0.4364, + "step": 2859 + }, + { + "epoch": 1.21, + "grad_norm": 0.5729299550286993, + "learning_rate": 1.996160011057746e-06, + "loss": 0.4112, + "step": 2860 + }, + { + "epoch": 1.21, + "grad_norm": 0.5933829400685822, + "learning_rate": 1.9943197643844554e-06, + "loss": 0.399, + "step": 2861 + }, + { + "epoch": 1.21, + "grad_norm": 0.601305825995989, + "learning_rate": 1.992479803353872e-06, + "loss": 0.4265, + "step": 2862 + }, + { + "epoch": 1.21, + "grad_norm": 0.5692294662214427, + "learning_rate": 1.9906401290053323e-06, + "loss": 0.4071, + "step": 2863 + }, + { + "epoch": 1.21, + "grad_norm": 0.570531202757177, + "learning_rate": 1.9888007423780095e-06, + "loss": 0.4161, + "step": 2864 + }, + { + "epoch": 1.21, + "grad_norm": 0.5765127200524025, + "learning_rate": 1.9869616445109146e-06, + "loss": 0.42, + "step": 2865 + }, + { + "epoch": 1.21, + "grad_norm": 0.5652493468336516, + "learning_rate": 1.985122836442895e-06, + "loss": 0.4447, + "step": 2866 + }, + { + "epoch": 1.21, + "grad_norm": 0.5345921455849048, + "learning_rate": 1.9832843192126367e-06, + "loss": 0.4114, + "step": 2867 + }, + { + "epoch": 1.21, + "grad_norm": 0.553833707765255, + "learning_rate": 1.9814460938586572e-06, + "loss": 0.4192, + "step": 2868 + }, + { + "epoch": 1.21, + "grad_norm": 0.5659115457713205, + "learning_rate": 1.9796081614193143e-06, + "loss": 0.4195, + "step": 2869 + }, + { + "epoch": 1.21, + "grad_norm": 0.5667385460753466, + "learning_rate": 1.9777705229327954e-06, + "loss": 0.4177, + "step": 2870 + }, + { + "epoch": 1.21, + "grad_norm": 0.5807688119962365, + "learning_rate": 1.9759331794371255e-06, + "loss": 0.4015, + "step": 2871 + }, + { + "epoch": 1.21, + "grad_norm": 0.5637812729284788, + "learning_rate": 1.97409613197016e-06, + "loss": 0.4288, + "step": 2872 + }, + { + "epoch": 1.21, + "grad_norm": 0.5695007837691306, + "learning_rate": 1.972259381569592e-06, + "loss": 0.4029, + "step": 2873 + }, + { + "epoch": 1.21, + "grad_norm": 0.5977467234875765, + "learning_rate": 1.9704229292729393e-06, + "loss": 0.4189, + "step": 2874 + }, + { + "epoch": 1.21, + "grad_norm": 0.569978022020393, + "learning_rate": 1.9685867761175584e-06, + "loss": 0.3981, + "step": 2875 + }, + { + "epoch": 1.21, + "grad_norm": 0.5622730363649805, + "learning_rate": 1.9667509231406332e-06, + "loss": 0.4072, + "step": 2876 + }, + { + "epoch": 1.22, + "grad_norm": 0.5772397599288341, + "learning_rate": 1.964915371379178e-06, + "loss": 0.4224, + "step": 2877 + }, + { + "epoch": 1.22, + "grad_norm": 0.5550624772060395, + "learning_rate": 1.9630801218700397e-06, + "loss": 0.4168, + "step": 2878 + }, + { + "epoch": 1.22, + "grad_norm": 0.593384835711817, + "learning_rate": 1.961245175649889e-06, + "loss": 0.4082, + "step": 2879 + }, + { + "epoch": 1.22, + "grad_norm": 0.5648138827206016, + "learning_rate": 1.959410533755232e-06, + "loss": 0.4386, + "step": 2880 + }, + { + "epoch": 1.22, + "grad_norm": 0.590345590782358, + "learning_rate": 1.9575761972223983e-06, + "loss": 0.4318, + "step": 2881 + }, + { + "epoch": 1.22, + "grad_norm": 0.5814485952827325, + "learning_rate": 1.955742167087547e-06, + "loss": 0.4288, + "step": 2882 + }, + { + "epoch": 1.22, + "grad_norm": 0.5833916546222742, + "learning_rate": 1.953908444386662e-06, + "loss": 0.4093, + "step": 2883 + }, + { + "epoch": 1.22, + "grad_norm": 0.599723766340312, + "learning_rate": 1.9520750301555574e-06, + "loss": 0.4177, + "step": 2884 + }, + { + "epoch": 1.22, + "grad_norm": 0.6655917345901515, + "learning_rate": 1.9502419254298674e-06, + "loss": 0.4105, + "step": 2885 + }, + { + "epoch": 1.22, + "grad_norm": 0.5717968178433537, + "learning_rate": 1.9484091312450577e-06, + "loss": 0.4214, + "step": 2886 + }, + { + "epoch": 1.22, + "grad_norm": 0.5844960834468691, + "learning_rate": 1.9465766486364145e-06, + "loss": 0.4249, + "step": 2887 + }, + { + "epoch": 1.22, + "grad_norm": 0.5832680992228242, + "learning_rate": 1.944744478639048e-06, + "loss": 0.4095, + "step": 2888 + }, + { + "epoch": 1.22, + "grad_norm": 0.5814549183663302, + "learning_rate": 1.9429126222878954e-06, + "loss": 0.4303, + "step": 2889 + }, + { + "epoch": 1.22, + "grad_norm": 0.5589845040306947, + "learning_rate": 1.9410810806177105e-06, + "loss": 0.4008, + "step": 2890 + }, + { + "epoch": 1.22, + "grad_norm": 0.5823353970607823, + "learning_rate": 1.9392498546630767e-06, + "loss": 0.4206, + "step": 2891 + }, + { + "epoch": 1.22, + "grad_norm": 0.5773798607020071, + "learning_rate": 1.937418945458393e-06, + "loss": 0.4107, + "step": 2892 + }, + { + "epoch": 1.22, + "grad_norm": 0.5758173233706595, + "learning_rate": 1.935588354037883e-06, + "loss": 0.4299, + "step": 2893 + }, + { + "epoch": 1.22, + "grad_norm": 0.5823417001041828, + "learning_rate": 1.9337580814355887e-06, + "loss": 0.4109, + "step": 2894 + }, + { + "epoch": 1.22, + "grad_norm": 0.5969616099880778, + "learning_rate": 1.931928128685375e-06, + "loss": 0.4005, + "step": 2895 + }, + { + "epoch": 1.22, + "grad_norm": 0.5732522814857013, + "learning_rate": 1.9300984968209215e-06, + "loss": 0.4251, + "step": 2896 + }, + { + "epoch": 1.22, + "grad_norm": 0.6361313903565526, + "learning_rate": 1.928269186875731e-06, + "loss": 0.4294, + "step": 2897 + }, + { + "epoch": 1.22, + "grad_norm": 0.5936983346727575, + "learning_rate": 1.9264401998831213e-06, + "loss": 0.4503, + "step": 2898 + }, + { + "epoch": 1.22, + "grad_norm": 0.566199487669491, + "learning_rate": 1.9246115368762307e-06, + "loss": 0.4267, + "step": 2899 + }, + { + "epoch": 1.22, + "grad_norm": 0.5658880403316346, + "learning_rate": 1.922783198888011e-06, + "loss": 0.4211, + "step": 2900 + }, + { + "epoch": 1.23, + "grad_norm": 0.5861694408434711, + "learning_rate": 1.9209551869512326e-06, + "loss": 0.4142, + "step": 2901 + }, + { + "epoch": 1.23, + "grad_norm": 0.5861893981468904, + "learning_rate": 1.919127502098483e-06, + "loss": 0.4285, + "step": 2902 + }, + { + "epoch": 1.23, + "grad_norm": 0.6057636807953072, + "learning_rate": 1.9173001453621615e-06, + "loss": 0.4334, + "step": 2903 + }, + { + "epoch": 1.23, + "grad_norm": 0.5825991271880557, + "learning_rate": 1.9154731177744858e-06, + "loss": 0.4135, + "step": 2904 + }, + { + "epoch": 1.23, + "grad_norm": 0.5685728541574715, + "learning_rate": 1.913646420367483e-06, + "loss": 0.4239, + "step": 2905 + }, + { + "epoch": 1.23, + "grad_norm": 0.5435808297310615, + "learning_rate": 1.911820054173e-06, + "loss": 0.4092, + "step": 2906 + }, + { + "epoch": 1.23, + "grad_norm": 0.5768278588163496, + "learning_rate": 1.9099940202226895e-06, + "loss": 0.4192, + "step": 2907 + }, + { + "epoch": 1.23, + "grad_norm": 0.5528574586286196, + "learning_rate": 1.908168319548023e-06, + "loss": 0.4395, + "step": 2908 + }, + { + "epoch": 1.23, + "grad_norm": 0.5460290788495442, + "learning_rate": 1.9063429531802788e-06, + "loss": 0.4089, + "step": 2909 + }, + { + "epoch": 1.23, + "grad_norm": 0.5836425311067438, + "learning_rate": 1.9045179221505497e-06, + "loss": 0.4096, + "step": 2910 + }, + { + "epoch": 1.23, + "grad_norm": 0.5368200883277836, + "learning_rate": 1.902693227489737e-06, + "loss": 0.4098, + "step": 2911 + }, + { + "epoch": 1.23, + "eval_loss": 0.46161049604415894, + "eval_runtime": 6937.863, + "eval_samples_per_second": 41.856, + "eval_steps_per_second": 2.093, + "step": 2911 + }, + { + "epoch": 1.23, + "grad_norm": 0.5737532658966875, + "learning_rate": 1.9008688702285532e-06, + "loss": 0.4076, + "step": 2912 + }, + { + "epoch": 1.23, + "grad_norm": 0.5929933734472119, + "learning_rate": 1.899044851397519e-06, + "loss": 0.386, + "step": 2913 + }, + { + "epoch": 1.23, + "grad_norm": 0.5904060193816585, + "learning_rate": 1.8972211720269657e-06, + "loss": 0.4006, + "step": 2914 + }, + { + "epoch": 1.23, + "grad_norm": 0.5744303339968841, + "learning_rate": 1.8953978331470322e-06, + "loss": 0.3955, + "step": 2915 + }, + { + "epoch": 1.23, + "grad_norm": 0.5805107950321803, + "learning_rate": 1.8935748357876626e-06, + "loss": 0.3978, + "step": 2916 + }, + { + "epoch": 1.23, + "grad_norm": 0.6035250400166989, + "learning_rate": 1.8917521809786136e-06, + "loss": 0.4243, + "step": 2917 + }, + { + "epoch": 1.23, + "grad_norm": 0.5889359436095863, + "learning_rate": 1.8899298697494413e-06, + "loss": 0.4332, + "step": 2918 + }, + { + "epoch": 1.23, + "grad_norm": 0.638150416673843, + "learning_rate": 1.8881079031295147e-06, + "loss": 0.4289, + "step": 2919 + }, + { + "epoch": 1.23, + "grad_norm": 0.5711856241860922, + "learning_rate": 1.8862862821480023e-06, + "loss": 0.4198, + "step": 2920 + }, + { + "epoch": 1.23, + "grad_norm": 0.5695997739109594, + "learning_rate": 1.8844650078338818e-06, + "loss": 0.4008, + "step": 2921 + }, + { + "epoch": 1.23, + "grad_norm": 0.5593680674464713, + "learning_rate": 1.8826440812159321e-06, + "loss": 0.3921, + "step": 2922 + }, + { + "epoch": 1.23, + "grad_norm": 0.5684210443299574, + "learning_rate": 1.8808235033227378e-06, + "loss": 0.4109, + "step": 2923 + }, + { + "epoch": 1.24, + "grad_norm": 1.791352852330744, + "learning_rate": 1.8790032751826839e-06, + "loss": 0.4426, + "step": 2924 + }, + { + "epoch": 1.24, + "grad_norm": 0.590981142715526, + "learning_rate": 1.8771833978239615e-06, + "loss": 0.4044, + "step": 2925 + }, + { + "epoch": 1.24, + "grad_norm": 0.5929104028257183, + "learning_rate": 1.8753638722745601e-06, + "loss": 0.4251, + "step": 2926 + }, + { + "epoch": 1.24, + "grad_norm": 0.6088394356034341, + "learning_rate": 1.8735446995622719e-06, + "loss": 0.422, + "step": 2927 + }, + { + "epoch": 1.24, + "grad_norm": 0.5812034504925151, + "learning_rate": 1.8717258807146918e-06, + "loss": 0.4417, + "step": 2928 + }, + { + "epoch": 1.24, + "grad_norm": 0.5634954353609875, + "learning_rate": 1.8699074167592097e-06, + "loss": 0.4221, + "step": 2929 + }, + { + "epoch": 1.24, + "grad_norm": 0.5805870682157763, + "learning_rate": 1.8680893087230207e-06, + "loss": 0.4148, + "step": 2930 + }, + { + "epoch": 1.24, + "grad_norm": 0.5978992039621629, + "learning_rate": 1.866271557633115e-06, + "loss": 0.427, + "step": 2931 + }, + { + "epoch": 1.24, + "grad_norm": 0.5433676698516975, + "learning_rate": 1.8644541645162834e-06, + "loss": 0.4378, + "step": 2932 + }, + { + "epoch": 1.24, + "grad_norm": 0.5654081584633399, + "learning_rate": 1.862637130399112e-06, + "loss": 0.4255, + "step": 2933 + }, + { + "epoch": 1.24, + "grad_norm": 0.5745394540754084, + "learning_rate": 1.8608204563079874e-06, + "loss": 0.4138, + "step": 2934 + }, + { + "epoch": 1.24, + "grad_norm": 0.5651328725524883, + "learning_rate": 1.8590041432690895e-06, + "loss": 0.4289, + "step": 2935 + }, + { + "epoch": 1.24, + "grad_norm": 0.5543604900455075, + "learning_rate": 1.8571881923083976e-06, + "loss": 0.4098, + "step": 2936 + }, + { + "epoch": 1.24, + "grad_norm": 0.5541075458786751, + "learning_rate": 1.8553726044516835e-06, + "loss": 0.405, + "step": 2937 + }, + { + "epoch": 1.24, + "grad_norm": 0.5662395052454143, + "learning_rate": 1.8535573807245155e-06, + "loss": 0.398, + "step": 2938 + }, + { + "epoch": 1.24, + "grad_norm": 0.5415458339050451, + "learning_rate": 1.8517425221522555e-06, + "loss": 0.3944, + "step": 2939 + }, + { + "epoch": 1.24, + "grad_norm": 0.5975828808599232, + "learning_rate": 1.8499280297600594e-06, + "loss": 0.4102, + "step": 2940 + }, + { + "epoch": 1.24, + "grad_norm": 0.5586543614587621, + "learning_rate": 1.848113904572878e-06, + "loss": 0.4071, + "step": 2941 + }, + { + "epoch": 1.24, + "grad_norm": 0.5462110270977989, + "learning_rate": 1.8463001476154508e-06, + "loss": 0.4151, + "step": 2942 + }, + { + "epoch": 1.24, + "grad_norm": 0.5750571044180495, + "learning_rate": 1.844486759912313e-06, + "loss": 0.4111, + "step": 2943 + }, + { + "epoch": 1.24, + "grad_norm": 0.5670618407119518, + "learning_rate": 1.8426737424877883e-06, + "loss": 0.4188, + "step": 2944 + }, + { + "epoch": 1.24, + "grad_norm": 0.5916977571948979, + "learning_rate": 1.840861096365995e-06, + "loss": 0.4323, + "step": 2945 + }, + { + "epoch": 1.24, + "grad_norm": 0.5723973722291331, + "learning_rate": 1.8390488225708364e-06, + "loss": 0.4216, + "step": 2946 + }, + { + "epoch": 1.24, + "grad_norm": 0.5788397104566307, + "learning_rate": 1.8372369221260106e-06, + "loss": 0.4185, + "step": 2947 + }, + { + "epoch": 1.25, + "grad_norm": 0.605618630484289, + "learning_rate": 1.8354253960550017e-06, + "loss": 0.4073, + "step": 2948 + }, + { + "epoch": 1.25, + "grad_norm": 0.5986751285211731, + "learning_rate": 1.833614245381084e-06, + "loss": 0.4254, + "step": 2949 + }, + { + "epoch": 1.25, + "grad_norm": 0.5555679218850047, + "learning_rate": 1.8318034711273181e-06, + "loss": 0.4029, + "step": 2950 + }, + { + "epoch": 1.25, + "grad_norm": 0.5541685467504466, + "learning_rate": 1.8299930743165537e-06, + "loss": 0.3933, + "step": 2951 + }, + { + "epoch": 1.25, + "grad_norm": 0.5797747508458493, + "learning_rate": 1.8281830559714248e-06, + "loss": 0.4167, + "step": 2952 + }, + { + "epoch": 1.25, + "grad_norm": 0.5724155369928727, + "learning_rate": 1.8263734171143552e-06, + "loss": 0.4065, + "step": 2953 + }, + { + "epoch": 1.25, + "grad_norm": 0.5842672183423349, + "learning_rate": 1.8245641587675523e-06, + "loss": 0.4193, + "step": 2954 + }, + { + "epoch": 1.25, + "grad_norm": 0.5670445430376003, + "learning_rate": 1.822755281953007e-06, + "loss": 0.4362, + "step": 2955 + }, + { + "epoch": 1.25, + "grad_norm": 0.5785669095113156, + "learning_rate": 1.8209467876924992e-06, + "loss": 0.4115, + "step": 2956 + }, + { + "epoch": 1.25, + "grad_norm": 0.5942980368738244, + "learning_rate": 1.8191386770075863e-06, + "loss": 0.4273, + "step": 2957 + }, + { + "epoch": 1.25, + "grad_norm": 0.5928392121690472, + "learning_rate": 1.8173309509196158e-06, + "loss": 0.4224, + "step": 2958 + }, + { + "epoch": 1.25, + "grad_norm": 0.5922420607132926, + "learning_rate": 1.8155236104497128e-06, + "loss": 0.4394, + "step": 2959 + }, + { + "epoch": 1.25, + "grad_norm": 0.6223806379306809, + "learning_rate": 1.813716656618788e-06, + "loss": 0.434, + "step": 2960 + }, + { + "epoch": 1.25, + "grad_norm": 0.5845357998232491, + "learning_rate": 1.8119100904475306e-06, + "loss": 0.441, + "step": 2961 + }, + { + "epoch": 1.25, + "grad_norm": 0.5576654968046494, + "learning_rate": 1.8101039129564142e-06, + "loss": 0.4018, + "step": 2962 + }, + { + "epoch": 1.25, + "grad_norm": 0.5734931825622845, + "learning_rate": 1.8082981251656887e-06, + "loss": 0.4122, + "step": 2963 + }, + { + "epoch": 1.25, + "grad_norm": 0.5992264311549484, + "learning_rate": 1.8064927280953893e-06, + "loss": 0.4426, + "step": 2964 + }, + { + "epoch": 1.25, + "grad_norm": 0.5768107642343978, + "learning_rate": 1.8046877227653248e-06, + "loss": 0.388, + "step": 2965 + }, + { + "epoch": 1.25, + "grad_norm": 0.5821336521989418, + "learning_rate": 1.8028831101950866e-06, + "loss": 0.4245, + "step": 2966 + }, + { + "epoch": 1.25, + "grad_norm": 0.5390570291656819, + "learning_rate": 1.8010788914040444e-06, + "loss": 0.3867, + "step": 2967 + }, + { + "epoch": 1.25, + "grad_norm": 0.5704847089539564, + "learning_rate": 1.7992750674113414e-06, + "loss": 0.4278, + "step": 2968 + }, + { + "epoch": 1.25, + "grad_norm": 0.6111690436052009, + "learning_rate": 1.7974716392359026e-06, + "loss": 0.4479, + "step": 2969 + }, + { + "epoch": 1.25, + "grad_norm": 0.5802046152902072, + "learning_rate": 1.7956686078964257e-06, + "loss": 0.4298, + "step": 2970 + }, + { + "epoch": 1.26, + "grad_norm": 0.5455052953532722, + "learning_rate": 1.793865974411388e-06, + "loss": 0.4181, + "step": 2971 + }, + { + "epoch": 1.26, + "grad_norm": 0.5686949830786789, + "learning_rate": 1.7920637397990373e-06, + "loss": 0.4203, + "step": 2972 + }, + { + "epoch": 1.26, + "grad_norm": 0.5957176939693254, + "learning_rate": 1.7902619050774006e-06, + "loss": 0.4026, + "step": 2973 + }, + { + "epoch": 1.26, + "grad_norm": 0.6022161318571446, + "learning_rate": 1.788460471264276e-06, + "loss": 0.3995, + "step": 2974 + }, + { + "epoch": 1.26, + "grad_norm": 0.5831579158285504, + "learning_rate": 1.7866594393772375e-06, + "loss": 0.4035, + "step": 2975 + }, + { + "epoch": 1.26, + "grad_norm": 0.5801852393556498, + "learning_rate": 1.7848588104336293e-06, + "loss": 0.4123, + "step": 2976 + }, + { + "epoch": 1.26, + "grad_norm": 0.5961588266575963, + "learning_rate": 1.783058585450571e-06, + "loss": 0.4317, + "step": 2977 + }, + { + "epoch": 1.26, + "grad_norm": 0.573944779319003, + "learning_rate": 1.781258765444951e-06, + "loss": 0.4344, + "step": 2978 + }, + { + "epoch": 1.26, + "grad_norm": 0.5879109677820142, + "learning_rate": 1.7794593514334313e-06, + "loss": 0.4182, + "step": 2979 + }, + { + "epoch": 1.26, + "grad_norm": 0.5888697412742766, + "learning_rate": 1.7776603444324445e-06, + "loss": 0.4119, + "step": 2980 + }, + { + "epoch": 1.26, + "grad_norm": 0.570679582164104, + "learning_rate": 1.775861745458191e-06, + "loss": 0.4411, + "step": 2981 + }, + { + "epoch": 1.26, + "grad_norm": 0.5775235217718448, + "learning_rate": 1.774063555526644e-06, + "loss": 0.4003, + "step": 2982 + }, + { + "epoch": 1.26, + "eval_loss": 0.4607752859592438, + "eval_runtime": 6938.4518, + "eval_samples_per_second": 41.852, + "eval_steps_per_second": 2.093, + "step": 2982 + }, + { + "epoch": 1.26, + "grad_norm": 0.5772173427303933, + "learning_rate": 1.7722657756535422e-06, + "loss": 0.4292, + "step": 2983 + }, + { + "epoch": 1.26, + "grad_norm": 0.5620226835290322, + "learning_rate": 1.7704684068543953e-06, + "loss": 0.396, + "step": 2984 + }, + { + "epoch": 1.26, + "grad_norm": 0.6017908973225675, + "learning_rate": 1.7686714501444791e-06, + "loss": 0.4228, + "step": 2985 + }, + { + "epoch": 1.26, + "grad_norm": 0.5842773295597439, + "learning_rate": 1.7668749065388385e-06, + "loss": 0.4235, + "step": 2986 + }, + { + "epoch": 1.26, + "grad_norm": 0.5881893778638159, + "learning_rate": 1.7650787770522831e-06, + "loss": 0.3885, + "step": 2987 + }, + { + "epoch": 1.26, + "grad_norm": 0.5921874088208304, + "learning_rate": 1.76328306269939e-06, + "loss": 0.4118, + "step": 2988 + }, + { + "epoch": 1.26, + "grad_norm": 0.5860478876629458, + "learning_rate": 1.7614877644945002e-06, + "loss": 0.4259, + "step": 2989 + }, + { + "epoch": 1.26, + "grad_norm": 0.5566234951598134, + "learning_rate": 1.759692883451721e-06, + "loss": 0.4062, + "step": 2990 + }, + { + "epoch": 1.26, + "grad_norm": 0.5886058120181612, + "learning_rate": 1.757898420584925e-06, + "loss": 0.4157, + "step": 2991 + }, + { + "epoch": 1.26, + "grad_norm": 0.5835204891594994, + "learning_rate": 1.756104376907746e-06, + "loss": 0.4287, + "step": 2992 + }, + { + "epoch": 1.26, + "grad_norm": 0.5820795493170282, + "learning_rate": 1.7543107534335828e-06, + "loss": 0.4224, + "step": 2993 + }, + { + "epoch": 1.26, + "grad_norm": 0.6013433453081255, + "learning_rate": 1.752517551175596e-06, + "loss": 0.4354, + "step": 2994 + }, + { + "epoch": 1.27, + "grad_norm": 0.6033893634449149, + "learning_rate": 1.750724771146709e-06, + "loss": 0.4286, + "step": 2995 + }, + { + "epoch": 1.27, + "grad_norm": 0.54509644376269, + "learning_rate": 1.748932414359605e-06, + "loss": 0.4082, + "step": 2996 + }, + { + "epoch": 1.27, + "grad_norm": 0.5826151129215978, + "learning_rate": 1.7471404818267319e-06, + "loss": 0.4051, + "step": 2997 + }, + { + "epoch": 1.27, + "grad_norm": 0.5880061063869555, + "learning_rate": 1.745348974560293e-06, + "loss": 0.4386, + "step": 2998 + }, + { + "epoch": 1.27, + "grad_norm": 0.5656468304808548, + "learning_rate": 1.743557893572256e-06, + "loss": 0.4146, + "step": 2999 + }, + { + "epoch": 1.27, + "grad_norm": 0.5689192538618583, + "learning_rate": 1.741767239874344e-06, + "loss": 0.4103, + "step": 3000 + }, + { + "epoch": 1.27, + "grad_norm": 0.5840118133437147, + "learning_rate": 1.7399770144780414e-06, + "loss": 0.4199, + "step": 3001 + }, + { + "epoch": 1.27, + "grad_norm": 0.5550874833815063, + "learning_rate": 1.7381872183945885e-06, + "loss": 0.3935, + "step": 3002 + }, + { + "epoch": 1.27, + "grad_norm": 0.5731500465902329, + "learning_rate": 1.7363978526349857e-06, + "loss": 0.4307, + "step": 3003 + }, + { + "epoch": 1.27, + "grad_norm": 0.5579055837980196, + "learning_rate": 1.734608918209989e-06, + "loss": 0.4152, + "step": 3004 + }, + { + "epoch": 1.27, + "grad_norm": 0.5619015288206511, + "learning_rate": 1.7328204161301084e-06, + "loss": 0.4257, + "step": 3005 + }, + { + "epoch": 1.27, + "grad_norm": 0.5685518518480854, + "learning_rate": 1.7310323474056154e-06, + "loss": 0.4292, + "step": 3006 + }, + { + "epoch": 1.27, + "grad_norm": 0.572223039710208, + "learning_rate": 1.7292447130465296e-06, + "loss": 0.4027, + "step": 3007 + }, + { + "epoch": 1.27, + "grad_norm": 0.5923422187875147, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.4185, + "step": 3008 + }, + { + "epoch": 1.27, + "grad_norm": 0.578248418530715, + "learning_rate": 1.7256707514634521e-06, + "loss": 0.4225, + "step": 3009 + }, + { + "epoch": 1.27, + "grad_norm": 0.5738266276609935, + "learning_rate": 1.723884426258277e-06, + "loss": 0.4073, + "step": 3010 + }, + { + "epoch": 1.27, + "grad_norm": 0.5804211498201226, + "learning_rate": 1.7220985394561445e-06, + "loss": 0.4253, + "step": 3011 + }, + { + "epoch": 1.27, + "grad_norm": 0.5703132294176713, + "learning_rate": 1.7203130920658457e-06, + "loss": 0.4127, + "step": 3012 + }, + { + "epoch": 1.27, + "grad_norm": 0.580926877910347, + "learning_rate": 1.7185280850959215e-06, + "loss": 0.4152, + "step": 3013 + }, + { + "epoch": 1.27, + "grad_norm": 0.5807574727618313, + "learning_rate": 1.7167435195546683e-06, + "loss": 0.4133, + "step": 3014 + }, + { + "epoch": 1.27, + "grad_norm": 0.5834413664173587, + "learning_rate": 1.7149593964501285e-06, + "loss": 0.4132, + "step": 3015 + }, + { + "epoch": 1.27, + "grad_norm": 0.592960565681535, + "learning_rate": 1.7131757167900966e-06, + "loss": 0.4159, + "step": 3016 + }, + { + "epoch": 1.27, + "grad_norm": 0.5734356639268737, + "learning_rate": 1.711392481582119e-06, + "loss": 0.4037, + "step": 3017 + }, + { + "epoch": 1.28, + "grad_norm": 0.608289566001257, + "learning_rate": 1.7096096918334853e-06, + "loss": 0.4225, + "step": 3018 + }, + { + "epoch": 1.28, + "grad_norm": 0.5553937161005909, + "learning_rate": 1.7078273485512392e-06, + "loss": 0.428, + "step": 3019 + }, + { + "epoch": 1.28, + "grad_norm": 0.5800080754367256, + "learning_rate": 1.7060454527421688e-06, + "loss": 0.4497, + "step": 3020 + }, + { + "epoch": 1.28, + "grad_norm": 0.5956520471145508, + "learning_rate": 1.7042640054128112e-06, + "loss": 0.4023, + "step": 3021 + }, + { + "epoch": 1.28, + "grad_norm": 0.5513802042755411, + "learning_rate": 1.7024830075694483e-06, + "loss": 0.4057, + "step": 3022 + }, + { + "epoch": 1.28, + "grad_norm": 0.5663784002610377, + "learning_rate": 1.7007024602181105e-06, + "loss": 0.4072, + "step": 3023 + }, + { + "epoch": 1.28, + "grad_norm": 0.5728768465657827, + "learning_rate": 1.6989223643645706e-06, + "loss": 0.4027, + "step": 3024 + }, + { + "epoch": 1.28, + "grad_norm": 0.5540154660402091, + "learning_rate": 1.6971427210143503e-06, + "loss": 0.417, + "step": 3025 + }, + { + "epoch": 1.28, + "grad_norm": 0.580580249925815, + "learning_rate": 1.6953635311727126e-06, + "loss": 0.4203, + "step": 3026 + }, + { + "epoch": 1.28, + "grad_norm": 0.6204069176031851, + "learning_rate": 1.6935847958446657e-06, + "loss": 0.438, + "step": 3027 + }, + { + "epoch": 1.28, + "grad_norm": 0.5536941783725637, + "learning_rate": 1.6918065160349604e-06, + "loss": 0.4255, + "step": 3028 + }, + { + "epoch": 1.28, + "grad_norm": 0.569556045523807, + "learning_rate": 1.6900286927480898e-06, + "loss": 0.4122, + "step": 3029 + }, + { + "epoch": 1.28, + "grad_norm": 0.5546113410095393, + "learning_rate": 1.6882513269882916e-06, + "loss": 0.4194, + "step": 3030 + }, + { + "epoch": 1.28, + "grad_norm": 0.5749490985614878, + "learning_rate": 1.6864744197595418e-06, + "loss": 0.4213, + "step": 3031 + }, + { + "epoch": 1.28, + "grad_norm": 0.5849700553588103, + "learning_rate": 1.68469797206556e-06, + "loss": 0.414, + "step": 3032 + }, + { + "epoch": 1.28, + "grad_norm": 0.5642184593848872, + "learning_rate": 1.6829219849098035e-06, + "loss": 0.4217, + "step": 3033 + }, + { + "epoch": 1.28, + "grad_norm": 0.5928018316436391, + "learning_rate": 1.681146459295473e-06, + "loss": 0.4244, + "step": 3034 + }, + { + "epoch": 1.28, + "grad_norm": 0.591820965440071, + "learning_rate": 1.6793713962255043e-06, + "loss": 0.4609, + "step": 3035 + }, + { + "epoch": 1.28, + "grad_norm": 0.5886651969708854, + "learning_rate": 1.6775967967025764e-06, + "loss": 0.418, + "step": 3036 + }, + { + "epoch": 1.28, + "grad_norm": 0.563924203947021, + "learning_rate": 1.675822661729103e-06, + "loss": 0.3846, + "step": 3037 + }, + { + "epoch": 1.28, + "grad_norm": 0.5882580889291531, + "learning_rate": 1.674048992307237e-06, + "loss": 0.4225, + "step": 3038 + }, + { + "epoch": 1.28, + "grad_norm": 0.5609824212530277, + "learning_rate": 1.6722757894388675e-06, + "loss": 0.3968, + "step": 3039 + }, + { + "epoch": 1.28, + "grad_norm": 0.5783821889365726, + "learning_rate": 1.6705030541256211e-06, + "loss": 0.4044, + "step": 3040 + }, + { + "epoch": 1.29, + "grad_norm": 0.5904279554578172, + "learning_rate": 1.6687307873688583e-06, + "loss": 0.4334, + "step": 3041 + }, + { + "epoch": 1.29, + "grad_norm": 0.6379074710732456, + "learning_rate": 1.6669589901696778e-06, + "loss": 0.4264, + "step": 3042 + }, + { + "epoch": 1.29, + "grad_norm": 0.5522942159204164, + "learning_rate": 1.665187663528912e-06, + "loss": 0.4256, + "step": 3043 + }, + { + "epoch": 1.29, + "grad_norm": 0.5665275799582181, + "learning_rate": 1.6634168084471252e-06, + "loss": 0.4099, + "step": 3044 + }, + { + "epoch": 1.29, + "grad_norm": 0.5800540818159535, + "learning_rate": 1.661646425924619e-06, + "loss": 0.4272, + "step": 3045 + }, + { + "epoch": 1.29, + "grad_norm": 0.59834302679002, + "learning_rate": 1.6598765169614245e-06, + "loss": 0.417, + "step": 3046 + }, + { + "epoch": 1.29, + "grad_norm": 0.5425819631570858, + "learning_rate": 1.6581070825573093e-06, + "loss": 0.4123, + "step": 3047 + }, + { + "epoch": 1.29, + "grad_norm": 0.549947011538962, + "learning_rate": 1.6563381237117688e-06, + "loss": 0.4141, + "step": 3048 + }, + { + "epoch": 1.29, + "grad_norm": 0.5708925242028148, + "learning_rate": 1.6545696414240326e-06, + "loss": 0.4306, + "step": 3049 + }, + { + "epoch": 1.29, + "grad_norm": 0.5742846219284804, + "learning_rate": 1.6528016366930594e-06, + "loss": 0.4147, + "step": 3050 + }, + { + "epoch": 1.29, + "grad_norm": 0.6145663624908198, + "learning_rate": 1.6510341105175401e-06, + "loss": 0.4262, + "step": 3051 + }, + { + "epoch": 1.29, + "grad_norm": 0.6072916721819851, + "learning_rate": 1.6492670638958924e-06, + "loss": 0.441, + "step": 3052 + }, + { + "epoch": 1.29, + "grad_norm": 0.5834453619650581, + "learning_rate": 1.647500497826267e-06, + "loss": 0.4159, + "step": 3053 + }, + { + "epoch": 1.29, + "eval_loss": 0.4601620137691498, + "eval_runtime": 6944.585, + "eval_samples_per_second": 41.815, + "eval_steps_per_second": 2.091, + "step": 3053 + }, + { + "epoch": 1.29, + "grad_norm": 0.558687925184084, + "learning_rate": 1.6457344133065395e-06, + "loss": 0.4053, + "step": 3054 + }, + { + "epoch": 1.29, + "grad_norm": 0.5674610161351168, + "learning_rate": 1.643968811334315e-06, + "loss": 0.417, + "step": 3055 + }, + { + "epoch": 1.29, + "grad_norm": 0.5640253416506891, + "learning_rate": 1.642203692906927e-06, + "loss": 0.3994, + "step": 3056 + }, + { + "epoch": 1.29, + "grad_norm": 0.5508590652466895, + "learning_rate": 1.640439059021433e-06, + "loss": 0.4221, + "step": 3057 + }, + { + "epoch": 1.29, + "grad_norm": 0.5938928064517502, + "learning_rate": 1.6386749106746214e-06, + "loss": 0.4267, + "step": 3058 + }, + { + "epoch": 1.29, + "grad_norm": 0.5570296609089503, + "learning_rate": 1.6369112488630009e-06, + "loss": 0.4032, + "step": 3059 + }, + { + "epoch": 1.29, + "grad_norm": 0.5854299661276051, + "learning_rate": 1.6351480745828098e-06, + "loss": 0.4095, + "step": 3060 + }, + { + "epoch": 1.29, + "grad_norm": 0.559667720654985, + "learning_rate": 1.6333853888300083e-06, + "loss": 0.4221, + "step": 3061 + }, + { + "epoch": 1.29, + "grad_norm": 0.5735241871511867, + "learning_rate": 1.6316231926002823e-06, + "loss": 0.395, + "step": 3062 + }, + { + "epoch": 1.29, + "grad_norm": 0.5416537755969776, + "learning_rate": 1.629861486889039e-06, + "loss": 0.3963, + "step": 3063 + }, + { + "epoch": 1.29, + "grad_norm": 0.5673587035921341, + "learning_rate": 1.6281002726914125e-06, + "loss": 0.4109, + "step": 3064 + }, + { + "epoch": 1.3, + "grad_norm": 0.5433392636524627, + "learning_rate": 1.6263395510022546e-06, + "loss": 0.4001, + "step": 3065 + }, + { + "epoch": 1.3, + "grad_norm": 0.5550782192229988, + "learning_rate": 1.6245793228161421e-06, + "loss": 0.4045, + "step": 3066 + }, + { + "epoch": 1.3, + "grad_norm": 0.5861247355677058, + "learning_rate": 1.622819589127372e-06, + "loss": 0.4346, + "step": 3067 + }, + { + "epoch": 1.3, + "grad_norm": 0.607146634130001, + "learning_rate": 1.6210603509299604e-06, + "loss": 0.4115, + "step": 3068 + }, + { + "epoch": 1.3, + "grad_norm": 0.5562854235248, + "learning_rate": 1.6193016092176484e-06, + "loss": 0.4203, + "step": 3069 + }, + { + "epoch": 1.3, + "grad_norm": 0.5627927077991777, + "learning_rate": 1.6175433649838901e-06, + "loss": 0.4262, + "step": 3070 + }, + { + "epoch": 1.3, + "grad_norm": 0.5750081673543436, + "learning_rate": 1.615785619221864e-06, + "loss": 0.3833, + "step": 3071 + }, + { + "epoch": 1.3, + "grad_norm": 0.5936159776999882, + "learning_rate": 1.6140283729244638e-06, + "loss": 0.4203, + "step": 3072 + }, + { + "epoch": 1.3, + "grad_norm": 0.5718250335162808, + "learning_rate": 1.6122716270843025e-06, + "loss": 0.4065, + "step": 3073 + }, + { + "epoch": 1.3, + "grad_norm": 0.5771598792199492, + "learning_rate": 1.6105153826937087e-06, + "loss": 0.4185, + "step": 3074 + }, + { + "epoch": 1.3, + "grad_norm": 0.5759622276853105, + "learning_rate": 1.6087596407447314e-06, + "loss": 0.42, + "step": 3075 + }, + { + "epoch": 1.3, + "grad_norm": 0.611717304812912, + "learning_rate": 1.607004402229132e-06, + "loss": 0.4072, + "step": 3076 + }, + { + "epoch": 1.3, + "grad_norm": 0.5872035309615434, + "learning_rate": 1.60524966813839e-06, + "loss": 0.4354, + "step": 3077 + }, + { + "epoch": 1.3, + "grad_norm": 0.5811950216335751, + "learning_rate": 1.6034954394636977e-06, + "loss": 0.4073, + "step": 3078 + }, + { + "epoch": 1.3, + "grad_norm": 0.5706512241287943, + "learning_rate": 1.6017417171959643e-06, + "loss": 0.4203, + "step": 3079 + }, + { + "epoch": 1.3, + "grad_norm": 0.5826675112936385, + "learning_rate": 1.5999885023258099e-06, + "loss": 0.3995, + "step": 3080 + }, + { + "epoch": 1.3, + "grad_norm": 0.5682279192683956, + "learning_rate": 1.5982357958435723e-06, + "loss": 0.3878, + "step": 3081 + }, + { + "epoch": 1.3, + "grad_norm": 0.5713141319558672, + "learning_rate": 1.5964835987392991e-06, + "loss": 0.4014, + "step": 3082 + }, + { + "epoch": 1.3, + "grad_norm": 0.5747223757155583, + "learning_rate": 1.59473191200275e-06, + "loss": 0.4146, + "step": 3083 + }, + { + "epoch": 1.3, + "grad_norm": 0.5809784351004627, + "learning_rate": 1.5929807366233979e-06, + "loss": 0.4445, + "step": 3084 + }, + { + "epoch": 1.3, + "grad_norm": 0.5632117792726165, + "learning_rate": 1.5912300735904252e-06, + "loss": 0.3946, + "step": 3085 + }, + { + "epoch": 1.3, + "grad_norm": 0.581279032610451, + "learning_rate": 1.5894799238927277e-06, + "loss": 0.395, + "step": 3086 + }, + { + "epoch": 1.3, + "grad_norm": 0.5877431541038257, + "learning_rate": 1.5877302885189077e-06, + "loss": 0.4212, + "step": 3087 + }, + { + "epoch": 1.31, + "grad_norm": 0.6178627095748951, + "learning_rate": 1.5859811684572796e-06, + "loss": 0.4237, + "step": 3088 + }, + { + "epoch": 1.31, + "grad_norm": 0.5791179237596596, + "learning_rate": 1.584232564695865e-06, + "loss": 0.4243, + "step": 3089 + }, + { + "epoch": 1.31, + "grad_norm": 0.5657862133711392, + "learning_rate": 1.5824844782223956e-06, + "loss": 0.4234, + "step": 3090 + }, + { + "epoch": 1.31, + "grad_norm": 0.5767524507801428, + "learning_rate": 1.5807369100243084e-06, + "loss": 0.4211, + "step": 3091 + }, + { + "epoch": 1.31, + "grad_norm": 0.5697331723482661, + "learning_rate": 1.578989861088751e-06, + "loss": 0.414, + "step": 3092 + }, + { + "epoch": 1.31, + "grad_norm": 0.5476753293523711, + "learning_rate": 1.5772433324025748e-06, + "loss": 0.406, + "step": 3093 + }, + { + "epoch": 1.31, + "grad_norm": 0.5561716385297626, + "learning_rate": 1.5754973249523387e-06, + "loss": 0.4155, + "step": 3094 + }, + { + "epoch": 1.31, + "grad_norm": 0.572512956688457, + "learning_rate": 1.5737518397243074e-06, + "loss": 0.4286, + "step": 3095 + }, + { + "epoch": 1.31, + "grad_norm": 0.5939672288238211, + "learning_rate": 1.5720068777044479e-06, + "loss": 0.4276, + "step": 3096 + }, + { + "epoch": 1.31, + "grad_norm": 0.5966500948903306, + "learning_rate": 1.570262439878437e-06, + "loss": 0.4211, + "step": 3097 + }, + { + "epoch": 1.31, + "grad_norm": 0.5952090362230588, + "learning_rate": 1.56851852723165e-06, + "loss": 0.4447, + "step": 3098 + }, + { + "epoch": 1.31, + "grad_norm": 0.5673871145350723, + "learning_rate": 1.5667751407491689e-06, + "loss": 0.4408, + "step": 3099 + }, + { + "epoch": 1.31, + "grad_norm": 0.5663044725908228, + "learning_rate": 1.5650322814157764e-06, + "loss": 0.4219, + "step": 3100 + }, + { + "epoch": 1.31, + "grad_norm": 0.5902068842643253, + "learning_rate": 1.5632899502159594e-06, + "loss": 0.4104, + "step": 3101 + }, + { + "epoch": 1.31, + "grad_norm": 0.5961011802361124, + "learning_rate": 1.561548148133904e-06, + "loss": 0.3999, + "step": 3102 + }, + { + "epoch": 1.31, + "grad_norm": 0.5676333875668861, + "learning_rate": 1.559806876153501e-06, + "loss": 0.42, + "step": 3103 + }, + { + "epoch": 1.31, + "grad_norm": 0.5717204537217901, + "learning_rate": 1.5580661352583377e-06, + "loss": 0.3927, + "step": 3104 + }, + { + "epoch": 1.31, + "grad_norm": 0.5932365555296837, + "learning_rate": 1.5563259264317048e-06, + "loss": 0.4069, + "step": 3105 + }, + { + "epoch": 1.31, + "grad_norm": 0.60609083336309, + "learning_rate": 1.55458625065659e-06, + "loss": 0.4232, + "step": 3106 + }, + { + "epoch": 1.31, + "grad_norm": 0.5568592972383142, + "learning_rate": 1.5528471089156805e-06, + "loss": 0.4035, + "step": 3107 + }, + { + "epoch": 1.31, + "grad_norm": 0.5811068115782658, + "learning_rate": 1.5511085021913644e-06, + "loss": 0.4381, + "step": 3108 + }, + { + "epoch": 1.31, + "grad_norm": 0.5978207369401186, + "learning_rate": 1.5493704314657232e-06, + "loss": 0.438, + "step": 3109 + }, + { + "epoch": 1.31, + "grad_norm": 0.5720438095205538, + "learning_rate": 1.5476328977205396e-06, + "loss": 0.4279, + "step": 3110 + }, + { + "epoch": 1.31, + "grad_norm": 0.5893474852498958, + "learning_rate": 1.5458959019372893e-06, + "loss": 0.431, + "step": 3111 + }, + { + "epoch": 1.32, + "grad_norm": 0.5490466454197072, + "learning_rate": 1.544159445097148e-06, + "loss": 0.4092, + "step": 3112 + }, + { + "epoch": 1.32, + "grad_norm": 0.5384899530575747, + "learning_rate": 1.542423528180983e-06, + "loss": 0.3936, + "step": 3113 + }, + { + "epoch": 1.32, + "grad_norm": 0.5850096033351098, + "learning_rate": 1.5406881521693606e-06, + "loss": 0.4019, + "step": 3114 + }, + { + "epoch": 1.32, + "grad_norm": 0.587195704302166, + "learning_rate": 1.5389533180425387e-06, + "loss": 0.4144, + "step": 3115 + }, + { + "epoch": 1.32, + "grad_norm": 0.5635366537239104, + "learning_rate": 1.5372190267804704e-06, + "loss": 0.4129, + "step": 3116 + }, + { + "epoch": 1.32, + "grad_norm": 0.5718044990589992, + "learning_rate": 1.5354852793628007e-06, + "loss": 0.4009, + "step": 3117 + }, + { + "epoch": 1.32, + "grad_norm": 0.5926450332610386, + "learning_rate": 1.53375207676887e-06, + "loss": 0.4348, + "step": 3118 + }, + { + "epoch": 1.32, + "grad_norm": 0.564012302965777, + "learning_rate": 1.5320194199777078e-06, + "loss": 0.4008, + "step": 3119 + }, + { + "epoch": 1.32, + "grad_norm": 0.5881564721462378, + "learning_rate": 1.5302873099680378e-06, + "loss": 0.4116, + "step": 3120 + }, + { + "epoch": 1.32, + "grad_norm": 0.5805570882385258, + "learning_rate": 1.5285557477182744e-06, + "loss": 0.3955, + "step": 3121 + }, + { + "epoch": 1.32, + "grad_norm": 0.601026663572398, + "learning_rate": 1.5268247342065215e-06, + "loss": 0.4222, + "step": 3122 + }, + { + "epoch": 1.32, + "grad_norm": 0.5688916955974609, + "learning_rate": 1.525094270410574e-06, + "loss": 0.4077, + "step": 3123 + }, + { + "epoch": 1.32, + "grad_norm": 0.6376814875861784, + "learning_rate": 1.5233643573079148e-06, + "loss": 0.4198, + "step": 3124 + }, + { + "epoch": 1.32, + "eval_loss": 0.45952433347702026, + "eval_runtime": 6938.4565, + "eval_samples_per_second": 41.852, + "eval_steps_per_second": 2.093, + "step": 3124 + }, + { + "epoch": 1.32, + "grad_norm": 0.5675589935533134, + "learning_rate": 1.5216349958757187e-06, + "loss": 0.4276, + "step": 3125 + }, + { + "epoch": 1.32, + "grad_norm": 0.5819253780607699, + "learning_rate": 1.5199061870908457e-06, + "loss": 0.4135, + "step": 3126 + }, + { + "epoch": 1.32, + "grad_norm": 0.5473928976130037, + "learning_rate": 1.518177931929846e-06, + "loss": 0.39, + "step": 3127 + }, + { + "epoch": 1.32, + "grad_norm": 0.5668895033063771, + "learning_rate": 1.516450231368955e-06, + "loss": 0.4145, + "step": 3128 + }, + { + "epoch": 1.32, + "grad_norm": 0.5566976729596672, + "learning_rate": 1.5147230863840968e-06, + "loss": 0.417, + "step": 3129 + }, + { + "epoch": 1.32, + "grad_norm": 0.5599445867608915, + "learning_rate": 1.5129964979508792e-06, + "loss": 0.4049, + "step": 3130 + }, + { + "epoch": 1.32, + "grad_norm": 0.5689525691882638, + "learning_rate": 1.5112704670445994e-06, + "loss": 0.414, + "step": 3131 + }, + { + "epoch": 1.32, + "grad_norm": 0.5745693019904166, + "learning_rate": 1.509544994640236e-06, + "loss": 0.415, + "step": 3132 + }, + { + "epoch": 1.32, + "grad_norm": 0.5703134113139972, + "learning_rate": 1.507820081712454e-06, + "loss": 0.4202, + "step": 3133 + }, + { + "epoch": 1.32, + "grad_norm": 0.5798680593678225, + "learning_rate": 1.5060957292356021e-06, + "loss": 0.4224, + "step": 3134 + }, + { + "epoch": 1.33, + "grad_norm": 0.5634109776233843, + "learning_rate": 1.5043719381837113e-06, + "loss": 0.4165, + "step": 3135 + }, + { + "epoch": 1.33, + "grad_norm": 0.6007001621304139, + "learning_rate": 1.5026487095304982e-06, + "loss": 0.4143, + "step": 3136 + }, + { + "epoch": 1.33, + "grad_norm": 0.5816452933961378, + "learning_rate": 1.5009260442493582e-06, + "loss": 0.3987, + "step": 3137 + }, + { + "epoch": 1.33, + "grad_norm": 0.5520751522048931, + "learning_rate": 1.4992039433133715e-06, + "loss": 0.4151, + "step": 3138 + }, + { + "epoch": 1.33, + "grad_norm": 0.6053997190844257, + "learning_rate": 1.497482407695297e-06, + "loss": 0.4311, + "step": 3139 + }, + { + "epoch": 1.33, + "grad_norm": 0.5574155007809499, + "learning_rate": 1.495761438367577e-06, + "loss": 0.3934, + "step": 3140 + }, + { + "epoch": 1.33, + "grad_norm": 0.5612828220380264, + "learning_rate": 1.4940410363023306e-06, + "loss": 0.3898, + "step": 3141 + }, + { + "epoch": 1.33, + "grad_norm": 0.5900288585719455, + "learning_rate": 1.4923212024713602e-06, + "loss": 0.4369, + "step": 3142 + }, + { + "epoch": 1.33, + "grad_norm": 0.5818357215539803, + "learning_rate": 1.4906019378461437e-06, + "loss": 0.4448, + "step": 3143 + }, + { + "epoch": 1.33, + "grad_norm": 0.553139692347851, + "learning_rate": 1.4888832433978403e-06, + "loss": 0.426, + "step": 3144 + }, + { + "epoch": 1.33, + "grad_norm": 0.5952446958459735, + "learning_rate": 1.4871651200972854e-06, + "loss": 0.4246, + "step": 3145 + }, + { + "epoch": 1.33, + "grad_norm": 0.6187484191420308, + "learning_rate": 1.485447568914991e-06, + "loss": 0.4145, + "step": 3146 + }, + { + "epoch": 1.33, + "grad_norm": 0.6002358417607456, + "learning_rate": 1.4837305908211502e-06, + "loss": 0.407, + "step": 3147 + }, + { + "epoch": 1.33, + "grad_norm": 0.55885003361605, + "learning_rate": 1.4820141867856268e-06, + "loss": 0.3977, + "step": 3148 + }, + { + "epoch": 1.33, + "grad_norm": 0.5703620407770618, + "learning_rate": 1.4802983577779651e-06, + "loss": 0.4402, + "step": 3149 + }, + { + "epoch": 1.33, + "grad_norm": 0.5752264024437074, + "learning_rate": 1.47858310476738e-06, + "loss": 0.4081, + "step": 3150 + }, + { + "epoch": 1.33, + "grad_norm": 0.587484294352918, + "learning_rate": 1.4768684287227652e-06, + "loss": 0.4258, + "step": 3151 + }, + { + "epoch": 1.33, + "grad_norm": 0.577277316397856, + "learning_rate": 1.4751543306126856e-06, + "loss": 0.4076, + "step": 3152 + }, + { + "epoch": 1.33, + "grad_norm": 0.5585293413380346, + "learning_rate": 1.4734408114053822e-06, + "loss": 0.4066, + "step": 3153 + }, + { + "epoch": 1.33, + "grad_norm": 0.5996038294832453, + "learning_rate": 1.471727872068766e-06, + "loss": 0.413, + "step": 3154 + }, + { + "epoch": 1.33, + "grad_norm": 0.5908728837224524, + "learning_rate": 1.470015513570424e-06, + "loss": 0.4135, + "step": 3155 + }, + { + "epoch": 1.33, + "grad_norm": 0.5973053416179713, + "learning_rate": 1.468303736877611e-06, + "loss": 0.4216, + "step": 3156 + }, + { + "epoch": 1.33, + "grad_norm": 0.5783789948908563, + "learning_rate": 1.466592542957257e-06, + "loss": 0.4292, + "step": 3157 + }, + { + "epoch": 1.34, + "grad_norm": 0.549412909563158, + "learning_rate": 1.4648819327759589e-06, + "loss": 0.4151, + "step": 3158 + }, + { + "epoch": 1.34, + "grad_norm": 0.5638177225917886, + "learning_rate": 1.4631719072999884e-06, + "loss": 0.43, + "step": 3159 + }, + { + "epoch": 1.34, + "grad_norm": 0.5811841444119168, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.4199, + "step": 3160 + }, + { + "epoch": 1.34, + "grad_norm": 0.5745757615240445, + "learning_rate": 1.4597536143274537e-06, + "loss": 0.4223, + "step": 3161 + }, + { + "epoch": 1.34, + "grad_norm": 0.5695416398603609, + "learning_rate": 1.4580453487617747e-06, + "loss": 0.403, + "step": 3162 + }, + { + "epoch": 1.34, + "grad_norm": 0.5643072461632591, + "learning_rate": 1.4563376717631906e-06, + "loss": 0.4024, + "step": 3163 + }, + { + "epoch": 1.34, + "grad_norm": 0.5855374862753647, + "learning_rate": 1.4546305842963156e-06, + "loss": 0.4169, + "step": 3164 + }, + { + "epoch": 1.34, + "grad_norm": 0.5640742939680888, + "learning_rate": 1.452924087325428e-06, + "loss": 0.3986, + "step": 3165 + }, + { + "epoch": 1.34, + "grad_norm": 0.5854422692893917, + "learning_rate": 1.4512181818144763e-06, + "loss": 0.4092, + "step": 3166 + }, + { + "epoch": 1.34, + "grad_norm": 0.6013198747012948, + "learning_rate": 1.4495128687270682e-06, + "loss": 0.4147, + "step": 3167 + }, + { + "epoch": 1.34, + "grad_norm": 0.6349112878398844, + "learning_rate": 1.4478081490264841e-06, + "loss": 0.4013, + "step": 3168 + }, + { + "epoch": 1.34, + "grad_norm": 0.6018833225607165, + "learning_rate": 1.4461040236756643e-06, + "loss": 0.4034, + "step": 3169 + }, + { + "epoch": 1.34, + "grad_norm": 0.572757916476602, + "learning_rate": 1.4444004936372166e-06, + "loss": 0.4139, + "step": 3170 + }, + { + "epoch": 1.34, + "grad_norm": 0.5964839570478256, + "learning_rate": 1.4426975598734103e-06, + "loss": 0.4403, + "step": 3171 + }, + { + "epoch": 1.34, + "grad_norm": 0.6532887857894302, + "learning_rate": 1.4409952233461777e-06, + "loss": 0.4089, + "step": 3172 + }, + { + "epoch": 1.34, + "grad_norm": 0.599992570422661, + "learning_rate": 1.4392934850171161e-06, + "loss": 0.4328, + "step": 3173 + }, + { + "epoch": 1.34, + "grad_norm": 0.5924831441960442, + "learning_rate": 1.4375923458474822e-06, + "loss": 0.4397, + "step": 3174 + }, + { + "epoch": 1.34, + "grad_norm": 0.5694039028586787, + "learning_rate": 1.4358918067981969e-06, + "loss": 0.4052, + "step": 3175 + }, + { + "epoch": 1.34, + "grad_norm": 0.5638688789990264, + "learning_rate": 1.434191868829839e-06, + "loss": 0.4306, + "step": 3176 + }, + { + "epoch": 1.34, + "grad_norm": 0.6070863428317758, + "learning_rate": 1.4324925329026526e-06, + "loss": 0.4069, + "step": 3177 + }, + { + "epoch": 1.34, + "grad_norm": 0.5970977280572529, + "learning_rate": 1.4307937999765343e-06, + "loss": 0.4264, + "step": 3178 + }, + { + "epoch": 1.34, + "grad_norm": 0.5400844435082709, + "learning_rate": 1.4290956710110477e-06, + "loss": 0.4114, + "step": 3179 + }, + { + "epoch": 1.34, + "grad_norm": 0.6189005664696162, + "learning_rate": 1.4273981469654093e-06, + "loss": 0.4393, + "step": 3180 + }, + { + "epoch": 1.34, + "grad_norm": 0.5551865309592269, + "learning_rate": 1.4257012287984994e-06, + "loss": 0.3815, + "step": 3181 + }, + { + "epoch": 1.35, + "grad_norm": 0.5788766743190364, + "learning_rate": 1.4240049174688514e-06, + "loss": 0.4129, + "step": 3182 + }, + { + "epoch": 1.35, + "grad_norm": 0.6145740654828784, + "learning_rate": 1.4223092139346583e-06, + "loss": 0.4465, + "step": 3183 + }, + { + "epoch": 1.35, + "grad_norm": 0.56123366419593, + "learning_rate": 1.4206141191537681e-06, + "loss": 0.4226, + "step": 3184 + }, + { + "epoch": 1.35, + "grad_norm": 0.5702082132713671, + "learning_rate": 1.4189196340836866e-06, + "loss": 0.4407, + "step": 3185 + }, + { + "epoch": 1.35, + "grad_norm": 0.568086319678547, + "learning_rate": 1.4172257596815762e-06, + "loss": 0.4235, + "step": 3186 + }, + { + "epoch": 1.35, + "grad_norm": 0.570650833706996, + "learning_rate": 1.41553249690425e-06, + "loss": 0.397, + "step": 3187 + }, + { + "epoch": 1.35, + "grad_norm": 0.5758906759310528, + "learning_rate": 1.413839846708182e-06, + "loss": 0.4008, + "step": 3188 + }, + { + "epoch": 1.35, + "grad_norm": 0.5867057883601531, + "learning_rate": 1.4121478100494926e-06, + "loss": 0.3931, + "step": 3189 + }, + { + "epoch": 1.35, + "grad_norm": 0.6216370173036398, + "learning_rate": 1.4104563878839623e-06, + "loss": 0.432, + "step": 3190 + }, + { + "epoch": 1.35, + "grad_norm": 0.5834896894589426, + "learning_rate": 1.4087655811670196e-06, + "loss": 0.3892, + "step": 3191 + }, + { + "epoch": 1.35, + "grad_norm": 0.55572187008008, + "learning_rate": 1.4070753908537498e-06, + "loss": 0.3945, + "step": 3192 + }, + { + "epoch": 1.35, + "grad_norm": 0.6195431150659669, + "learning_rate": 1.4053858178988866e-06, + "loss": 0.4252, + "step": 3193 + }, + { + "epoch": 1.35, + "grad_norm": 0.5855307932366061, + "learning_rate": 1.4036968632568163e-06, + "loss": 0.4023, + "step": 3194 + }, + { + "epoch": 1.35, + "grad_norm": 0.5785248405460688, + "learning_rate": 1.4020085278815745e-06, + "loss": 0.4187, + "step": 3195 + }, + { + "epoch": 1.35, + "eval_loss": 0.4585270285606384, + "eval_runtime": 6935.0493, + "eval_samples_per_second": 41.873, + "eval_steps_per_second": 2.094, + "step": 3195 + }, + { + "epoch": 1.35, + "grad_norm": 0.5793269431905544, + "learning_rate": 1.4003208127268503e-06, + "loss": 0.4362, + "step": 3196 + }, + { + "epoch": 1.35, + "grad_norm": 0.5635960927267711, + "learning_rate": 1.3986337187459787e-06, + "loss": 0.396, + "step": 3197 + }, + { + "epoch": 1.35, + "grad_norm": 0.5675588328279935, + "learning_rate": 1.3969472468919462e-06, + "loss": 0.4357, + "step": 3198 + }, + { + "epoch": 1.35, + "grad_norm": 0.5813371441566617, + "learning_rate": 1.3952613981173894e-06, + "loss": 0.4202, + "step": 3199 + }, + { + "epoch": 1.35, + "grad_norm": 0.5567802235455163, + "learning_rate": 1.3935761733745865e-06, + "loss": 0.4131, + "step": 3200 + }, + { + "epoch": 1.35, + "grad_norm": 0.5821477938771911, + "learning_rate": 1.3918915736154704e-06, + "loss": 0.4214, + "step": 3201 + }, + { + "epoch": 1.35, + "grad_norm": 0.5964536001348693, + "learning_rate": 1.3902075997916164e-06, + "loss": 0.4336, + "step": 3202 + }, + { + "epoch": 1.35, + "grad_norm": 0.6086865744806292, + "learning_rate": 1.3885242528542497e-06, + "loss": 0.4551, + "step": 3203 + }, + { + "epoch": 1.35, + "grad_norm": 0.575959160272487, + "learning_rate": 1.3868415337542382e-06, + "loss": 0.4224, + "step": 3204 + }, + { + "epoch": 1.36, + "grad_norm": 0.6026326885124447, + "learning_rate": 1.3851594434420968e-06, + "loss": 0.4202, + "step": 3205 + }, + { + "epoch": 1.36, + "grad_norm": 0.5493404687568753, + "learning_rate": 1.383477982867984e-06, + "loss": 0.3954, + "step": 3206 + }, + { + "epoch": 1.36, + "grad_norm": 0.5781348712939294, + "learning_rate": 1.3817971529817054e-06, + "loss": 0.3956, + "step": 3207 + }, + { + "epoch": 1.36, + "grad_norm": 0.5848758929707726, + "learning_rate": 1.380116954732706e-06, + "loss": 0.4298, + "step": 3208 + }, + { + "epoch": 1.36, + "grad_norm": 0.5839753363394727, + "learning_rate": 1.3784373890700789e-06, + "loss": 0.4159, + "step": 3209 + }, + { + "epoch": 1.36, + "grad_norm": 0.5473659827496421, + "learning_rate": 1.3767584569425562e-06, + "loss": 0.3982, + "step": 3210 + }, + { + "epoch": 1.36, + "grad_norm": 0.5854856789628843, + "learning_rate": 1.375080159298513e-06, + "loss": 0.3985, + "step": 3211 + }, + { + "epoch": 1.36, + "grad_norm": 0.6223285089698908, + "learning_rate": 1.3734024970859672e-06, + "loss": 0.3924, + "step": 3212 + }, + { + "epoch": 1.36, + "grad_norm": 0.5818260019212478, + "learning_rate": 1.3717254712525758e-06, + "loss": 0.4091, + "step": 3213 + }, + { + "epoch": 1.36, + "grad_norm": 0.588671059523623, + "learning_rate": 1.3700490827456393e-06, + "loss": 0.4278, + "step": 3214 + }, + { + "epoch": 1.36, + "grad_norm": 0.542860404907633, + "learning_rate": 1.3683733325120934e-06, + "loss": 0.4131, + "step": 3215 + }, + { + "epoch": 1.36, + "grad_norm": 0.5562613227316864, + "learning_rate": 1.3666982214985208e-06, + "loss": 0.4245, + "step": 3216 + }, + { + "epoch": 1.36, + "grad_norm": 4.123028508699682, + "learning_rate": 1.3650237506511333e-06, + "loss": 0.4637, + "step": 3217 + }, + { + "epoch": 1.36, + "grad_norm": 0.590425927959101, + "learning_rate": 1.3633499209157898e-06, + "loss": 0.4267, + "step": 3218 + }, + { + "epoch": 1.36, + "grad_norm": 0.5832844350075718, + "learning_rate": 1.3616767332379815e-06, + "loss": 0.4182, + "step": 3219 + }, + { + "epoch": 1.36, + "grad_norm": 0.5960509584308913, + "learning_rate": 1.360004188562841e-06, + "loss": 0.4169, + "step": 3220 + }, + { + "epoch": 1.36, + "grad_norm": 0.592810575458578, + "learning_rate": 1.3583322878351346e-06, + "loss": 0.427, + "step": 3221 + }, + { + "epoch": 1.36, + "grad_norm": 0.5842647152175641, + "learning_rate": 1.3566610319992658e-06, + "loss": 0.4101, + "step": 3222 + }, + { + "epoch": 1.36, + "grad_norm": 0.5687572730019927, + "learning_rate": 1.3549904219992732e-06, + "loss": 0.4027, + "step": 3223 + }, + { + "epoch": 1.36, + "grad_norm": 0.563025461962013, + "learning_rate": 1.3533204587788323e-06, + "loss": 0.4068, + "step": 3224 + }, + { + "epoch": 1.36, + "grad_norm": 0.5694816221991685, + "learning_rate": 1.351651143281253e-06, + "loss": 0.4101, + "step": 3225 + }, + { + "epoch": 1.36, + "grad_norm": 0.5642547386099277, + "learning_rate": 1.3499824764494773e-06, + "loss": 0.4127, + "step": 3226 + }, + { + "epoch": 1.36, + "grad_norm": 0.5480375629168616, + "learning_rate": 1.3483144592260844e-06, + "loss": 0.397, + "step": 3227 + }, + { + "epoch": 1.36, + "grad_norm": 0.5653603918404666, + "learning_rate": 1.346647092553281e-06, + "loss": 0.4109, + "step": 3228 + }, + { + "epoch": 1.37, + "grad_norm": 0.5806409790053582, + "learning_rate": 1.3449803773729115e-06, + "loss": 0.3967, + "step": 3229 + }, + { + "epoch": 1.37, + "grad_norm": 0.5605529434144851, + "learning_rate": 1.3433143146264494e-06, + "loss": 0.4287, + "step": 3230 + }, + { + "epoch": 1.37, + "grad_norm": 0.5722146410788806, + "learning_rate": 1.3416489052550019e-06, + "loss": 0.4006, + "step": 3231 + }, + { + "epoch": 1.37, + "grad_norm": 0.575347179976502, + "learning_rate": 1.3399841501993056e-06, + "loss": 0.4116, + "step": 3232 + }, + { + "epoch": 1.37, + "grad_norm": 0.5949791657298646, + "learning_rate": 1.338320050399727e-06, + "loss": 0.4303, + "step": 3233 + }, + { + "epoch": 1.37, + "grad_norm": 0.5794211854463649, + "learning_rate": 1.3366566067962628e-06, + "loss": 0.4124, + "step": 3234 + }, + { + "epoch": 1.37, + "grad_norm": 0.5793797643264423, + "learning_rate": 1.3349938203285412e-06, + "loss": 0.4275, + "step": 3235 + }, + { + "epoch": 1.37, + "grad_norm": 0.5842493222488487, + "learning_rate": 1.3333316919358159e-06, + "loss": 0.4209, + "step": 3236 + }, + { + "epoch": 1.37, + "grad_norm": 0.5663991497625657, + "learning_rate": 1.3316702225569708e-06, + "loss": 0.3918, + "step": 3237 + }, + { + "epoch": 1.37, + "grad_norm": 0.5486915223283709, + "learning_rate": 1.3300094131305196e-06, + "loss": 0.4073, + "step": 3238 + }, + { + "epoch": 1.37, + "grad_norm": 0.5350475982264186, + "learning_rate": 1.3283492645945966e-06, + "loss": 0.3985, + "step": 3239 + }, + { + "epoch": 1.37, + "grad_norm": 0.5886624292798438, + "learning_rate": 1.3266897778869704e-06, + "loss": 0.393, + "step": 3240 + }, + { + "epoch": 1.37, + "grad_norm": 0.5755756067045215, + "learning_rate": 1.3250309539450298e-06, + "loss": 0.4184, + "step": 3241 + }, + { + "epoch": 1.37, + "grad_norm": 0.5677276610035034, + "learning_rate": 1.3233727937057938e-06, + "loss": 0.4299, + "step": 3242 + }, + { + "epoch": 1.37, + "grad_norm": 0.5690350952201221, + "learning_rate": 1.3217152981059043e-06, + "loss": 0.4292, + "step": 3243 + }, + { + "epoch": 1.37, + "grad_norm": 0.5739165525454375, + "learning_rate": 1.320058468081627e-06, + "loss": 0.4076, + "step": 3244 + }, + { + "epoch": 1.37, + "grad_norm": 0.5613813103721713, + "learning_rate": 1.3184023045688515e-06, + "loss": 0.4214, + "step": 3245 + }, + { + "epoch": 1.37, + "grad_norm": 0.5954222168994903, + "learning_rate": 1.3167468085030948e-06, + "loss": 0.4513, + "step": 3246 + }, + { + "epoch": 1.37, + "grad_norm": 0.5502927352970999, + "learning_rate": 1.3150919808194917e-06, + "loss": 0.4156, + "step": 3247 + }, + { + "epoch": 1.37, + "grad_norm": 0.5608089205143509, + "learning_rate": 1.3134378224528026e-06, + "loss": 0.416, + "step": 3248 + }, + { + "epoch": 1.37, + "grad_norm": 0.5536001064152207, + "learning_rate": 1.311784334337411e-06, + "loss": 0.4126, + "step": 3249 + }, + { + "epoch": 1.37, + "grad_norm": 0.5766677004057259, + "learning_rate": 1.3101315174073162e-06, + "loss": 0.4304, + "step": 3250 + }, + { + "epoch": 1.37, + "grad_norm": 0.5776009437096029, + "learning_rate": 1.3084793725961447e-06, + "loss": 0.3962, + "step": 3251 + }, + { + "epoch": 1.38, + "grad_norm": 0.5479613505888217, + "learning_rate": 1.3068279008371387e-06, + "loss": 0.3994, + "step": 3252 + }, + { + "epoch": 1.38, + "grad_norm": 0.5690887730011218, + "learning_rate": 1.3051771030631644e-06, + "loss": 0.4088, + "step": 3253 + }, + { + "epoch": 1.38, + "grad_norm": 0.5799412277505956, + "learning_rate": 1.303526980206704e-06, + "loss": 0.422, + "step": 3254 + }, + { + "epoch": 1.38, + "grad_norm": 0.5570959933884828, + "learning_rate": 1.301877533199859e-06, + "loss": 0.4104, + "step": 3255 + }, + { + "epoch": 1.38, + "grad_norm": 0.5643212278599219, + "learning_rate": 1.3002287629743488e-06, + "loss": 0.4161, + "step": 3256 + }, + { + "epoch": 1.38, + "grad_norm": 0.5618675409375589, + "learning_rate": 1.2985806704615139e-06, + "loss": 0.4352, + "step": 3257 + }, + { + "epoch": 1.38, + "grad_norm": 0.5795363521047754, + "learning_rate": 1.2969332565923068e-06, + "loss": 0.4149, + "step": 3258 + }, + { + "epoch": 1.38, + "grad_norm": 0.5607134789936746, + "learning_rate": 1.2952865222973015e-06, + "loss": 0.3938, + "step": 3259 + }, + { + "epoch": 1.38, + "grad_norm": 0.5774563064680311, + "learning_rate": 1.2936404685066852e-06, + "loss": 0.4031, + "step": 3260 + }, + { + "epoch": 1.38, + "grad_norm": 0.5784824116412987, + "learning_rate": 1.2919950961502603e-06, + "loss": 0.4175, + "step": 3261 + }, + { + "epoch": 1.38, + "grad_norm": 0.5685744391843301, + "learning_rate": 1.2903504061574467e-06, + "loss": 0.4102, + "step": 3262 + }, + { + "epoch": 1.38, + "grad_norm": 0.6133989126312784, + "learning_rate": 1.2887063994572765e-06, + "loss": 0.4397, + "step": 3263 + }, + { + "epoch": 1.38, + "grad_norm": 0.5576015312128622, + "learning_rate": 1.2870630769783985e-06, + "loss": 0.4254, + "step": 3264 + }, + { + "epoch": 1.38, + "grad_norm": 0.5854336579154686, + "learning_rate": 1.2854204396490722e-06, + "loss": 0.425, + "step": 3265 + }, + { + "epoch": 1.38, + "grad_norm": 0.5851979998505886, + "learning_rate": 1.2837784883971716e-06, + "loss": 0.4158, + "step": 3266 + }, + { + "epoch": 1.38, + "eval_loss": 0.4578353464603424, + "eval_runtime": 6934.1495, + "eval_samples_per_second": 41.878, + "eval_steps_per_second": 2.094, + "step": 3266 + }, + { + "epoch": 1.38, + "grad_norm": 0.5605246475286948, + "learning_rate": 1.2821372241501814e-06, + "loss": 0.4197, + "step": 3267 + }, + { + "epoch": 1.38, + "grad_norm": 0.5907220198204197, + "learning_rate": 1.280496647835202e-06, + "loss": 0.4239, + "step": 3268 + }, + { + "epoch": 1.38, + "grad_norm": 0.5543362298552139, + "learning_rate": 1.278856760378941e-06, + "loss": 0.4027, + "step": 3269 + }, + { + "epoch": 1.38, + "grad_norm": 0.5824324760965199, + "learning_rate": 1.2772175627077204e-06, + "loss": 0.3936, + "step": 3270 + }, + { + "epoch": 1.38, + "grad_norm": 0.5596093376012918, + "learning_rate": 1.27557905574747e-06, + "loss": 0.3892, + "step": 3271 + }, + { + "epoch": 1.38, + "grad_norm": 0.5847030750956056, + "learning_rate": 1.2739412404237306e-06, + "loss": 0.4517, + "step": 3272 + }, + { + "epoch": 1.38, + "grad_norm": 0.5532504862482353, + "learning_rate": 1.2723041176616508e-06, + "loss": 0.4178, + "step": 3273 + }, + { + "epoch": 1.38, + "grad_norm": 0.5842377189415942, + "learning_rate": 1.2706676883859902e-06, + "loss": 0.4135, + "step": 3274 + }, + { + "epoch": 1.39, + "grad_norm": 0.5442392432462477, + "learning_rate": 1.2690319535211171e-06, + "loss": 0.3921, + "step": 3275 + }, + { + "epoch": 1.39, + "grad_norm": 0.5655162202220977, + "learning_rate": 1.2673969139910047e-06, + "loss": 0.4281, + "step": 3276 + } + ], + "logging_steps": 1, + "max_steps": 4680, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 468, + "total_flos": 3429507897753600.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}