diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.976, + "eval_steps": 125, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002, + "grad_norm": 0.06923668831586838, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.4175, + "step": 1 + }, + { + "epoch": 0.002, + "eval_loss": 0.4618559181690216, + "eval_runtime": 137.9356, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.5, + "step": 1 + }, + { + "epoch": 0.004, + "grad_norm": 0.09036832302808762, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5159, + "step": 2 + }, + { + "epoch": 0.006, + "grad_norm": 0.06212183088064194, + "learning_rate": 3e-06, + "loss": 0.3274, + "step": 3 + }, + { + "epoch": 0.008, + "grad_norm": 0.089068204164505, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5353, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.08060520887374878, + "learning_rate": 5e-06, + "loss": 0.5229, + "step": 5 + }, + { + "epoch": 0.012, + "grad_norm": 0.08129512518644333, + "learning_rate": 6e-06, + "loss": 0.416, + "step": 6 + }, + { + "epoch": 0.014, + "grad_norm": 0.13881395757198334, + "learning_rate": 7e-06, + "loss": 0.4797, + "step": 7 + }, + { + "epoch": 0.016, + "grad_norm": 0.09156442433595657, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4808, + "step": 8 + }, + { + "epoch": 0.018, + "grad_norm": 0.09145132452249527, + "learning_rate": 9e-06, + "loss": 0.4991, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.08622220903635025, + "learning_rate": 1e-05, + "loss": 0.484, + "step": 10 + }, + { + "epoch": 0.022, + "grad_norm": 0.07630373537540436, + "learning_rate": 9.999974825027756e-06, + "loss": 0.3951, + "step": 11 + }, + { + "epoch": 0.024, + "grad_norm": 0.06840338557958603, + "learning_rate": 9.999899300364534e-06, + "loss": 0.4058, + "step": 12 + }, + { + "epoch": 0.026, + "grad_norm": 0.09991295635700226, + "learning_rate": 9.999773426770864e-06, + "loss": 0.5737, + "step": 13 + }, + { + "epoch": 0.028, + "grad_norm": 0.09987013041973114, + "learning_rate": 9.999597205514298e-06, + "loss": 0.4535, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.07334341108798981, + "learning_rate": 9.999370638369377e-06, + "loss": 0.4047, + "step": 15 + }, + { + "epoch": 0.032, + "grad_norm": 0.10504010319709778, + "learning_rate": 9.99909372761763e-06, + "loss": 0.4587, + "step": 16 + }, + { + "epoch": 0.034, + "grad_norm": 0.12481511384248734, + "learning_rate": 9.998766476047546e-06, + "loss": 0.5568, + "step": 17 + }, + { + "epoch": 0.036, + "grad_norm": 0.10193619877099991, + "learning_rate": 9.998388886954546e-06, + "loss": 0.58, + "step": 18 + }, + { + "epoch": 0.038, + "grad_norm": 0.09747433662414551, + "learning_rate": 9.997960964140946e-06, + "loss": 0.4248, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.10985693335533142, + "learning_rate": 9.997482711915926e-06, + "loss": 0.5813, + "step": 20 + }, + { + "epoch": 0.042, + "grad_norm": 0.08061390370130539, + "learning_rate": 9.99695413509548e-06, + "loss": 0.3419, + "step": 21 + }, + { + "epoch": 0.044, + "grad_norm": 0.09820478409528732, + "learning_rate": 9.99637523900237e-06, + "loss": 0.336, + "step": 22 + }, + { + "epoch": 0.046, + "grad_norm": 0.11657540500164032, + "learning_rate": 9.995746029466071e-06, + "loss": 0.4634, + "step": 23 + }, + { + "epoch": 0.048, + "grad_norm": 0.0904548391699791, + "learning_rate": 9.99506651282272e-06, + "loss": 0.4085, + "step": 24 + }, + { + "epoch": 0.05, + "grad_norm": 0.1137523204088211, + "learning_rate": 9.994336695915041e-06, + "loss": 0.6002, + "step": 25 + }, + { + "epoch": 0.052, + "grad_norm": 0.08930382132530212, + "learning_rate": 9.993556586092281e-06, + "loss": 0.4007, + "step": 26 + }, + { + "epoch": 0.054, + "grad_norm": 0.10268951207399368, + "learning_rate": 9.992726191210139e-06, + "loss": 0.5762, + "step": 27 + }, + { + "epoch": 0.056, + "grad_norm": 0.11000809073448181, + "learning_rate": 9.991845519630679e-06, + "loss": 0.5878, + "step": 28 + }, + { + "epoch": 0.058, + "grad_norm": 0.08394967019557953, + "learning_rate": 9.990914580222258e-06, + "loss": 0.4447, + "step": 29 + }, + { + "epoch": 0.06, + "grad_norm": 0.10849784314632416, + "learning_rate": 9.989933382359423e-06, + "loss": 0.6129, + "step": 30 + }, + { + "epoch": 0.062, + "grad_norm": 0.09749893844127655, + "learning_rate": 9.988901935922826e-06, + "loss": 0.4993, + "step": 31 + }, + { + "epoch": 0.064, + "grad_norm": 0.09867393970489502, + "learning_rate": 9.987820251299121e-06, + "loss": 0.4415, + "step": 32 + }, + { + "epoch": 0.066, + "grad_norm": 0.07566885650157928, + "learning_rate": 9.986688339380863e-06, + "loss": 0.3669, + "step": 33 + }, + { + "epoch": 0.068, + "grad_norm": 0.08246949315071106, + "learning_rate": 9.985506211566388e-06, + "loss": 0.4102, + "step": 34 + }, + { + "epoch": 0.07, + "grad_norm": 0.10148797929286957, + "learning_rate": 9.984273879759713e-06, + "loss": 0.5327, + "step": 35 + }, + { + "epoch": 0.072, + "grad_norm": 0.08779735118150711, + "learning_rate": 9.982991356370404e-06, + "loss": 0.4914, + "step": 36 + }, + { + "epoch": 0.074, + "grad_norm": 0.09165964275598526, + "learning_rate": 9.981658654313458e-06, + "loss": 0.4136, + "step": 37 + }, + { + "epoch": 0.076, + "grad_norm": 0.10425784438848495, + "learning_rate": 9.98027578700917e-06, + "loss": 0.6063, + "step": 38 + }, + { + "epoch": 0.078, + "grad_norm": 0.09124460816383362, + "learning_rate": 9.978842768382999e-06, + "loss": 0.5461, + "step": 39 + }, + { + "epoch": 0.08, + "grad_norm": 0.0863451436161995, + "learning_rate": 9.977359612865424e-06, + "loss": 0.5108, + "step": 40 + }, + { + "epoch": 0.082, + "grad_norm": 0.11560487747192383, + "learning_rate": 9.975826335391808e-06, + "loss": 0.4965, + "step": 41 + }, + { + "epoch": 0.084, + "grad_norm": 0.1319773942232132, + "learning_rate": 9.974242951402236e-06, + "loss": 0.4754, + "step": 42 + }, + { + "epoch": 0.086, + "grad_norm": 0.08868485689163208, + "learning_rate": 9.972609476841368e-06, + "loss": 0.4958, + "step": 43 + }, + { + "epoch": 0.088, + "grad_norm": 0.12390384823083878, + "learning_rate": 9.970925928158275e-06, + "loss": 0.5641, + "step": 44 + }, + { + "epoch": 0.09, + "grad_norm": 0.095445416867733, + "learning_rate": 9.969192322306271e-06, + "loss": 0.5145, + "step": 45 + }, + { + "epoch": 0.092, + "grad_norm": 0.09656377136707306, + "learning_rate": 9.96740867674275e-06, + "loss": 0.3749, + "step": 46 + }, + { + "epoch": 0.094, + "grad_norm": 0.07841179519891739, + "learning_rate": 9.965575009429006e-06, + "loss": 0.4113, + "step": 47 + }, + { + "epoch": 0.096, + "grad_norm": 0.07786890119314194, + "learning_rate": 9.963691338830045e-06, + "loss": 0.4374, + "step": 48 + }, + { + "epoch": 0.098, + "grad_norm": 0.09050661325454712, + "learning_rate": 9.961757683914406e-06, + "loss": 0.5285, + "step": 49 + }, + { + "epoch": 0.1, + "grad_norm": 0.11070208251476288, + "learning_rate": 9.959774064153977e-06, + "loss": 0.5326, + "step": 50 + }, + { + "epoch": 0.102, + "grad_norm": 0.09067952632904053, + "learning_rate": 9.957740499523787e-06, + "loss": 0.5613, + "step": 51 + }, + { + "epoch": 0.104, + "grad_norm": 0.08883544057607651, + "learning_rate": 9.955657010501807e-06, + "loss": 0.4599, + "step": 52 + }, + { + "epoch": 0.106, + "grad_norm": 0.10251513868570328, + "learning_rate": 9.95352361806875e-06, + "loss": 0.5354, + "step": 53 + }, + { + "epoch": 0.108, + "grad_norm": 0.07133735716342926, + "learning_rate": 9.951340343707852e-06, + "loss": 0.3696, + "step": 54 + }, + { + "epoch": 0.11, + "grad_norm": 0.061642151325941086, + "learning_rate": 9.949107209404664e-06, + "loss": 0.3472, + "step": 55 + }, + { + "epoch": 0.112, + "grad_norm": 0.08950634300708771, + "learning_rate": 9.946824237646823e-06, + "loss": 0.4969, + "step": 56 + }, + { + "epoch": 0.114, + "grad_norm": 0.08016358315944672, + "learning_rate": 9.944491451423829e-06, + "loss": 0.5239, + "step": 57 + }, + { + "epoch": 0.116, + "grad_norm": 0.12512832880020142, + "learning_rate": 9.942108874226812e-06, + "loss": 0.5365, + "step": 58 + }, + { + "epoch": 0.118, + "grad_norm": 0.09220532327890396, + "learning_rate": 9.9396765300483e-06, + "loss": 0.4783, + "step": 59 + }, + { + "epoch": 0.12, + "grad_norm": 0.0885612890124321, + "learning_rate": 9.937194443381972e-06, + "loss": 0.5459, + "step": 60 + }, + { + "epoch": 0.122, + "grad_norm": 0.08592379838228226, + "learning_rate": 9.934662639222412e-06, + "loss": 0.4545, + "step": 61 + }, + { + "epoch": 0.124, + "grad_norm": 0.08418423682451248, + "learning_rate": 9.93208114306486e-06, + "loss": 0.5105, + "step": 62 + }, + { + "epoch": 0.126, + "grad_norm": 0.07870952039957047, + "learning_rate": 9.929449980904952e-06, + "loss": 0.4593, + "step": 63 + }, + { + "epoch": 0.128, + "grad_norm": 0.08841884881258011, + "learning_rate": 9.926769179238467e-06, + "loss": 0.4812, + "step": 64 + }, + { + "epoch": 0.13, + "grad_norm": 0.07493194192647934, + "learning_rate": 9.924038765061042e-06, + "loss": 0.5065, + "step": 65 + }, + { + "epoch": 0.132, + "grad_norm": 0.08470446616411209, + "learning_rate": 9.921258765867919e-06, + "loss": 0.4676, + "step": 66 + }, + { + "epoch": 0.134, + "grad_norm": 0.0656595379114151, + "learning_rate": 9.918429209653662e-06, + "loss": 0.3227, + "step": 67 + }, + { + "epoch": 0.136, + "grad_norm": 0.06501025706529617, + "learning_rate": 9.915550124911866e-06, + "loss": 0.2777, + "step": 68 + }, + { + "epoch": 0.138, + "grad_norm": 0.08443128317594528, + "learning_rate": 9.912621540634889e-06, + "loss": 0.4357, + "step": 69 + }, + { + "epoch": 0.14, + "grad_norm": 0.07121642678976059, + "learning_rate": 9.909643486313533e-06, + "loss": 0.3545, + "step": 70 + }, + { + "epoch": 0.142, + "grad_norm": 0.09408602863550186, + "learning_rate": 9.906615991936781e-06, + "loss": 0.3916, + "step": 71 + }, + { + "epoch": 0.144, + "grad_norm": 0.05998094752430916, + "learning_rate": 9.903539087991462e-06, + "loss": 0.2739, + "step": 72 + }, + { + "epoch": 0.146, + "grad_norm": 0.08949826657772064, + "learning_rate": 9.900412805461968e-06, + "loss": 0.3722, + "step": 73 + }, + { + "epoch": 0.148, + "grad_norm": 0.0731697678565979, + "learning_rate": 9.897237175829927e-06, + "loss": 0.2906, + "step": 74 + }, + { + "epoch": 0.15, + "grad_norm": 0.07855986058712006, + "learning_rate": 9.894012231073895e-06, + "loss": 0.4149, + "step": 75 + }, + { + "epoch": 0.152, + "grad_norm": 0.0791892409324646, + "learning_rate": 9.890738003669029e-06, + "loss": 0.4383, + "step": 76 + }, + { + "epoch": 0.154, + "grad_norm": 0.07980603724718094, + "learning_rate": 9.887414526586764e-06, + "loss": 0.4867, + "step": 77 + }, + { + "epoch": 0.156, + "grad_norm": 0.08503536880016327, + "learning_rate": 9.884041833294477e-06, + "loss": 0.4644, + "step": 78 + }, + { + "epoch": 0.158, + "grad_norm": 0.09240555018186569, + "learning_rate": 9.880619957755151e-06, + "loss": 0.3107, + "step": 79 + }, + { + "epoch": 0.16, + "grad_norm": 0.08195238560438156, + "learning_rate": 9.877148934427037e-06, + "loss": 0.3414, + "step": 80 + }, + { + "epoch": 0.162, + "grad_norm": 0.09512759745121002, + "learning_rate": 9.873628798263297e-06, + "loss": 0.4745, + "step": 81 + }, + { + "epoch": 0.164, + "grad_norm": 0.07976000756025314, + "learning_rate": 9.870059584711668e-06, + "loss": 0.3925, + "step": 82 + }, + { + "epoch": 0.166, + "grad_norm": 0.11229317635297775, + "learning_rate": 9.86644132971409e-06, + "loss": 0.4921, + "step": 83 + }, + { + "epoch": 0.168, + "grad_norm": 0.07479218393564224, + "learning_rate": 9.862774069706346e-06, + "loss": 0.3607, + "step": 84 + }, + { + "epoch": 0.17, + "grad_norm": 0.08530927449464798, + "learning_rate": 9.859057841617709e-06, + "loss": 0.4116, + "step": 85 + }, + { + "epoch": 0.172, + "grad_norm": 0.05544688552618027, + "learning_rate": 9.855292682870552e-06, + "loss": 0.2043, + "step": 86 + }, + { + "epoch": 0.174, + "grad_norm": 0.08539939671754837, + "learning_rate": 9.851478631379982e-06, + "loss": 0.4437, + "step": 87 + }, + { + "epoch": 0.176, + "grad_norm": 0.08732863515615463, + "learning_rate": 9.847615725553457e-06, + "loss": 0.4449, + "step": 88 + }, + { + "epoch": 0.178, + "grad_norm": 0.08848625421524048, + "learning_rate": 9.843704004290393e-06, + "loss": 0.5191, + "step": 89 + }, + { + "epoch": 0.18, + "grad_norm": 0.1142885684967041, + "learning_rate": 9.839743506981783e-06, + "loss": 0.3788, + "step": 90 + }, + { + "epoch": 0.182, + "grad_norm": 0.0678037703037262, + "learning_rate": 9.835734273509787e-06, + "loss": 0.3655, + "step": 91 + }, + { + "epoch": 0.184, + "grad_norm": 0.08179458975791931, + "learning_rate": 9.831676344247343e-06, + "loss": 0.4804, + "step": 92 + }, + { + "epoch": 0.186, + "grad_norm": 0.10821828246116638, + "learning_rate": 9.827569760057755e-06, + "loss": 0.4946, + "step": 93 + }, + { + "epoch": 0.188, + "grad_norm": 0.06980521976947784, + "learning_rate": 9.82341456229428e-06, + "loss": 0.3301, + "step": 94 + }, + { + "epoch": 0.19, + "grad_norm": 0.07966768741607666, + "learning_rate": 9.819210792799711e-06, + "loss": 0.4377, + "step": 95 + }, + { + "epoch": 0.192, + "grad_norm": 0.08750802278518677, + "learning_rate": 9.814958493905962e-06, + "loss": 0.4137, + "step": 96 + }, + { + "epoch": 0.194, + "grad_norm": 0.08171187341213226, + "learning_rate": 9.810657708433637e-06, + "loss": 0.5154, + "step": 97 + }, + { + "epoch": 0.196, + "grad_norm": 0.07627864181995392, + "learning_rate": 9.806308479691595e-06, + "loss": 0.3593, + "step": 98 + }, + { + "epoch": 0.198, + "grad_norm": 0.07038850337266922, + "learning_rate": 9.801910851476524e-06, + "loss": 0.3882, + "step": 99 + }, + { + "epoch": 0.2, + "grad_norm": 0.09910848736763, + "learning_rate": 9.797464868072489e-06, + "loss": 0.5034, + "step": 100 + }, + { + "epoch": 0.202, + "grad_norm": 0.08382704854011536, + "learning_rate": 9.792970574250493e-06, + "loss": 0.4769, + "step": 101 + }, + { + "epoch": 0.204, + "grad_norm": 0.07511335611343384, + "learning_rate": 9.788428015268027e-06, + "loss": 0.3703, + "step": 102 + }, + { + "epoch": 0.206, + "grad_norm": 0.08155877888202667, + "learning_rate": 9.78383723686861e-06, + "loss": 0.4102, + "step": 103 + }, + { + "epoch": 0.208, + "grad_norm": 0.06436574459075928, + "learning_rate": 9.779198285281326e-06, + "loss": 0.3253, + "step": 104 + }, + { + "epoch": 0.21, + "grad_norm": 0.06901544332504272, + "learning_rate": 9.774511207220369e-06, + "loss": 0.2842, + "step": 105 + }, + { + "epoch": 0.212, + "grad_norm": 0.08444689959287643, + "learning_rate": 9.769776049884564e-06, + "loss": 0.4212, + "step": 106 + }, + { + "epoch": 0.214, + "grad_norm": 0.08550014346837997, + "learning_rate": 9.76499286095689e-06, + "loss": 0.4404, + "step": 107 + }, + { + "epoch": 0.216, + "grad_norm": 0.09659305214881897, + "learning_rate": 9.760161688604008e-06, + "loss": 0.5841, + "step": 108 + }, + { + "epoch": 0.218, + "grad_norm": 0.06201549619436264, + "learning_rate": 9.755282581475769e-06, + "loss": 0.2246, + "step": 109 + }, + { + "epoch": 0.22, + "grad_norm": 0.07813581079244614, + "learning_rate": 9.750355588704728e-06, + "loss": 0.4415, + "step": 110 + }, + { + "epoch": 0.222, + "grad_norm": 0.10021974891424179, + "learning_rate": 9.745380759905648e-06, + "loss": 0.3042, + "step": 111 + }, + { + "epoch": 0.224, + "grad_norm": 0.10321412235498428, + "learning_rate": 9.740358145174999e-06, + "loss": 0.4837, + "step": 112 + }, + { + "epoch": 0.226, + "grad_norm": 0.11536537110805511, + "learning_rate": 9.735287795090455e-06, + "loss": 0.5586, + "step": 113 + }, + { + "epoch": 0.228, + "grad_norm": 0.07521039247512817, + "learning_rate": 9.730169760710385e-06, + "loss": 0.361, + "step": 114 + }, + { + "epoch": 0.23, + "grad_norm": 0.07128458470106125, + "learning_rate": 9.725004093573343e-06, + "loss": 0.3511, + "step": 115 + }, + { + "epoch": 0.232, + "grad_norm": 0.08504608273506165, + "learning_rate": 9.719790845697534e-06, + "loss": 0.4472, + "step": 116 + }, + { + "epoch": 0.234, + "grad_norm": 0.08541107177734375, + "learning_rate": 9.71453006958031e-06, + "loss": 0.3195, + "step": 117 + }, + { + "epoch": 0.236, + "grad_norm": 0.085638627409935, + "learning_rate": 9.709221818197626e-06, + "loss": 0.4343, + "step": 118 + }, + { + "epoch": 0.238, + "grad_norm": 0.06405656784772873, + "learning_rate": 9.703866145003512e-06, + "loss": 0.2905, + "step": 119 + }, + { + "epoch": 0.24, + "grad_norm": 0.12191811949014664, + "learning_rate": 9.698463103929542e-06, + "loss": 0.4092, + "step": 120 + }, + { + "epoch": 0.242, + "grad_norm": 0.08051154762506485, + "learning_rate": 9.69301274938428e-06, + "loss": 0.3362, + "step": 121 + }, + { + "epoch": 0.244, + "grad_norm": 0.09473302215337753, + "learning_rate": 9.687515136252732e-06, + "loss": 0.3941, + "step": 122 + }, + { + "epoch": 0.246, + "grad_norm": 0.09992998838424683, + "learning_rate": 9.681970319895804e-06, + "loss": 0.4603, + "step": 123 + }, + { + "epoch": 0.248, + "grad_norm": 0.08887780457735062, + "learning_rate": 9.676378356149733e-06, + "loss": 0.3082, + "step": 124 + }, + { + "epoch": 0.25, + "grad_norm": 0.08823645859956741, + "learning_rate": 9.670739301325534e-06, + "loss": 0.4301, + "step": 125 + }, + { + "epoch": 0.25, + "eval_loss": 0.3706146478652954, + "eval_runtime": 76.5201, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 0.902, + "step": 125 + }, + { + "epoch": 0.252, + "grad_norm": 0.10688935965299606, + "learning_rate": 9.665053212208426e-06, + "loss": 0.3065, + "step": 126 + }, + { + "epoch": 0.254, + "grad_norm": 0.09517981857061386, + "learning_rate": 9.659320146057263e-06, + "loss": 0.5437, + "step": 127 + }, + { + "epoch": 0.256, + "grad_norm": 0.11310486495494843, + "learning_rate": 9.653540160603956e-06, + "loss": 0.6087, + "step": 128 + }, + { + "epoch": 0.258, + "grad_norm": 0.08851969987154007, + "learning_rate": 9.647713314052896e-06, + "loss": 0.3598, + "step": 129 + }, + { + "epoch": 0.26, + "grad_norm": 0.09503145515918732, + "learning_rate": 9.641839665080363e-06, + "loss": 0.338, + "step": 130 + }, + { + "epoch": 0.262, + "grad_norm": 0.09553948044776917, + "learning_rate": 9.635919272833938e-06, + "loss": 0.3801, + "step": 131 + }, + { + "epoch": 0.264, + "grad_norm": 0.09811339527368546, + "learning_rate": 9.629952196931902e-06, + "loss": 0.3866, + "step": 132 + }, + { + "epoch": 0.266, + "grad_norm": 0.0865439921617508, + "learning_rate": 9.623938497462647e-06, + "loss": 0.4466, + "step": 133 + }, + { + "epoch": 0.268, + "grad_norm": 0.09298735857009888, + "learning_rate": 9.617878234984056e-06, + "loss": 0.4413, + "step": 134 + }, + { + "epoch": 0.27, + "grad_norm": 0.10931612551212311, + "learning_rate": 9.611771470522908e-06, + "loss": 0.3974, + "step": 135 + }, + { + "epoch": 0.272, + "grad_norm": 0.08798681199550629, + "learning_rate": 9.60561826557425e-06, + "loss": 0.4052, + "step": 136 + }, + { + "epoch": 0.274, + "grad_norm": 0.09892652928829193, + "learning_rate": 9.599418682100793e-06, + "loss": 0.4645, + "step": 137 + }, + { + "epoch": 0.276, + "grad_norm": 0.10193604230880737, + "learning_rate": 9.59317278253227e-06, + "loss": 0.4064, + "step": 138 + }, + { + "epoch": 0.278, + "grad_norm": 0.07900392264127731, + "learning_rate": 9.586880629764817e-06, + "loss": 0.3229, + "step": 139 + }, + { + "epoch": 0.28, + "grad_norm": 0.08284664154052734, + "learning_rate": 9.580542287160348e-06, + "loss": 0.3703, + "step": 140 + }, + { + "epoch": 0.282, + "grad_norm": 0.08164459466934204, + "learning_rate": 9.574157818545902e-06, + "loss": 0.2879, + "step": 141 + }, + { + "epoch": 0.284, + "grad_norm": 0.1115422248840332, + "learning_rate": 9.567727288213005e-06, + "loss": 0.4593, + "step": 142 + }, + { + "epoch": 0.286, + "grad_norm": 0.09770838916301727, + "learning_rate": 9.561250760917026e-06, + "loss": 0.4133, + "step": 143 + }, + { + "epoch": 0.288, + "grad_norm": 0.12189961224794388, + "learning_rate": 9.554728301876525e-06, + "loss": 0.5928, + "step": 144 + }, + { + "epoch": 0.29, + "grad_norm": 0.14093732833862305, + "learning_rate": 9.548159976772593e-06, + "loss": 0.415, + "step": 145 + }, + { + "epoch": 0.292, + "grad_norm": 0.11479732394218445, + "learning_rate": 9.541545851748186e-06, + "loss": 0.3691, + "step": 146 + }, + { + "epoch": 0.294, + "grad_norm": 0.09249378740787506, + "learning_rate": 9.534885993407474e-06, + "loss": 0.3394, + "step": 147 + }, + { + "epoch": 0.296, + "grad_norm": 0.10194878280162811, + "learning_rate": 9.528180468815155e-06, + "loss": 0.3745, + "step": 148 + }, + { + "epoch": 0.298, + "grad_norm": 0.09345925599336624, + "learning_rate": 9.521429345495787e-06, + "loss": 0.3934, + "step": 149 + }, + { + "epoch": 0.3, + "grad_norm": 0.09919178485870361, + "learning_rate": 9.514632691433108e-06, + "loss": 0.4053, + "step": 150 + }, + { + "epoch": 0.302, + "grad_norm": 0.10807909071445465, + "learning_rate": 9.507790575069347e-06, + "loss": 0.4631, + "step": 151 + }, + { + "epoch": 0.304, + "grad_norm": 0.10555636882781982, + "learning_rate": 9.50090306530454e-06, + "loss": 0.4952, + "step": 152 + }, + { + "epoch": 0.306, + "grad_norm": 0.10507559776306152, + "learning_rate": 9.493970231495836e-06, + "loss": 0.294, + "step": 153 + }, + { + "epoch": 0.308, + "grad_norm": 0.08718883246183395, + "learning_rate": 9.486992143456792e-06, + "loss": 0.3044, + "step": 154 + }, + { + "epoch": 0.31, + "grad_norm": 0.10039477050304413, + "learning_rate": 9.47996887145668e-06, + "loss": 0.3736, + "step": 155 + }, + { + "epoch": 0.312, + "grad_norm": 0.09952064603567123, + "learning_rate": 9.47290048621977e-06, + "loss": 0.4359, + "step": 156 + }, + { + "epoch": 0.314, + "grad_norm": 0.10663799196481705, + "learning_rate": 9.46578705892462e-06, + "loss": 0.3939, + "step": 157 + }, + { + "epoch": 0.316, + "grad_norm": 0.10759017616510391, + "learning_rate": 9.458628661203368e-06, + "loss": 0.4575, + "step": 158 + }, + { + "epoch": 0.318, + "grad_norm": 0.08924371749162674, + "learning_rate": 9.451425365140997e-06, + "loss": 0.3525, + "step": 159 + }, + { + "epoch": 0.32, + "grad_norm": 0.13670168817043304, + "learning_rate": 9.444177243274619e-06, + "loss": 0.5385, + "step": 160 + }, + { + "epoch": 0.322, + "grad_norm": 0.10520858317613602, + "learning_rate": 9.43688436859274e-06, + "loss": 0.2964, + "step": 161 + }, + { + "epoch": 0.324, + "grad_norm": 0.10608810931444168, + "learning_rate": 9.429546814534528e-06, + "loss": 0.4369, + "step": 162 + }, + { + "epoch": 0.326, + "grad_norm": 0.08399061113595963, + "learning_rate": 9.422164654989073e-06, + "loss": 0.3246, + "step": 163 + }, + { + "epoch": 0.328, + "grad_norm": 0.11295214295387268, + "learning_rate": 9.414737964294636e-06, + "loss": 0.4766, + "step": 164 + }, + { + "epoch": 0.33, + "grad_norm": 0.1255977749824524, + "learning_rate": 9.40726681723791e-06, + "loss": 0.5263, + "step": 165 + }, + { + "epoch": 0.332, + "grad_norm": 0.0891086682677269, + "learning_rate": 9.399751289053267e-06, + "loss": 0.2796, + "step": 166 + }, + { + "epoch": 0.334, + "grad_norm": 0.12856395542621613, + "learning_rate": 9.392191455421989e-06, + "loss": 0.4485, + "step": 167 + }, + { + "epoch": 0.336, + "grad_norm": 0.1172974556684494, + "learning_rate": 9.384587392471516e-06, + "loss": 0.542, + "step": 168 + }, + { + "epoch": 0.338, + "grad_norm": 0.08675208687782288, + "learning_rate": 9.376939176774678e-06, + "loss": 0.2899, + "step": 169 + }, + { + "epoch": 0.34, + "grad_norm": 0.11079028248786926, + "learning_rate": 9.369246885348926e-06, + "loss": 0.3732, + "step": 170 + }, + { + "epoch": 0.342, + "grad_norm": 0.12667471170425415, + "learning_rate": 9.361510595655545e-06, + "loss": 0.54, + "step": 171 + }, + { + "epoch": 0.344, + "grad_norm": 0.08692082017660141, + "learning_rate": 9.353730385598887e-06, + "loss": 0.3873, + "step": 172 + }, + { + "epoch": 0.346, + "grad_norm": 0.1013069748878479, + "learning_rate": 9.345906333525582e-06, + "loss": 0.438, + "step": 173 + }, + { + "epoch": 0.348, + "grad_norm": 0.09999188780784607, + "learning_rate": 9.338038518223746e-06, + "loss": 0.4467, + "step": 174 + }, + { + "epoch": 0.35, + "grad_norm": 0.11317498981952667, + "learning_rate": 9.330127018922195e-06, + "loss": 0.3912, + "step": 175 + }, + { + "epoch": 0.352, + "grad_norm": 0.10574603080749512, + "learning_rate": 9.322171915289635e-06, + "loss": 0.3808, + "step": 176 + }, + { + "epoch": 0.354, + "grad_norm": 0.1281527876853943, + "learning_rate": 9.314173287433874e-06, + "loss": 0.423, + "step": 177 + }, + { + "epoch": 0.356, + "grad_norm": 0.12899580597877502, + "learning_rate": 9.306131215901004e-06, + "loss": 0.4509, + "step": 178 + }, + { + "epoch": 0.358, + "grad_norm": 0.10952267050743103, + "learning_rate": 9.298045781674595e-06, + "loss": 0.3512, + "step": 179 + }, + { + "epoch": 0.36, + "grad_norm": 0.1423255354166031, + "learning_rate": 9.289917066174887e-06, + "loss": 0.3631, + "step": 180 + }, + { + "epoch": 0.362, + "grad_norm": 0.13039131462574005, + "learning_rate": 9.281745151257946e-06, + "loss": 0.3762, + "step": 181 + }, + { + "epoch": 0.364, + "grad_norm": 0.10448655486106873, + "learning_rate": 9.273530119214868e-06, + "loss": 0.3694, + "step": 182 + }, + { + "epoch": 0.366, + "grad_norm": 0.0945306122303009, + "learning_rate": 9.265272052770936e-06, + "loss": 0.28, + "step": 183 + }, + { + "epoch": 0.368, + "grad_norm": 0.10995735973119736, + "learning_rate": 9.256971035084786e-06, + "loss": 0.4849, + "step": 184 + }, + { + "epoch": 0.37, + "grad_norm": 0.11014600843191147, + "learning_rate": 9.248627149747573e-06, + "loss": 0.3213, + "step": 185 + }, + { + "epoch": 0.372, + "grad_norm": 0.09283925592899323, + "learning_rate": 9.24024048078213e-06, + "loss": 0.4077, + "step": 186 + }, + { + "epoch": 0.374, + "grad_norm": 0.14395715296268463, + "learning_rate": 9.231811112642121e-06, + "loss": 0.4869, + "step": 187 + }, + { + "epoch": 0.376, + "grad_norm": 0.10785488784313202, + "learning_rate": 9.223339130211194e-06, + "loss": 0.4122, + "step": 188 + }, + { + "epoch": 0.378, + "grad_norm": 0.09983161091804504, + "learning_rate": 9.214824618802108e-06, + "loss": 0.3027, + "step": 189 + }, + { + "epoch": 0.38, + "grad_norm": 0.10121427476406097, + "learning_rate": 9.206267664155906e-06, + "loss": 0.3055, + "step": 190 + }, + { + "epoch": 0.382, + "grad_norm": 0.11393419653177261, + "learning_rate": 9.197668352441025e-06, + "loss": 0.3567, + "step": 191 + }, + { + "epoch": 0.384, + "grad_norm": 0.132842019200325, + "learning_rate": 9.189026770252437e-06, + "loss": 0.3556, + "step": 192 + }, + { + "epoch": 0.386, + "grad_norm": 0.1139449030160904, + "learning_rate": 9.18034300461078e-06, + "loss": 0.4298, + "step": 193 + }, + { + "epoch": 0.388, + "grad_norm": 0.09980877488851547, + "learning_rate": 9.171617142961477e-06, + "loss": 0.3853, + "step": 194 + }, + { + "epoch": 0.39, + "grad_norm": 0.12531818449497223, + "learning_rate": 9.162849273173857e-06, + "loss": 0.4845, + "step": 195 + }, + { + "epoch": 0.392, + "grad_norm": 0.11148197203874588, + "learning_rate": 9.154039483540273e-06, + "loss": 0.4091, + "step": 196 + }, + { + "epoch": 0.394, + "grad_norm": 0.11962081491947174, + "learning_rate": 9.145187862775208e-06, + "loss": 0.371, + "step": 197 + }, + { + "epoch": 0.396, + "grad_norm": 0.10789982974529266, + "learning_rate": 9.136294500014387e-06, + "loss": 0.4268, + "step": 198 + }, + { + "epoch": 0.398, + "grad_norm": 0.15846121311187744, + "learning_rate": 9.12735948481387e-06, + "loss": 0.6264, + "step": 199 + }, + { + "epoch": 0.4, + "grad_norm": 0.1426246613264084, + "learning_rate": 9.118382907149164e-06, + "loss": 0.4769, + "step": 200 + }, + { + "epoch": 0.402, + "grad_norm": 0.1069459393620491, + "learning_rate": 9.109364857414306e-06, + "loss": 0.3708, + "step": 201 + }, + { + "epoch": 0.404, + "grad_norm": 0.10732389986515045, + "learning_rate": 9.100305426420957e-06, + "loss": 0.3962, + "step": 202 + }, + { + "epoch": 0.406, + "grad_norm": 0.1436106562614441, + "learning_rate": 9.091204705397485e-06, + "loss": 0.4549, + "step": 203 + }, + { + "epoch": 0.408, + "grad_norm": 0.10230587422847748, + "learning_rate": 9.08206278598805e-06, + "loss": 0.3926, + "step": 204 + }, + { + "epoch": 0.41, + "grad_norm": 0.11367027461528778, + "learning_rate": 9.07287976025168e-06, + "loss": 0.3378, + "step": 205 + }, + { + "epoch": 0.412, + "grad_norm": 0.14832234382629395, + "learning_rate": 9.06365572066134e-06, + "loss": 0.4202, + "step": 206 + }, + { + "epoch": 0.414, + "grad_norm": 0.10567332804203033, + "learning_rate": 9.05439076010301e-06, + "loss": 0.2904, + "step": 207 + }, + { + "epoch": 0.416, + "grad_norm": 0.11918513476848602, + "learning_rate": 9.045084971874738e-06, + "loss": 0.2632, + "step": 208 + }, + { + "epoch": 0.418, + "grad_norm": 0.13223537802696228, + "learning_rate": 9.035738449685707e-06, + "loss": 0.4208, + "step": 209 + }, + { + "epoch": 0.42, + "grad_norm": 0.12573251128196716, + "learning_rate": 9.026351287655294e-06, + "loss": 0.4609, + "step": 210 + }, + { + "epoch": 0.422, + "grad_norm": 0.11943136155605316, + "learning_rate": 9.016923580312114e-06, + "loss": 0.3323, + "step": 211 + }, + { + "epoch": 0.424, + "grad_norm": 0.13152974843978882, + "learning_rate": 9.007455422593077e-06, + "loss": 0.4258, + "step": 212 + }, + { + "epoch": 0.426, + "grad_norm": 0.13339808583259583, + "learning_rate": 8.997946909842426e-06, + "loss": 0.5303, + "step": 213 + }, + { + "epoch": 0.428, + "grad_norm": 0.11746034771203995, + "learning_rate": 8.988398137810778e-06, + "loss": 0.4109, + "step": 214 + }, + { + "epoch": 0.43, + "grad_norm": 0.11518029868602753, + "learning_rate": 8.978809202654161e-06, + "loss": 0.4154, + "step": 215 + }, + { + "epoch": 0.432, + "grad_norm": 0.15307952463626862, + "learning_rate": 8.969180200933048e-06, + "loss": 0.4196, + "step": 216 + }, + { + "epoch": 0.434, + "grad_norm": 0.11385340988636017, + "learning_rate": 8.959511229611377e-06, + "loss": 0.3713, + "step": 217 + }, + { + "epoch": 0.436, + "grad_norm": 0.1380355805158615, + "learning_rate": 8.949802386055582e-06, + "loss": 0.3891, + "step": 218 + }, + { + "epoch": 0.438, + "grad_norm": 0.09614066779613495, + "learning_rate": 8.94005376803361e-06, + "loss": 0.2527, + "step": 219 + }, + { + "epoch": 0.44, + "grad_norm": 0.12352288514375687, + "learning_rate": 8.930265473713939e-06, + "loss": 0.3737, + "step": 220 + }, + { + "epoch": 0.442, + "grad_norm": 0.18210633099079132, + "learning_rate": 8.92043760166458e-06, + "loss": 0.3839, + "step": 221 + }, + { + "epoch": 0.444, + "grad_norm": 0.1087498739361763, + "learning_rate": 8.910570250852098e-06, + "loss": 0.3141, + "step": 222 + }, + { + "epoch": 0.446, + "grad_norm": 0.11985889822244644, + "learning_rate": 8.900663520640605e-06, + "loss": 0.4606, + "step": 223 + }, + { + "epoch": 0.448, + "grad_norm": 0.146299347281456, + "learning_rate": 8.890717510790763e-06, + "loss": 0.4094, + "step": 224 + }, + { + "epoch": 0.45, + "grad_norm": 0.09788361191749573, + "learning_rate": 8.880732321458785e-06, + "loss": 0.2964, + "step": 225 + }, + { + "epoch": 0.452, + "grad_norm": 0.09735774993896484, + "learning_rate": 8.870708053195414e-06, + "loss": 0.2646, + "step": 226 + }, + { + "epoch": 0.454, + "grad_norm": 0.1293504238128662, + "learning_rate": 8.860644806944917e-06, + "loss": 0.2991, + "step": 227 + }, + { + "epoch": 0.456, + "grad_norm": 0.13126921653747559, + "learning_rate": 8.850542684044078e-06, + "loss": 0.4474, + "step": 228 + }, + { + "epoch": 0.458, + "grad_norm": 0.11488878726959229, + "learning_rate": 8.84040178622116e-06, + "loss": 0.3628, + "step": 229 + }, + { + "epoch": 0.46, + "grad_norm": 0.13861073553562164, + "learning_rate": 8.83022221559489e-06, + "loss": 0.4022, + "step": 230 + }, + { + "epoch": 0.462, + "grad_norm": 0.16164664924144745, + "learning_rate": 8.820004074673433e-06, + "loss": 0.4217, + "step": 231 + }, + { + "epoch": 0.464, + "grad_norm": 0.10550030320882797, + "learning_rate": 8.809747466353356e-06, + "loss": 0.2927, + "step": 232 + }, + { + "epoch": 0.466, + "grad_norm": 0.1035122275352478, + "learning_rate": 8.799452493918586e-06, + "loss": 0.2453, + "step": 233 + }, + { + "epoch": 0.468, + "grad_norm": 0.15530018508434296, + "learning_rate": 8.789119261039385e-06, + "loss": 0.3758, + "step": 234 + }, + { + "epoch": 0.47, + "grad_norm": 0.13951483368873596, + "learning_rate": 8.778747871771293e-06, + "loss": 0.4502, + "step": 235 + }, + { + "epoch": 0.472, + "grad_norm": 0.13241475820541382, + "learning_rate": 8.768338430554083e-06, + "loss": 0.5012, + "step": 236 + }, + { + "epoch": 0.474, + "grad_norm": 0.11370962113142014, + "learning_rate": 8.757891042210713e-06, + "loss": 0.2801, + "step": 237 + }, + { + "epoch": 0.476, + "grad_norm": 0.1501305103302002, + "learning_rate": 8.747405811946272e-06, + "loss": 0.4888, + "step": 238 + }, + { + "epoch": 0.478, + "grad_norm": 0.1636514514684677, + "learning_rate": 8.736882845346906e-06, + "loss": 0.518, + "step": 239 + }, + { + "epoch": 0.48, + "grad_norm": 0.11505798250436783, + "learning_rate": 8.726322248378775e-06, + "loss": 0.2627, + "step": 240 + }, + { + "epoch": 0.482, + "grad_norm": 0.15717971324920654, + "learning_rate": 8.715724127386971e-06, + "loss": 0.3299, + "step": 241 + }, + { + "epoch": 0.484, + "grad_norm": 0.13042742013931274, + "learning_rate": 8.705088589094458e-06, + "loss": 0.351, + "step": 242 + }, + { + "epoch": 0.486, + "grad_norm": 0.1414385885000229, + "learning_rate": 8.69441574060099e-06, + "loss": 0.471, + "step": 243 + }, + { + "epoch": 0.488, + "grad_norm": 0.10110446810722351, + "learning_rate": 8.683705689382025e-06, + "loss": 0.2369, + "step": 244 + }, + { + "epoch": 0.49, + "grad_norm": 0.1549258530139923, + "learning_rate": 8.672958543287666e-06, + "loss": 0.4333, + "step": 245 + }, + { + "epoch": 0.492, + "grad_norm": 0.11834664642810822, + "learning_rate": 8.662174410541556e-06, + "loss": 0.3182, + "step": 246 + }, + { + "epoch": 0.494, + "grad_norm": 0.1529727429151535, + "learning_rate": 8.651353399739787e-06, + "loss": 0.4963, + "step": 247 + }, + { + "epoch": 0.496, + "grad_norm": 0.14854104816913605, + "learning_rate": 8.640495619849821e-06, + "loss": 0.4514, + "step": 248 + }, + { + "epoch": 0.498, + "grad_norm": 0.12271202355623245, + "learning_rate": 8.629601180209382e-06, + "loss": 0.3694, + "step": 249 + }, + { + "epoch": 0.5, + "grad_norm": 0.11352905631065369, + "learning_rate": 8.61867019052535e-06, + "loss": 0.2978, + "step": 250 + }, + { + "epoch": 0.5, + "eval_loss": 0.32808247208595276, + "eval_runtime": 76.51, + "eval_samples_per_second": 7.215, + "eval_steps_per_second": 0.902, + "step": 250 + }, + { + "epoch": 0.502, + "grad_norm": 0.1511523425579071, + "learning_rate": 8.607702760872679e-06, + "loss": 0.4037, + "step": 251 + }, + { + "epoch": 0.504, + "grad_norm": 0.13344620168209076, + "learning_rate": 8.596699001693257e-06, + "loss": 0.2303, + "step": 252 + }, + { + "epoch": 0.506, + "grad_norm": 0.12220989167690277, + "learning_rate": 8.585659023794818e-06, + "loss": 0.4347, + "step": 253 + }, + { + "epoch": 0.508, + "grad_norm": 0.1094481498003006, + "learning_rate": 8.574582938349818e-06, + "loss": 0.3089, + "step": 254 + }, + { + "epoch": 0.51, + "grad_norm": 0.11940666288137436, + "learning_rate": 8.563470856894316e-06, + "loss": 0.2699, + "step": 255 + }, + { + "epoch": 0.512, + "grad_norm": 0.139656201004982, + "learning_rate": 8.552322891326846e-06, + "loss": 0.2763, + "step": 256 + }, + { + "epoch": 0.514, + "grad_norm": 0.11665194481611252, + "learning_rate": 8.541139153907296e-06, + "loss": 0.2695, + "step": 257 + }, + { + "epoch": 0.516, + "grad_norm": 0.12714596092700958, + "learning_rate": 8.529919757255783e-06, + "loss": 0.2489, + "step": 258 + }, + { + "epoch": 0.518, + "grad_norm": 0.12326015532016754, + "learning_rate": 8.518664814351502e-06, + "loss": 0.3067, + "step": 259 + }, + { + "epoch": 0.52, + "grad_norm": 0.13826797902584076, + "learning_rate": 8.507374438531606e-06, + "loss": 0.3119, + "step": 260 + }, + { + "epoch": 0.522, + "grad_norm": 0.15031856298446655, + "learning_rate": 8.496048743490053e-06, + "loss": 0.3112, + "step": 261 + }, + { + "epoch": 0.524, + "grad_norm": 0.14100715517997742, + "learning_rate": 8.48468784327647e-06, + "loss": 0.3878, + "step": 262 + }, + { + "epoch": 0.526, + "grad_norm": 0.15813864767551422, + "learning_rate": 8.473291852294986e-06, + "loss": 0.3382, + "step": 263 + }, + { + "epoch": 0.528, + "grad_norm": 0.15911728143692017, + "learning_rate": 8.461860885303116e-06, + "loss": 0.4177, + "step": 264 + }, + { + "epoch": 0.53, + "grad_norm": 0.15685637295246124, + "learning_rate": 8.450395057410561e-06, + "loss": 0.3557, + "step": 265 + }, + { + "epoch": 0.532, + "grad_norm": 0.13905856013298035, + "learning_rate": 8.438894484078086e-06, + "loss": 0.3323, + "step": 266 + }, + { + "epoch": 0.534, + "grad_norm": 0.13344989717006683, + "learning_rate": 8.427359281116335e-06, + "loss": 0.3475, + "step": 267 + }, + { + "epoch": 0.536, + "grad_norm": 0.16016146540641785, + "learning_rate": 8.415789564684673e-06, + "loss": 0.3789, + "step": 268 + }, + { + "epoch": 0.538, + "grad_norm": 0.11681054532527924, + "learning_rate": 8.404185451290017e-06, + "loss": 0.2061, + "step": 269 + }, + { + "epoch": 0.54, + "grad_norm": 0.14662593603134155, + "learning_rate": 8.392547057785662e-06, + "loss": 0.4173, + "step": 270 + }, + { + "epoch": 0.542, + "grad_norm": 0.21970625221729279, + "learning_rate": 8.380874501370098e-06, + "loss": 0.5602, + "step": 271 + }, + { + "epoch": 0.544, + "grad_norm": 0.11630596220493317, + "learning_rate": 8.36916789958584e-06, + "loss": 0.2674, + "step": 272 + }, + { + "epoch": 0.546, + "grad_norm": 0.14212217926979065, + "learning_rate": 8.357427370318239e-06, + "loss": 0.2776, + "step": 273 + }, + { + "epoch": 0.548, + "grad_norm": 0.14911417663097382, + "learning_rate": 8.345653031794292e-06, + "loss": 0.4463, + "step": 274 + }, + { + "epoch": 0.55, + "grad_norm": 0.142579585313797, + "learning_rate": 8.33384500258146e-06, + "loss": 0.4963, + "step": 275 + }, + { + "epoch": 0.552, + "grad_norm": 0.14713557064533234, + "learning_rate": 8.322003401586463e-06, + "loss": 0.2642, + "step": 276 + }, + { + "epoch": 0.554, + "grad_norm": 0.24756528437137604, + "learning_rate": 8.310128348054093e-06, + "loss": 0.5423, + "step": 277 + }, + { + "epoch": 0.556, + "grad_norm": 0.13731062412261963, + "learning_rate": 8.298219961566008e-06, + "loss": 0.3333, + "step": 278 + }, + { + "epoch": 0.558, + "grad_norm": 0.18075144290924072, + "learning_rate": 8.286278362039527e-06, + "loss": 0.3733, + "step": 279 + }, + { + "epoch": 0.56, + "grad_norm": 0.1650344282388687, + "learning_rate": 8.274303669726427e-06, + "loss": 0.383, + "step": 280 + }, + { + "epoch": 0.562, + "grad_norm": 0.18053463101387024, + "learning_rate": 8.262296005211722e-06, + "loss": 0.4359, + "step": 281 + }, + { + "epoch": 0.564, + "grad_norm": 0.16192179918289185, + "learning_rate": 8.250255489412464e-06, + "loss": 0.3839, + "step": 282 + }, + { + "epoch": 0.566, + "grad_norm": 0.16045285761356354, + "learning_rate": 8.238182243576512e-06, + "loss": 0.4185, + "step": 283 + }, + { + "epoch": 0.568, + "grad_norm": 0.14847232401371002, + "learning_rate": 8.226076389281316e-06, + "loss": 0.43, + "step": 284 + }, + { + "epoch": 0.57, + "grad_norm": 0.1868700236082077, + "learning_rate": 8.213938048432697e-06, + "loss": 0.3437, + "step": 285 + }, + { + "epoch": 0.572, + "grad_norm": 0.1744498908519745, + "learning_rate": 8.201767343263612e-06, + "loss": 0.4926, + "step": 286 + }, + { + "epoch": 0.574, + "grad_norm": 0.13156633079051971, + "learning_rate": 8.189564396332927e-06, + "loss": 0.4245, + "step": 287 + }, + { + "epoch": 0.576, + "grad_norm": 0.17716287076473236, + "learning_rate": 8.177329330524182e-06, + "loss": 0.3134, + "step": 288 + }, + { + "epoch": 0.578, + "grad_norm": 0.15387575328350067, + "learning_rate": 8.165062269044353e-06, + "loss": 0.3723, + "step": 289 + }, + { + "epoch": 0.58, + "grad_norm": 0.11926203221082687, + "learning_rate": 8.152763335422612e-06, + "loss": 0.251, + "step": 290 + }, + { + "epoch": 0.582, + "grad_norm": 0.14692164957523346, + "learning_rate": 8.140432653509089e-06, + "loss": 0.3068, + "step": 291 + }, + { + "epoch": 0.584, + "grad_norm": 0.12874449789524078, + "learning_rate": 8.128070347473609e-06, + "loss": 0.3449, + "step": 292 + }, + { + "epoch": 0.586, + "grad_norm": 0.1284901350736618, + "learning_rate": 8.115676541804456e-06, + "loss": 0.2336, + "step": 293 + }, + { + "epoch": 0.588, + "grad_norm": 0.18448615074157715, + "learning_rate": 8.10325136130712e-06, + "loss": 0.4497, + "step": 294 + }, + { + "epoch": 0.59, + "grad_norm": 0.18793466687202454, + "learning_rate": 8.090794931103026e-06, + "loss": 0.446, + "step": 295 + }, + { + "epoch": 0.592, + "grad_norm": 0.11833447217941284, + "learning_rate": 8.078307376628292e-06, + "loss": 0.286, + "step": 296 + }, + { + "epoch": 0.594, + "grad_norm": 0.14963407814502716, + "learning_rate": 8.065788823632451e-06, + "loss": 0.329, + "step": 297 + }, + { + "epoch": 0.596, + "grad_norm": 0.1394645869731903, + "learning_rate": 8.053239398177191e-06, + "loss": 0.2671, + "step": 298 + }, + { + "epoch": 0.598, + "grad_norm": 0.17401300370693207, + "learning_rate": 8.04065922663509e-06, + "loss": 0.5106, + "step": 299 + }, + { + "epoch": 0.6, + "grad_norm": 0.1559733897447586, + "learning_rate": 8.028048435688333e-06, + "loss": 0.259, + "step": 300 + }, + { + "epoch": 0.602, + "grad_norm": 0.14853116869926453, + "learning_rate": 8.015407152327448e-06, + "loss": 0.4095, + "step": 301 + }, + { + "epoch": 0.604, + "grad_norm": 0.13665775954723358, + "learning_rate": 8.002735503850016e-06, + "loss": 0.379, + "step": 302 + }, + { + "epoch": 0.606, + "grad_norm": 0.15187975764274597, + "learning_rate": 7.990033617859396e-06, + "loss": 0.336, + "step": 303 + }, + { + "epoch": 0.608, + "grad_norm": 0.17993216216564178, + "learning_rate": 7.97730162226344e-06, + "loss": 0.4718, + "step": 304 + }, + { + "epoch": 0.61, + "grad_norm": 0.14840970933437347, + "learning_rate": 7.964539645273204e-06, + "loss": 0.3572, + "step": 305 + }, + { + "epoch": 0.612, + "grad_norm": 0.2386975884437561, + "learning_rate": 7.951747815401651e-06, + "loss": 0.3185, + "step": 306 + }, + { + "epoch": 0.614, + "grad_norm": 0.21291233599185944, + "learning_rate": 7.938926261462366e-06, + "loss": 0.362, + "step": 307 + }, + { + "epoch": 0.616, + "grad_norm": 0.16196957230567932, + "learning_rate": 7.92607511256826e-06, + "loss": 0.3024, + "step": 308 + }, + { + "epoch": 0.618, + "grad_norm": 0.2727487087249756, + "learning_rate": 7.913194498130252e-06, + "loss": 0.5212, + "step": 309 + }, + { + "epoch": 0.62, + "grad_norm": 0.1640804558992386, + "learning_rate": 7.900284547855992e-06, + "loss": 0.3948, + "step": 310 + }, + { + "epoch": 0.622, + "grad_norm": 0.22003543376922607, + "learning_rate": 7.887345391748533e-06, + "loss": 0.3745, + "step": 311 + }, + { + "epoch": 0.624, + "grad_norm": 0.1896262764930725, + "learning_rate": 7.874377160105037e-06, + "loss": 0.4448, + "step": 312 + }, + { + "epoch": 0.626, + "grad_norm": 0.18609432876110077, + "learning_rate": 7.861379983515449e-06, + "loss": 0.3685, + "step": 313 + }, + { + "epoch": 0.628, + "grad_norm": 0.14590106904506683, + "learning_rate": 7.848353992861195e-06, + "loss": 0.3338, + "step": 314 + }, + { + "epoch": 0.63, + "grad_norm": 0.13211271166801453, + "learning_rate": 7.835299319313854e-06, + "loss": 0.3297, + "step": 315 + }, + { + "epoch": 0.632, + "grad_norm": 0.16736850142478943, + "learning_rate": 7.822216094333847e-06, + "loss": 0.3118, + "step": 316 + }, + { + "epoch": 0.634, + "grad_norm": 0.17553502321243286, + "learning_rate": 7.8091044496691e-06, + "loss": 0.3447, + "step": 317 + }, + { + "epoch": 0.636, + "grad_norm": 0.17292480170726776, + "learning_rate": 7.795964517353734e-06, + "loss": 0.3152, + "step": 318 + }, + { + "epoch": 0.638, + "grad_norm": 0.13962873816490173, + "learning_rate": 7.782796429706721e-06, + "loss": 0.2142, + "step": 319 + }, + { + "epoch": 0.64, + "grad_norm": 0.19501662254333496, + "learning_rate": 7.769600319330553e-06, + "loss": 0.3923, + "step": 320 + }, + { + "epoch": 0.642, + "grad_norm": 0.1338018923997879, + "learning_rate": 7.756376319109917e-06, + "loss": 0.3381, + "step": 321 + }, + { + "epoch": 0.644, + "grad_norm": 0.1579694300889969, + "learning_rate": 7.743124562210351e-06, + "loss": 0.37, + "step": 322 + }, + { + "epoch": 0.646, + "grad_norm": 0.12136895209550858, + "learning_rate": 7.729845182076896e-06, + "loss": 0.212, + "step": 323 + }, + { + "epoch": 0.648, + "grad_norm": 0.2188921570777893, + "learning_rate": 7.716538312432767e-06, + "loss": 0.3732, + "step": 324 + }, + { + "epoch": 0.65, + "grad_norm": 0.1570715606212616, + "learning_rate": 7.703204087277989e-06, + "loss": 0.321, + "step": 325 + }, + { + "epoch": 0.652, + "grad_norm": 0.19729937613010406, + "learning_rate": 7.689842640888063e-06, + "loss": 0.3955, + "step": 326 + }, + { + "epoch": 0.654, + "grad_norm": 0.20023679733276367, + "learning_rate": 7.676454107812608e-06, + "loss": 0.4399, + "step": 327 + }, + { + "epoch": 0.656, + "grad_norm": 0.14793503284454346, + "learning_rate": 7.663038622873999e-06, + "loss": 0.2922, + "step": 328 + }, + { + "epoch": 0.658, + "grad_norm": 0.16386426985263824, + "learning_rate": 7.649596321166024e-06, + "loss": 0.3495, + "step": 329 + }, + { + "epoch": 0.66, + "grad_norm": 0.15845847129821777, + "learning_rate": 7.636127338052513e-06, + "loss": 0.3607, + "step": 330 + }, + { + "epoch": 0.662, + "grad_norm": 0.17752616107463837, + "learning_rate": 7.622631809165972e-06, + "loss": 0.2863, + "step": 331 + }, + { + "epoch": 0.664, + "grad_norm": 0.2213558405637741, + "learning_rate": 7.60910987040623e-06, + "loss": 0.4411, + "step": 332 + }, + { + "epoch": 0.666, + "grad_norm": 0.2018650323152542, + "learning_rate": 7.595561657939061e-06, + "loss": 0.418, + "step": 333 + }, + { + "epoch": 0.668, + "grad_norm": 0.20029357075691223, + "learning_rate": 7.5819873081948105e-06, + "loss": 0.3025, + "step": 334 + }, + { + "epoch": 0.67, + "grad_norm": 0.1478874832391739, + "learning_rate": 7.568386957867033e-06, + "loss": 0.2437, + "step": 335 + }, + { + "epoch": 0.672, + "grad_norm": 0.18909971415996552, + "learning_rate": 7.554760743911104e-06, + "loss": 0.3974, + "step": 336 + }, + { + "epoch": 0.674, + "grad_norm": 0.16544924676418304, + "learning_rate": 7.541108803542846e-06, + "loss": 0.336, + "step": 337 + }, + { + "epoch": 0.676, + "grad_norm": 0.19204874336719513, + "learning_rate": 7.527431274237149e-06, + "loss": 0.3617, + "step": 338 + }, + { + "epoch": 0.678, + "grad_norm": 0.1770397573709488, + "learning_rate": 7.5137282937265796e-06, + "loss": 0.3617, + "step": 339 + }, + { + "epoch": 0.68, + "grad_norm": 0.15880927443504333, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2993, + "step": 340 + }, + { + "epoch": 0.682, + "grad_norm": 0.4031960368156433, + "learning_rate": 7.486246531301178e-06, + "loss": 0.3137, + "step": 341 + }, + { + "epoch": 0.684, + "grad_norm": 0.17426829040050507, + "learning_rate": 7.472468026127385e-06, + "loss": 0.3712, + "step": 342 + }, + { + "epoch": 0.686, + "grad_norm": 0.16782499849796295, + "learning_rate": 7.45866462322802e-06, + "loss": 0.359, + "step": 343 + }, + { + "epoch": 0.688, + "grad_norm": 0.20207028090953827, + "learning_rate": 7.444836461603195e-06, + "loss": 0.4301, + "step": 344 + }, + { + "epoch": 0.69, + "grad_norm": 0.18788397312164307, + "learning_rate": 7.430983680502344e-06, + "loss": 0.3609, + "step": 345 + }, + { + "epoch": 0.692, + "grad_norm": 0.16447116434574127, + "learning_rate": 7.4171064194228196e-06, + "loss": 0.3514, + "step": 346 + }, + { + "epoch": 0.694, + "grad_norm": 0.15939724445343018, + "learning_rate": 7.403204818108487e-06, + "loss": 0.2747, + "step": 347 + }, + { + "epoch": 0.696, + "grad_norm": 0.2825759947299957, + "learning_rate": 7.3892790165483164e-06, + "loss": 0.5376, + "step": 348 + }, + { + "epoch": 0.698, + "grad_norm": 0.15753747522830963, + "learning_rate": 7.3753291549749764e-06, + "loss": 0.2741, + "step": 349 + }, + { + "epoch": 0.7, + "grad_norm": 0.19103243947029114, + "learning_rate": 7.361355373863415e-06, + "loss": 0.3088, + "step": 350 + }, + { + "epoch": 0.702, + "grad_norm": 0.18185654282569885, + "learning_rate": 7.347357813929455e-06, + "loss": 0.3204, + "step": 351 + }, + { + "epoch": 0.704, + "grad_norm": 0.15075427293777466, + "learning_rate": 7.333336616128369e-06, + "loss": 0.2885, + "step": 352 + }, + { + "epoch": 0.706, + "grad_norm": 0.14092062413692474, + "learning_rate": 7.319291921653464e-06, + "loss": 0.2423, + "step": 353 + }, + { + "epoch": 0.708, + "grad_norm": 0.11944609135389328, + "learning_rate": 7.305223871934657e-06, + "loss": 0.1367, + "step": 354 + }, + { + "epoch": 0.71, + "grad_norm": 0.2248326539993286, + "learning_rate": 7.291132608637053e-06, + "loss": 0.4119, + "step": 355 + }, + { + "epoch": 0.712, + "grad_norm": 0.1844269186258316, + "learning_rate": 7.2770182736595164e-06, + "loss": 0.2714, + "step": 356 + }, + { + "epoch": 0.714, + "grad_norm": 0.19066232442855835, + "learning_rate": 7.262881009133242e-06, + "loss": 0.432, + "step": 357 + }, + { + "epoch": 0.716, + "grad_norm": 0.21767167747020721, + "learning_rate": 7.24872095742033e-06, + "loss": 0.3804, + "step": 358 + }, + { + "epoch": 0.718, + "grad_norm": 0.14823076128959656, + "learning_rate": 7.234538261112342e-06, + "loss": 0.3182, + "step": 359 + }, + { + "epoch": 0.72, + "grad_norm": 0.1661371886730194, + "learning_rate": 7.2203330630288714e-06, + "loss": 0.3078, + "step": 360 + }, + { + "epoch": 0.722, + "grad_norm": 0.18412846326828003, + "learning_rate": 7.206105506216107e-06, + "loss": 0.4066, + "step": 361 + }, + { + "epoch": 0.724, + "grad_norm": 0.17892518639564514, + "learning_rate": 7.191855733945388e-06, + "loss": 0.4772, + "step": 362 + }, + { + "epoch": 0.726, + "grad_norm": 0.24270282685756683, + "learning_rate": 7.177583889711763e-06, + "loss": 0.3902, + "step": 363 + }, + { + "epoch": 0.728, + "grad_norm": 0.187135249376297, + "learning_rate": 7.163290117232542e-06, + "loss": 0.3154, + "step": 364 + }, + { + "epoch": 0.73, + "grad_norm": 0.20502962172031403, + "learning_rate": 7.148974560445859e-06, + "loss": 0.3599, + "step": 365 + }, + { + "epoch": 0.732, + "grad_norm": 0.1704569160938263, + "learning_rate": 7.1346373635092095e-06, + "loss": 0.3705, + "step": 366 + }, + { + "epoch": 0.734, + "grad_norm": 0.20562830567359924, + "learning_rate": 7.12027867079801e-06, + "loss": 0.3169, + "step": 367 + }, + { + "epoch": 0.736, + "grad_norm": 0.19051577150821686, + "learning_rate": 7.105898626904134e-06, + "loss": 0.4571, + "step": 368 + }, + { + "epoch": 0.738, + "grad_norm": 0.18842366337776184, + "learning_rate": 7.0914973766344645e-06, + "loss": 0.2771, + "step": 369 + }, + { + "epoch": 0.74, + "grad_norm": 0.14864154160022736, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.2184, + "step": 370 + }, + { + "epoch": 0.742, + "grad_norm": 0.1662212610244751, + "learning_rate": 7.062631837261556e-06, + "loss": 0.2706, + "step": 371 + }, + { + "epoch": 0.744, + "grad_norm": 0.15230734646320343, + "learning_rate": 7.048167838833977e-06, + "loss": 0.2611, + "step": 372 + }, + { + "epoch": 0.746, + "grad_norm": 0.16176356375217438, + "learning_rate": 7.033683215379002e-06, + "loss": 0.3144, + "step": 373 + }, + { + "epoch": 0.748, + "grad_norm": 0.16796669363975525, + "learning_rate": 7.019178112756625e-06, + "loss": 0.3742, + "step": 374 + }, + { + "epoch": 0.75, + "grad_norm": 0.16455894708633423, + "learning_rate": 7.004652677033069e-06, + "loss": 0.2426, + "step": 375 + }, + { + "epoch": 0.75, + "eval_loss": 0.2979236841201782, + "eval_runtime": 76.5795, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.901, + "step": 375 + }, + { + "epoch": 0.752, + "grad_norm": 0.22792088985443115, + "learning_rate": 6.990107054479313e-06, + "loss": 0.319, + "step": 376 + }, + { + "epoch": 0.754, + "grad_norm": 0.24258168041706085, + "learning_rate": 6.9755413915696105e-06, + "loss": 0.5036, + "step": 377 + }, + { + "epoch": 0.756, + "grad_norm": 0.17646639049053192, + "learning_rate": 6.960955834980028e-06, + "loss": 0.3024, + "step": 378 + }, + { + "epoch": 0.758, + "grad_norm": 0.15006083250045776, + "learning_rate": 6.946350531586959e-06, + "loss": 0.2702, + "step": 379 + }, + { + "epoch": 0.76, + "grad_norm": 0.15430916845798492, + "learning_rate": 6.931725628465643e-06, + "loss": 0.2492, + "step": 380 + }, + { + "epoch": 0.762, + "grad_norm": 0.13274860382080078, + "learning_rate": 6.917081272888697e-06, + "loss": 0.2188, + "step": 381 + }, + { + "epoch": 0.764, + "grad_norm": 0.12552917003631592, + "learning_rate": 6.902417612324615e-06, + "loss": 0.2275, + "step": 382 + }, + { + "epoch": 0.766, + "grad_norm": 0.14306232333183289, + "learning_rate": 6.887734794436301e-06, + "loss": 0.3204, + "step": 383 + }, + { + "epoch": 0.768, + "grad_norm": 0.18567156791687012, + "learning_rate": 6.873032967079562e-06, + "loss": 0.4079, + "step": 384 + }, + { + "epoch": 0.77, + "grad_norm": 0.18761208653450012, + "learning_rate": 6.858312278301638e-06, + "loss": 0.2944, + "step": 385 + }, + { + "epoch": 0.772, + "grad_norm": 0.18265055119991302, + "learning_rate": 6.8435728763397045e-06, + "loss": 0.4399, + "step": 386 + }, + { + "epoch": 0.774, + "grad_norm": 0.18840709328651428, + "learning_rate": 6.828814909619374e-06, + "loss": 0.4057, + "step": 387 + }, + { + "epoch": 0.776, + "grad_norm": 0.19235002994537354, + "learning_rate": 6.814038526753205e-06, + "loss": 0.2826, + "step": 388 + }, + { + "epoch": 0.778, + "grad_norm": 0.1880473792552948, + "learning_rate": 6.799243876539213e-06, + "loss": 0.3739, + "step": 389 + }, + { + "epoch": 0.78, + "grad_norm": 0.29550889134407043, + "learning_rate": 6.78443110795936e-06, + "loss": 0.3594, + "step": 390 + }, + { + "epoch": 0.782, + "grad_norm": 0.19335615634918213, + "learning_rate": 6.76960037017806e-06, + "loss": 0.4026, + "step": 391 + }, + { + "epoch": 0.784, + "grad_norm": 0.14000019431114197, + "learning_rate": 6.75475181254068e-06, + "loss": 0.2576, + "step": 392 + }, + { + "epoch": 0.786, + "grad_norm": 0.15106743574142456, + "learning_rate": 6.739885584572026e-06, + "loss": 0.2538, + "step": 393 + }, + { + "epoch": 0.788, + "grad_norm": 0.19910076260566711, + "learning_rate": 6.725001835974854e-06, + "loss": 0.2867, + "step": 394 + }, + { + "epoch": 0.79, + "grad_norm": 0.22941169142723083, + "learning_rate": 6.710100716628345e-06, + "loss": 0.3183, + "step": 395 + }, + { + "epoch": 0.792, + "grad_norm": 0.1540730744600296, + "learning_rate": 6.695182376586603e-06, + "loss": 0.31, + "step": 396 + }, + { + "epoch": 0.794, + "grad_norm": 0.18420648574829102, + "learning_rate": 6.680246966077151e-06, + "loss": 0.388, + "step": 397 + }, + { + "epoch": 0.796, + "grad_norm": 0.14336371421813965, + "learning_rate": 6.665294635499404e-06, + "loss": 0.3359, + "step": 398 + }, + { + "epoch": 0.798, + "grad_norm": 0.21092049777507782, + "learning_rate": 6.650325535423166e-06, + "loss": 0.2935, + "step": 399 + }, + { + "epoch": 0.8, + "grad_norm": 0.23870034515857697, + "learning_rate": 6.635339816587109e-06, + "loss": 0.3413, + "step": 400 + }, + { + "epoch": 0.802, + "grad_norm": 0.21548299491405487, + "learning_rate": 6.6203376298972535e-06, + "loss": 0.4255, + "step": 401 + }, + { + "epoch": 0.804, + "grad_norm": 0.21555306017398834, + "learning_rate": 6.605319126425455e-06, + "loss": 0.4044, + "step": 402 + }, + { + "epoch": 0.806, + "grad_norm": 0.212354838848114, + "learning_rate": 6.590284457407876e-06, + "loss": 0.3225, + "step": 403 + }, + { + "epoch": 0.808, + "grad_norm": 0.17822064459323883, + "learning_rate": 6.5752337742434644e-06, + "loss": 0.3449, + "step": 404 + }, + { + "epoch": 0.81, + "grad_norm": 0.15272925794124603, + "learning_rate": 6.560167228492436e-06, + "loss": 0.2732, + "step": 405 + }, + { + "epoch": 0.812, + "grad_norm": 0.18225990235805511, + "learning_rate": 6.545084971874738e-06, + "loss": 0.3326, + "step": 406 + }, + { + "epoch": 0.814, + "grad_norm": 0.1854051798582077, + "learning_rate": 6.529987156268527e-06, + "loss": 0.3603, + "step": 407 + }, + { + "epoch": 0.816, + "grad_norm": 0.17678527534008026, + "learning_rate": 6.514873933708637e-06, + "loss": 0.2996, + "step": 408 + }, + { + "epoch": 0.818, + "grad_norm": 0.35500454902648926, + "learning_rate": 6.499745456385054e-06, + "loss": 0.4185, + "step": 409 + }, + { + "epoch": 0.82, + "grad_norm": 0.18555931746959686, + "learning_rate": 6.484601876641375e-06, + "loss": 0.2208, + "step": 410 + }, + { + "epoch": 0.822, + "grad_norm": 0.16834326088428497, + "learning_rate": 6.469443346973281e-06, + "loss": 0.3684, + "step": 411 + }, + { + "epoch": 0.824, + "grad_norm": 0.1469370424747467, + "learning_rate": 6.454270020026996e-06, + "loss": 0.2526, + "step": 412 + }, + { + "epoch": 0.826, + "grad_norm": 0.19754226505756378, + "learning_rate": 6.439082048597755e-06, + "loss": 0.3341, + "step": 413 + }, + { + "epoch": 0.828, + "grad_norm": 0.15154729783535004, + "learning_rate": 6.423879585628262e-06, + "loss": 0.2402, + "step": 414 + }, + { + "epoch": 0.83, + "grad_norm": 0.20265011489391327, + "learning_rate": 6.408662784207149e-06, + "loss": 0.374, + "step": 415 + }, + { + "epoch": 0.832, + "grad_norm": 0.2674030065536499, + "learning_rate": 6.39343179756744e-06, + "loss": 0.3057, + "step": 416 + }, + { + "epoch": 0.834, + "grad_norm": 0.1473691463470459, + "learning_rate": 6.378186779084996e-06, + "loss": 0.3684, + "step": 417 + }, + { + "epoch": 0.836, + "grad_norm": 0.2826951742172241, + "learning_rate": 6.362927882276991e-06, + "loss": 0.2585, + "step": 418 + }, + { + "epoch": 0.838, + "grad_norm": 0.20093302428722382, + "learning_rate": 6.34765526080034e-06, + "loss": 0.3041, + "step": 419 + }, + { + "epoch": 0.84, + "grad_norm": 0.1346312314271927, + "learning_rate": 6.332369068450175e-06, + "loss": 0.2105, + "step": 420 + }, + { + "epoch": 0.842, + "grad_norm": 0.16400040686130524, + "learning_rate": 6.317069459158284e-06, + "loss": 0.2832, + "step": 421 + }, + { + "epoch": 0.844, + "grad_norm": 0.19443334639072418, + "learning_rate": 6.301756586991561e-06, + "loss": 0.3353, + "step": 422 + }, + { + "epoch": 0.846, + "grad_norm": 0.22223643958568573, + "learning_rate": 6.286430606150458e-06, + "loss": 0.384, + "step": 423 + }, + { + "epoch": 0.848, + "grad_norm": 0.16762332618236542, + "learning_rate": 6.271091670967437e-06, + "loss": 0.3826, + "step": 424 + }, + { + "epoch": 0.85, + "grad_norm": 0.26455458998680115, + "learning_rate": 6.255739935905396e-06, + "loss": 0.4419, + "step": 425 + }, + { + "epoch": 0.852, + "grad_norm": 0.1570374071598053, + "learning_rate": 6.240375555556145e-06, + "loss": 0.2199, + "step": 426 + }, + { + "epoch": 0.854, + "grad_norm": 0.16800148785114288, + "learning_rate": 6.22499868463882e-06, + "loss": 0.2561, + "step": 427 + }, + { + "epoch": 0.856, + "grad_norm": 0.17082828283309937, + "learning_rate": 6.209609477998339e-06, + "loss": 0.3317, + "step": 428 + }, + { + "epoch": 0.858, + "grad_norm": 0.26214951276779175, + "learning_rate": 6.194208090603845e-06, + "loss": 0.4105, + "step": 429 + }, + { + "epoch": 0.86, + "grad_norm": 0.17318500578403473, + "learning_rate": 6.178794677547138e-06, + "loss": 0.2216, + "step": 430 + }, + { + "epoch": 0.862, + "grad_norm": 0.18394838273525238, + "learning_rate": 6.163369394041112e-06, + "loss": 0.3251, + "step": 431 + }, + { + "epoch": 0.864, + "grad_norm": 0.2352125197649002, + "learning_rate": 6.1479323954182055e-06, + "loss": 0.349, + "step": 432 + }, + { + "epoch": 0.866, + "grad_norm": 0.18627074360847473, + "learning_rate": 6.132483837128823e-06, + "loss": 0.3048, + "step": 433 + }, + { + "epoch": 0.868, + "grad_norm": 0.2253945916891098, + "learning_rate": 6.1170238747397715e-06, + "loss": 0.3081, + "step": 434 + }, + { + "epoch": 0.87, + "grad_norm": 0.1479015201330185, + "learning_rate": 6.101552663932704e-06, + "loss": 0.192, + "step": 435 + }, + { + "epoch": 0.872, + "grad_norm": 0.1954430192708969, + "learning_rate": 6.08607036050254e-06, + "loss": 0.2251, + "step": 436 + }, + { + "epoch": 0.874, + "grad_norm": 0.16169880330562592, + "learning_rate": 6.070577120355903e-06, + "loss": 0.2765, + "step": 437 + }, + { + "epoch": 0.876, + "grad_norm": 0.19537843763828278, + "learning_rate": 6.055073099509549e-06, + "loss": 0.2724, + "step": 438 + }, + { + "epoch": 0.878, + "grad_norm": 0.1675713211297989, + "learning_rate": 6.039558454088796e-06, + "loss": 0.3164, + "step": 439 + }, + { + "epoch": 0.88, + "grad_norm": 0.27977389097213745, + "learning_rate": 6.024033340325954e-06, + "loss": 0.4432, + "step": 440 + }, + { + "epoch": 0.882, + "grad_norm": 0.1879289448261261, + "learning_rate": 6.0084979145587444e-06, + "loss": 0.3558, + "step": 441 + }, + { + "epoch": 0.884, + "grad_norm": 0.16285355389118195, + "learning_rate": 5.9929523332287275e-06, + "loss": 0.3014, + "step": 442 + }, + { + "epoch": 0.886, + "grad_norm": 0.2135494202375412, + "learning_rate": 5.977396752879742e-06, + "loss": 0.3124, + "step": 443 + }, + { + "epoch": 0.888, + "grad_norm": 0.21992646157741547, + "learning_rate": 5.961831330156306e-06, + "loss": 0.3152, + "step": 444 + }, + { + "epoch": 0.89, + "grad_norm": 0.34824761748313904, + "learning_rate": 5.946256221802052e-06, + "loss": 0.4022, + "step": 445 + }, + { + "epoch": 0.892, + "grad_norm": 0.3176579177379608, + "learning_rate": 5.930671584658151e-06, + "loss": 0.3373, + "step": 446 + }, + { + "epoch": 0.894, + "grad_norm": 0.13881681859493256, + "learning_rate": 5.915077575661723e-06, + "loss": 0.2732, + "step": 447 + }, + { + "epoch": 0.896, + "grad_norm": 0.23585429787635803, + "learning_rate": 5.89947435184427e-06, + "loss": 0.383, + "step": 448 + }, + { + "epoch": 0.898, + "grad_norm": 0.20338225364685059, + "learning_rate": 5.883862070330079e-06, + "loss": 0.3929, + "step": 449 + }, + { + "epoch": 0.9, + "grad_norm": 0.5738399028778076, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.3834, + "step": 450 + }, + { + "epoch": 0.902, + "grad_norm": 0.16114148497581482, + "learning_rate": 5.85261096316312e-06, + "loss": 0.2351, + "step": 451 + }, + { + "epoch": 0.904, + "grad_norm": 0.16090261936187744, + "learning_rate": 5.8369724522086545e-06, + "loss": 0.2264, + "step": 452 + }, + { + "epoch": 0.906, + "grad_norm": 0.1992426961660385, + "learning_rate": 5.821325512950886e-06, + "loss": 0.3239, + "step": 453 + }, + { + "epoch": 0.908, + "grad_norm": 0.1780838966369629, + "learning_rate": 5.805670302954322e-06, + "loss": 0.2997, + "step": 454 + }, + { + "epoch": 0.91, + "grad_norm": 0.24148645997047424, + "learning_rate": 5.79000697986675e-06, + "loss": 0.3701, + "step": 455 + }, + { + "epoch": 0.912, + "grad_norm": 0.1544380933046341, + "learning_rate": 5.774335701417662e-06, + "loss": 0.1843, + "step": 456 + }, + { + "epoch": 0.914, + "grad_norm": 0.20772896707057953, + "learning_rate": 5.758656625416659e-06, + "loss": 0.3617, + "step": 457 + }, + { + "epoch": 0.916, + "grad_norm": 0.2054608017206192, + "learning_rate": 5.7429699097518585e-06, + "loss": 0.3286, + "step": 458 + }, + { + "epoch": 0.918, + "grad_norm": 0.1513553261756897, + "learning_rate": 5.727275712388318e-06, + "loss": 0.2149, + "step": 459 + }, + { + "epoch": 0.92, + "grad_norm": 0.20221109688282013, + "learning_rate": 5.711574191366427e-06, + "loss": 0.2895, + "step": 460 + }, + { + "epoch": 0.922, + "grad_norm": 0.26075002551078796, + "learning_rate": 5.695865504800328e-06, + "loss": 0.3115, + "step": 461 + }, + { + "epoch": 0.924, + "grad_norm": 0.2223353236913681, + "learning_rate": 5.680149810876322e-06, + "loss": 0.3065, + "step": 462 + }, + { + "epoch": 0.926, + "grad_norm": 0.18663600087165833, + "learning_rate": 5.664427267851271e-06, + "loss": 0.2444, + "step": 463 + }, + { + "epoch": 0.928, + "grad_norm": 0.19538210332393646, + "learning_rate": 5.648698034051009e-06, + "loss": 0.3877, + "step": 464 + }, + { + "epoch": 0.93, + "grad_norm": 0.1691403090953827, + "learning_rate": 5.632962267868747e-06, + "loss": 0.2445, + "step": 465 + }, + { + "epoch": 0.932, + "grad_norm": 0.1581772416830063, + "learning_rate": 5.617220127763474e-06, + "loss": 0.3217, + "step": 466 + }, + { + "epoch": 0.934, + "grad_norm": 0.20001822710037231, + "learning_rate": 5.601471772258368e-06, + "loss": 0.3184, + "step": 467 + }, + { + "epoch": 0.936, + "grad_norm": 0.3052047789096832, + "learning_rate": 5.585717359939192e-06, + "loss": 0.3479, + "step": 468 + }, + { + "epoch": 0.938, + "grad_norm": 0.23681974411010742, + "learning_rate": 5.569957049452703e-06, + "loss": 0.3403, + "step": 469 + }, + { + "epoch": 0.94, + "grad_norm": 0.12364782392978668, + "learning_rate": 5.5541909995050554e-06, + "loss": 0.2085, + "step": 470 + }, + { + "epoch": 0.942, + "grad_norm": 0.1526976227760315, + "learning_rate": 5.538419368860196e-06, + "loss": 0.2281, + "step": 471 + }, + { + "epoch": 0.944, + "grad_norm": 0.2230585813522339, + "learning_rate": 5.522642316338268e-06, + "loss": 0.3351, + "step": 472 + }, + { + "epoch": 0.946, + "grad_norm": 0.17690080404281616, + "learning_rate": 5.506860000814017e-06, + "loss": 0.2985, + "step": 473 + }, + { + "epoch": 0.948, + "grad_norm": 0.1738656908273697, + "learning_rate": 5.491072581215186e-06, + "loss": 0.247, + "step": 474 + }, + { + "epoch": 0.95, + "grad_norm": 0.18501204252243042, + "learning_rate": 5.475280216520913e-06, + "loss": 0.2646, + "step": 475 + }, + { + "epoch": 0.952, + "grad_norm": 0.19721092283725739, + "learning_rate": 5.459483065760138e-06, + "loss": 0.2876, + "step": 476 + }, + { + "epoch": 0.954, + "grad_norm": 0.16680027544498444, + "learning_rate": 5.443681288009991e-06, + "loss": 0.2167, + "step": 477 + }, + { + "epoch": 0.956, + "grad_norm": 0.17918136715888977, + "learning_rate": 5.4278750423942e-06, + "loss": 0.3997, + "step": 478 + }, + { + "epoch": 0.958, + "grad_norm": 0.15725551545619965, + "learning_rate": 5.412064488081482e-06, + "loss": 0.2829, + "step": 479 + }, + { + "epoch": 0.96, + "grad_norm": 0.19459596276283264, + "learning_rate": 5.396249784283943e-06, + "loss": 0.3373, + "step": 480 + }, + { + "epoch": 0.962, + "grad_norm": 0.32756415009498596, + "learning_rate": 5.380431090255475e-06, + "loss": 0.4206, + "step": 481 + }, + { + "epoch": 0.964, + "grad_norm": 0.19843968749046326, + "learning_rate": 5.364608565290154e-06, + "loss": 0.3385, + "step": 482 + }, + { + "epoch": 0.966, + "grad_norm": 0.15863648056983948, + "learning_rate": 5.348782368720627e-06, + "loss": 0.2524, + "step": 483 + }, + { + "epoch": 0.968, + "grad_norm": 0.21220897138118744, + "learning_rate": 5.33295265991652e-06, + "loss": 0.2326, + "step": 484 + }, + { + "epoch": 0.97, + "grad_norm": 0.24547149240970612, + "learning_rate": 5.317119598282823e-06, + "loss": 0.3854, + "step": 485 + }, + { + "epoch": 0.972, + "grad_norm": 0.2009747326374054, + "learning_rate": 5.301283343258293e-06, + "loss": 0.3141, + "step": 486 + }, + { + "epoch": 0.974, + "grad_norm": 0.22629286348819733, + "learning_rate": 5.285444054313841e-06, + "loss": 0.3044, + "step": 487 + }, + { + "epoch": 0.976, + "grad_norm": 0.18528909981250763, + "learning_rate": 5.26960189095093e-06, + "loss": 0.3056, + "step": 488 + }, + { + "epoch": 0.978, + "grad_norm": 0.18446871638298035, + "learning_rate": 5.253757012699972e-06, + "loss": 0.3206, + "step": 489 + }, + { + "epoch": 0.98, + "grad_norm": 0.1961178332567215, + "learning_rate": 5.237909579118713e-06, + "loss": 0.386, + "step": 490 + }, + { + "epoch": 0.982, + "grad_norm": 0.20445547997951508, + "learning_rate": 5.2220597497906315e-06, + "loss": 0.3997, + "step": 491 + }, + { + "epoch": 0.984, + "grad_norm": 0.17709751427173615, + "learning_rate": 5.206207684323337e-06, + "loss": 0.3212, + "step": 492 + }, + { + "epoch": 0.986, + "grad_norm": 0.15768595039844513, + "learning_rate": 5.190353542346951e-06, + "loss": 0.2752, + "step": 493 + }, + { + "epoch": 0.988, + "grad_norm": 0.14925841987133026, + "learning_rate": 5.174497483512506e-06, + "loss": 0.2593, + "step": 494 + }, + { + "epoch": 0.99, + "grad_norm": 0.2051381766796112, + "learning_rate": 5.15863966749034e-06, + "loss": 0.3941, + "step": 495 + }, + { + "epoch": 0.992, + "grad_norm": 0.2395932674407959, + "learning_rate": 5.142780253968481e-06, + "loss": 0.3136, + "step": 496 + }, + { + "epoch": 0.994, + "grad_norm": 0.2152215540409088, + "learning_rate": 5.126919402651053e-06, + "loss": 0.3083, + "step": 497 + }, + { + "epoch": 0.996, + "grad_norm": 0.17021948099136353, + "learning_rate": 5.111057273256648e-06, + "loss": 0.3185, + "step": 498 + }, + { + "epoch": 0.998, + "grad_norm": 0.22681966423988342, + "learning_rate": 5.095194025516733e-06, + "loss": 0.4107, + "step": 499 + }, + { + "epoch": 1.0, + "grad_norm": 0.22234933078289032, + "learning_rate": 5.07932981917404e-06, + "loss": 0.3672, + "step": 500 + }, + { + "epoch": 1.0, + "eval_loss": 0.27911150455474854, + "eval_runtime": 76.7158, + "eval_samples_per_second": 7.195, + "eval_steps_per_second": 0.899, + "step": 500 + }, + { + "epoch": 1.002, + "grad_norm": 0.18890836834907532, + "learning_rate": 5.063464813980948e-06, + "loss": 0.2277, + "step": 501 + }, + { + "epoch": 1.004, + "grad_norm": 0.19094686210155487, + "learning_rate": 5.0475991696978844e-06, + "loss": 0.3602, + "step": 502 + }, + { + "epoch": 1.006, + "grad_norm": 0.24123992025852203, + "learning_rate": 5.03173304609171e-06, + "loss": 0.2796, + "step": 503 + }, + { + "epoch": 1.008, + "grad_norm": 0.2091682106256485, + "learning_rate": 5.015866602934112e-06, + "loss": 0.333, + "step": 504 + }, + { + "epoch": 1.01, + "grad_norm": 0.21148917078971863, + "learning_rate": 5e-06, + "loss": 0.4005, + "step": 505 + }, + { + "epoch": 1.012, + "grad_norm": 0.14547854661941528, + "learning_rate": 4.984133397065889e-06, + "loss": 0.2223, + "step": 506 + }, + { + "epoch": 1.014, + "grad_norm": 0.23349957168102264, + "learning_rate": 4.9682669539082914e-06, + "loss": 0.3264, + "step": 507 + }, + { + "epoch": 1.016, + "grad_norm": 0.16822971403598785, + "learning_rate": 4.952400830302117e-06, + "loss": 0.3151, + "step": 508 + }, + { + "epoch": 1.018, + "grad_norm": 0.1795063018798828, + "learning_rate": 4.936535186019053e-06, + "loss": 0.2896, + "step": 509 + }, + { + "epoch": 1.02, + "grad_norm": 0.19863282144069672, + "learning_rate": 4.9206701808259605e-06, + "loss": 0.2481, + "step": 510 + }, + { + "epoch": 1.022, + "grad_norm": 0.18788766860961914, + "learning_rate": 4.904805974483267e-06, + "loss": 0.3513, + "step": 511 + }, + { + "epoch": 1.024, + "grad_norm": 0.1949293315410614, + "learning_rate": 4.888942726743353e-06, + "loss": 0.2264, + "step": 512 + }, + { + "epoch": 1.002, + "grad_norm": 0.16474653780460358, + "learning_rate": 4.873080597348948e-06, + "loss": 0.2793, + "step": 513 + }, + { + "epoch": 1.004, + "grad_norm": 0.20230461657047272, + "learning_rate": 4.85721974603152e-06, + "loss": 0.3618, + "step": 514 + }, + { + "epoch": 1.006, + "grad_norm": 0.16907107830047607, + "learning_rate": 4.841360332509663e-06, + "loss": 0.2708, + "step": 515 + }, + { + "epoch": 1.008, + "grad_norm": 0.22199520468711853, + "learning_rate": 4.825502516487497e-06, + "loss": 0.3405, + "step": 516 + }, + { + "epoch": 1.01, + "grad_norm": 0.17370116710662842, + "learning_rate": 4.809646457653051e-06, + "loss": 0.2715, + "step": 517 + }, + { + "epoch": 1.012, + "grad_norm": 0.21842899918556213, + "learning_rate": 4.793792315676665e-06, + "loss": 0.1802, + "step": 518 + }, + { + "epoch": 1.014, + "grad_norm": 0.1792248785495758, + "learning_rate": 4.777940250209369e-06, + "loss": 0.1912, + "step": 519 + }, + { + "epoch": 1.016, + "grad_norm": 0.24431253969669342, + "learning_rate": 4.762090420881289e-06, + "loss": 0.3494, + "step": 520 + }, + { + "epoch": 1.018, + "grad_norm": 0.1893794983625412, + "learning_rate": 4.74624298730003e-06, + "loss": 0.246, + "step": 521 + }, + { + "epoch": 1.02, + "grad_norm": 0.29100745916366577, + "learning_rate": 4.7303981090490715e-06, + "loss": 0.4553, + "step": 522 + }, + { + "epoch": 1.022, + "grad_norm": 0.21313871443271637, + "learning_rate": 4.71455594568616e-06, + "loss": 0.3414, + "step": 523 + }, + { + "epoch": 1.024, + "grad_norm": 0.257988840341568, + "learning_rate": 4.6987166567417085e-06, + "loss": 0.3223, + "step": 524 + }, + { + "epoch": 1.026, + "grad_norm": 0.1500207781791687, + "learning_rate": 4.682880401717178e-06, + "loss": 0.2883, + "step": 525 + }, + { + "epoch": 1.028, + "grad_norm": 0.2195630818605423, + "learning_rate": 4.667047340083481e-06, + "loss": 0.4185, + "step": 526 + }, + { + "epoch": 1.03, + "grad_norm": 0.24663732945919037, + "learning_rate": 4.651217631279374e-06, + "loss": 0.312, + "step": 527 + }, + { + "epoch": 1.032, + "grad_norm": 0.23168163001537323, + "learning_rate": 4.635391434709847e-06, + "loss": 0.3826, + "step": 528 + }, + { + "epoch": 1.034, + "grad_norm": 0.20334544777870178, + "learning_rate": 4.619568909744524e-06, + "loss": 0.302, + "step": 529 + }, + { + "epoch": 1.036, + "grad_norm": 0.2471403032541275, + "learning_rate": 4.603750215716057e-06, + "loss": 0.3024, + "step": 530 + }, + { + "epoch": 1.038, + "grad_norm": 0.19385652244091034, + "learning_rate": 4.587935511918521e-06, + "loss": 0.2803, + "step": 531 + }, + { + "epoch": 1.04, + "grad_norm": 0.24697639048099518, + "learning_rate": 4.572124957605803e-06, + "loss": 0.4114, + "step": 532 + }, + { + "epoch": 1.042, + "grad_norm": 0.24823316931724548, + "learning_rate": 4.55631871199001e-06, + "loss": 0.3705, + "step": 533 + }, + { + "epoch": 1.044, + "grad_norm": 0.1970013827085495, + "learning_rate": 4.5405169342398634e-06, + "loss": 0.3608, + "step": 534 + }, + { + "epoch": 1.046, + "grad_norm": 0.20955346524715424, + "learning_rate": 4.524719783479088e-06, + "loss": 0.347, + "step": 535 + }, + { + "epoch": 1.048, + "grad_norm": 0.1911235898733139, + "learning_rate": 4.5089274187848144e-06, + "loss": 0.2342, + "step": 536 + }, + { + "epoch": 1.05, + "grad_norm": 0.22940923273563385, + "learning_rate": 4.493139999185984e-06, + "loss": 0.2803, + "step": 537 + }, + { + "epoch": 1.052, + "grad_norm": 0.24347023665905, + "learning_rate": 4.477357683661734e-06, + "loss": 0.3833, + "step": 538 + }, + { + "epoch": 1.054, + "grad_norm": 0.24687382578849792, + "learning_rate": 4.461580631139806e-06, + "loss": 0.3467, + "step": 539 + }, + { + "epoch": 1.056, + "grad_norm": 0.15779221057891846, + "learning_rate": 4.445809000494945e-06, + "loss": 0.2781, + "step": 540 + }, + { + "epoch": 1.058, + "grad_norm": 0.20665578544139862, + "learning_rate": 4.430042950547298e-06, + "loss": 0.4656, + "step": 541 + }, + { + "epoch": 1.06, + "grad_norm": 0.24457348883152008, + "learning_rate": 4.414282640060809e-06, + "loss": 0.2684, + "step": 542 + }, + { + "epoch": 1.062, + "grad_norm": 0.20804962515830994, + "learning_rate": 4.398528227741634e-06, + "loss": 0.3577, + "step": 543 + }, + { + "epoch": 1.064, + "grad_norm": 0.2586953043937683, + "learning_rate": 4.382779872236527e-06, + "loss": 0.3492, + "step": 544 + }, + { + "epoch": 1.066, + "grad_norm": 0.26488688588142395, + "learning_rate": 4.367037732131254e-06, + "loss": 0.3954, + "step": 545 + }, + { + "epoch": 1.068, + "grad_norm": 0.15630888938903809, + "learning_rate": 4.3513019659489916e-06, + "loss": 0.1673, + "step": 546 + }, + { + "epoch": 1.07, + "grad_norm": 0.15465758740901947, + "learning_rate": 4.33557273214873e-06, + "loss": 0.2532, + "step": 547 + }, + { + "epoch": 1.072, + "grad_norm": 0.25680503249168396, + "learning_rate": 4.319850189123681e-06, + "loss": 0.3065, + "step": 548 + }, + { + "epoch": 1.074, + "grad_norm": 0.24224849045276642, + "learning_rate": 4.304134495199675e-06, + "loss": 0.4157, + "step": 549 + }, + { + "epoch": 1.076, + "grad_norm": 0.1849289834499359, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.3611, + "step": 550 + }, + { + "epoch": 1.078, + "grad_norm": 0.2488396316766739, + "learning_rate": 4.272724287611684e-06, + "loss": 0.313, + "step": 551 + }, + { + "epoch": 1.08, + "grad_norm": 0.23535999655723572, + "learning_rate": 4.257030090248142e-06, + "loss": 0.3165, + "step": 552 + }, + { + "epoch": 1.082, + "grad_norm": 0.19105635583400726, + "learning_rate": 4.241343374583343e-06, + "loss": 0.2779, + "step": 553 + }, + { + "epoch": 1.084, + "grad_norm": 0.22108493745326996, + "learning_rate": 4.225664298582339e-06, + "loss": 0.3312, + "step": 554 + }, + { + "epoch": 1.086, + "grad_norm": 0.18127895891666412, + "learning_rate": 4.209993020133251e-06, + "loss": 0.2099, + "step": 555 + }, + { + "epoch": 1.088, + "grad_norm": 0.304030179977417, + "learning_rate": 4.194329697045681e-06, + "loss": 0.4397, + "step": 556 + }, + { + "epoch": 1.09, + "grad_norm": 0.16876006126403809, + "learning_rate": 4.178674487049116e-06, + "loss": 0.253, + "step": 557 + }, + { + "epoch": 1.092, + "grad_norm": 0.18693579733371735, + "learning_rate": 4.163027547791347e-06, + "loss": 0.2696, + "step": 558 + }, + { + "epoch": 1.094, + "grad_norm": 0.2209119349718094, + "learning_rate": 4.147389036836881e-06, + "loss": 0.2225, + "step": 559 + }, + { + "epoch": 1.096, + "grad_norm": 0.1712501347064972, + "learning_rate": 4.131759111665349e-06, + "loss": 0.2205, + "step": 560 + }, + { + "epoch": 1.098, + "grad_norm": 0.18427731096744537, + "learning_rate": 4.116137929669921e-06, + "loss": 0.2527, + "step": 561 + }, + { + "epoch": 1.1, + "grad_norm": 0.16298742592334747, + "learning_rate": 4.100525648155731e-06, + "loss": 0.2583, + "step": 562 + }, + { + "epoch": 1.102, + "grad_norm": 0.1921571046113968, + "learning_rate": 4.084922424338277e-06, + "loss": 0.2931, + "step": 563 + }, + { + "epoch": 1.104, + "grad_norm": 0.1696956604719162, + "learning_rate": 4.06932841534185e-06, + "loss": 0.2686, + "step": 564 + }, + { + "epoch": 1.106, + "grad_norm": 0.2463129460811615, + "learning_rate": 4.053743778197951e-06, + "loss": 0.301, + "step": 565 + }, + { + "epoch": 1.108, + "grad_norm": 0.15761299431324005, + "learning_rate": 4.038168669843698e-06, + "loss": 0.1756, + "step": 566 + }, + { + "epoch": 1.11, + "grad_norm": 0.1688557118177414, + "learning_rate": 4.02260324712026e-06, + "loss": 0.2969, + "step": 567 + }, + { + "epoch": 1.112, + "grad_norm": 0.21805354952812195, + "learning_rate": 4.007047666771274e-06, + "loss": 0.2739, + "step": 568 + }, + { + "epoch": 1.114, + "grad_norm": 0.17749401926994324, + "learning_rate": 3.991502085441259e-06, + "loss": 0.2698, + "step": 569 + }, + { + "epoch": 1.116, + "grad_norm": 0.2537892758846283, + "learning_rate": 3.975966659674048e-06, + "loss": 0.4131, + "step": 570 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 0.15672741830348969, + "learning_rate": 3.960441545911205e-06, + "loss": 0.2118, + "step": 571 + }, + { + "epoch": 1.12, + "grad_norm": 0.23960451781749725, + "learning_rate": 3.944926900490452e-06, + "loss": 0.2715, + "step": 572 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 0.17803031206130981, + "learning_rate": 3.929422879644099e-06, + "loss": 0.24, + "step": 573 + }, + { + "epoch": 1.124, + "grad_norm": 0.2676704525947571, + "learning_rate": 3.913929639497462e-06, + "loss": 0.3247, + "step": 574 + }, + { + "epoch": 1.126, + "grad_norm": 0.1522570550441742, + "learning_rate": 3.898447336067297e-06, + "loss": 0.2298, + "step": 575 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.23372875154018402, + "learning_rate": 3.882976125260229e-06, + "loss": 0.4375, + "step": 576 + }, + { + "epoch": 1.13, + "grad_norm": 0.3442481756210327, + "learning_rate": 3.867516162871177e-06, + "loss": 0.2883, + "step": 577 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 0.2335498332977295, + "learning_rate": 3.8520676045817945e-06, + "loss": 0.2602, + "step": 578 + }, + { + "epoch": 1.134, + "grad_norm": 0.29386457800865173, + "learning_rate": 3.8366306059588885e-06, + "loss": 0.3826, + "step": 579 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.18141314387321472, + "learning_rate": 3.821205322452863e-06, + "loss": 0.205, + "step": 580 + }, + { + "epoch": 1.138, + "grad_norm": 0.21235667169094086, + "learning_rate": 3.8057919093961554e-06, + "loss": 0.2511, + "step": 581 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.15281343460083008, + "learning_rate": 3.790390522001662e-06, + "loss": 0.1908, + "step": 582 + }, + { + "epoch": 1.142, + "grad_norm": 0.1883106231689453, + "learning_rate": 3.775001315361183e-06, + "loss": 0.2896, + "step": 583 + }, + { + "epoch": 1.144, + "grad_norm": 0.19878095388412476, + "learning_rate": 3.7596244444438577e-06, + "loss": 0.2847, + "step": 584 + }, + { + "epoch": 1.146, + "grad_norm": 0.18822817504405975, + "learning_rate": 3.7442600640946045e-06, + "loss": 0.3134, + "step": 585 + }, + { + "epoch": 1.148, + "grad_norm": 0.21552503108978271, + "learning_rate": 3.7289083290325668e-06, + "loss": 0.3323, + "step": 586 + }, + { + "epoch": 1.15, + "grad_norm": 0.25933748483657837, + "learning_rate": 3.7135693938495433e-06, + "loss": 0.3463, + "step": 587 + }, + { + "epoch": 1.152, + "grad_norm": 0.23867465555667877, + "learning_rate": 3.69824341300844e-06, + "loss": 0.3601, + "step": 588 + }, + { + "epoch": 1.154, + "grad_norm": 0.3167083263397217, + "learning_rate": 3.682930540841717e-06, + "loss": 0.4182, + "step": 589 + }, + { + "epoch": 1.156, + "grad_norm": 0.31397873163223267, + "learning_rate": 3.667630931549826e-06, + "loss": 0.3287, + "step": 590 + }, + { + "epoch": 1.158, + "grad_norm": 0.18764562904834747, + "learning_rate": 3.6523447391996613e-06, + "loss": 0.276, + "step": 591 + }, + { + "epoch": 1.16, + "grad_norm": 0.29411885142326355, + "learning_rate": 3.637072117723012e-06, + "loss": 0.3956, + "step": 592 + }, + { + "epoch": 1.162, + "grad_norm": 0.19027218222618103, + "learning_rate": 3.6218132209150047e-06, + "loss": 0.2753, + "step": 593 + }, + { + "epoch": 1.164, + "grad_norm": 0.20175009965896606, + "learning_rate": 3.606568202432562e-06, + "loss": 0.3459, + "step": 594 + }, + { + "epoch": 1.166, + "grad_norm": 0.2005695253610611, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.2125, + "step": 595 + }, + { + "epoch": 1.168, + "grad_norm": 0.22972247004508972, + "learning_rate": 3.5761204143717387e-06, + "loss": 0.2925, + "step": 596 + }, + { + "epoch": 1.17, + "grad_norm": 0.22252865135669708, + "learning_rate": 3.560917951402245e-06, + "loss": 0.3467, + "step": 597 + }, + { + "epoch": 1.172, + "grad_norm": 0.2404780089855194, + "learning_rate": 3.5457299799730047e-06, + "loss": 0.3268, + "step": 598 + }, + { + "epoch": 1.174, + "grad_norm": 0.24187296628952026, + "learning_rate": 3.5305566530267217e-06, + "loss": 0.3654, + "step": 599 + }, + { + "epoch": 1.176, + "grad_norm": 0.23365625739097595, + "learning_rate": 3.5153981233586277e-06, + "loss": 0.3168, + "step": 600 + }, + { + "epoch": 1.178, + "grad_norm": 0.20350268483161926, + "learning_rate": 3.5002545436149478e-06, + "loss": 0.2618, + "step": 601 + }, + { + "epoch": 1.18, + "grad_norm": 0.22084195911884308, + "learning_rate": 3.4851260662913643e-06, + "loss": 0.381, + "step": 602 + }, + { + "epoch": 1.182, + "grad_norm": 0.5043354630470276, + "learning_rate": 3.470012843731476e-06, + "loss": 0.426, + "step": 603 + }, + { + "epoch": 1.184, + "grad_norm": 0.23615571856498718, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.3891, + "step": 604 + }, + { + "epoch": 1.186, + "grad_norm": 0.1776285469532013, + "learning_rate": 3.439832771507565e-06, + "loss": 0.2032, + "step": 605 + }, + { + "epoch": 1.188, + "grad_norm": 0.23352046310901642, + "learning_rate": 3.4247662257565372e-06, + "loss": 0.2098, + "step": 606 + }, + { + "epoch": 1.19, + "grad_norm": 0.19145451486110687, + "learning_rate": 3.4097155425921256e-06, + "loss": 0.2612, + "step": 607 + }, + { + "epoch": 1.192, + "grad_norm": 0.19671331346035004, + "learning_rate": 3.394680873574546e-06, + "loss": 0.2941, + "step": 608 + }, + { + "epoch": 1.194, + "grad_norm": 0.2002706378698349, + "learning_rate": 3.3796623701027477e-06, + "loss": 0.1828, + "step": 609 + }, + { + "epoch": 1.196, + "grad_norm": 0.23058104515075684, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.2983, + "step": 610 + }, + { + "epoch": 1.198, + "grad_norm": 0.13006491959095, + "learning_rate": 3.349674464576834e-06, + "loss": 0.1306, + "step": 611 + }, + { + "epoch": 1.2, + "grad_norm": 0.29587817192077637, + "learning_rate": 3.3347053645005965e-06, + "loss": 0.3542, + "step": 612 + }, + { + "epoch": 1.202, + "grad_norm": 0.23100513219833374, + "learning_rate": 3.319753033922849e-06, + "loss": 0.4051, + "step": 613 + }, + { + "epoch": 1.204, + "grad_norm": 0.24775229394435883, + "learning_rate": 3.3048176234133967e-06, + "loss": 0.2378, + "step": 614 + }, + { + "epoch": 1.206, + "grad_norm": 0.18648101389408112, + "learning_rate": 3.289899283371657e-06, + "loss": 0.2141, + "step": 615 + }, + { + "epoch": 1.208, + "grad_norm": 0.24682392179965973, + "learning_rate": 3.274998164025148e-06, + "loss": 0.3123, + "step": 616 + }, + { + "epoch": 1.21, + "grad_norm": 0.25237175822257996, + "learning_rate": 3.260114415427975e-06, + "loss": 0.4471, + "step": 617 + }, + { + "epoch": 1.212, + "grad_norm": 0.20262058079242706, + "learning_rate": 3.2452481874593234e-06, + "loss": 0.2694, + "step": 618 + }, + { + "epoch": 1.214, + "grad_norm": 0.23342056572437286, + "learning_rate": 3.230399629821942e-06, + "loss": 0.3093, + "step": 619 + }, + { + "epoch": 1.216, + "grad_norm": 0.17575059831142426, + "learning_rate": 3.2155688920406415e-06, + "loss": 0.2923, + "step": 620 + }, + { + "epoch": 1.218, + "grad_norm": 0.2357223480939865, + "learning_rate": 3.200756123460788e-06, + "loss": 0.3569, + "step": 621 + }, + { + "epoch": 1.22, + "grad_norm": 0.3179761469364166, + "learning_rate": 3.1859614732467957e-06, + "loss": 0.4442, + "step": 622 + }, + { + "epoch": 1.222, + "grad_norm": 0.28770139813423157, + "learning_rate": 3.171185090380628e-06, + "loss": 0.3325, + "step": 623 + }, + { + "epoch": 1.224, + "grad_norm": 0.18547223508358002, + "learning_rate": 3.156427123660297e-06, + "loss": 0.2269, + "step": 624 + }, + { + "epoch": 1.226, + "grad_norm": 0.21385949850082397, + "learning_rate": 3.141687721698363e-06, + "loss": 0.2615, + "step": 625 + }, + { + "epoch": 1.226, + "eval_loss": 0.2700715959072113, + "eval_runtime": 76.6157, + "eval_samples_per_second": 7.205, + "eval_steps_per_second": 0.901, + "step": 625 + }, + { + "epoch": 1.228, + "grad_norm": 0.3386872708797455, + "learning_rate": 3.12696703292044e-06, + "loss": 0.3519, + "step": 626 + }, + { + "epoch": 1.23, + "grad_norm": 0.19794243574142456, + "learning_rate": 3.1122652055637014e-06, + "loss": 0.2581, + "step": 627 + }, + { + "epoch": 1.232, + "grad_norm": 0.1912515014410019, + "learning_rate": 3.097582387675385e-06, + "loss": 0.3286, + "step": 628 + }, + { + "epoch": 1.234, + "grad_norm": 0.18073877692222595, + "learning_rate": 3.0829187271113035e-06, + "loss": 0.2411, + "step": 629 + }, + { + "epoch": 1.236, + "grad_norm": 0.24173890054225922, + "learning_rate": 3.0682743715343565e-06, + "loss": 0.3853, + "step": 630 + }, + { + "epoch": 1.238, + "grad_norm": 0.17611730098724365, + "learning_rate": 3.053649468413043e-06, + "loss": 0.1971, + "step": 631 + }, + { + "epoch": 1.24, + "grad_norm": 0.22723500430583954, + "learning_rate": 3.0390441650199727e-06, + "loss": 0.2852, + "step": 632 + }, + { + "epoch": 1.242, + "grad_norm": 0.2124418169260025, + "learning_rate": 3.0244586084303908e-06, + "loss": 0.329, + "step": 633 + }, + { + "epoch": 1.244, + "grad_norm": 0.24569527804851532, + "learning_rate": 3.0098929455206905e-06, + "loss": 0.4141, + "step": 634 + }, + { + "epoch": 1.246, + "grad_norm": 0.2651529312133789, + "learning_rate": 2.995347322966933e-06, + "loss": 0.2759, + "step": 635 + }, + { + "epoch": 1.248, + "grad_norm": 0.3110187351703644, + "learning_rate": 2.980821887243377e-06, + "loss": 0.3405, + "step": 636 + }, + { + "epoch": 1.25, + "grad_norm": 0.23818974196910858, + "learning_rate": 2.966316784621e-06, + "loss": 0.2185, + "step": 637 + }, + { + "epoch": 1.252, + "grad_norm": 0.32177677750587463, + "learning_rate": 2.951832161166024e-06, + "loss": 0.4972, + "step": 638 + }, + { + "epoch": 1.254, + "grad_norm": 0.21647526323795319, + "learning_rate": 2.937368162738445e-06, + "loss": 0.4215, + "step": 639 + }, + { + "epoch": 1.256, + "grad_norm": 0.1766624003648758, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.2439, + "step": 640 + }, + { + "epoch": 1.258, + "grad_norm": 0.34441429376602173, + "learning_rate": 2.9085026233655367e-06, + "loss": 0.4078, + "step": 641 + }, + { + "epoch": 1.26, + "grad_norm": 0.30576056241989136, + "learning_rate": 2.8941013730958674e-06, + "loss": 0.4071, + "step": 642 + }, + { + "epoch": 1.262, + "grad_norm": 0.22246578335762024, + "learning_rate": 2.8797213292019927e-06, + "loss": 0.3456, + "step": 643 + }, + { + "epoch": 1.264, + "grad_norm": 0.21253855526447296, + "learning_rate": 2.8653626364907918e-06, + "loss": 0.2257, + "step": 644 + }, + { + "epoch": 1.266, + "grad_norm": 0.22427724301815033, + "learning_rate": 2.851025439554142e-06, + "loss": 0.298, + "step": 645 + }, + { + "epoch": 1.268, + "grad_norm": 0.19472835958003998, + "learning_rate": 2.8367098827674575e-06, + "loss": 0.3093, + "step": 646 + }, + { + "epoch": 1.27, + "grad_norm": 0.19399920105934143, + "learning_rate": 2.82241611028824e-06, + "loss": 0.2254, + "step": 647 + }, + { + "epoch": 1.272, + "grad_norm": 0.23820382356643677, + "learning_rate": 2.8081442660546126e-06, + "loss": 0.2909, + "step": 648 + }, + { + "epoch": 1.274, + "grad_norm": 0.1856381893157959, + "learning_rate": 2.7938944937838924e-06, + "loss": 0.2367, + "step": 649 + }, + { + "epoch": 1.276, + "grad_norm": 0.16763170063495636, + "learning_rate": 2.7796669369711294e-06, + "loss": 0.1991, + "step": 650 + }, + { + "epoch": 1.278, + "grad_norm": 0.25936460494995117, + "learning_rate": 2.7654617388876612e-06, + "loss": 0.3244, + "step": 651 + }, + { + "epoch": 1.28, + "grad_norm": 0.37680599093437195, + "learning_rate": 2.751279042579672e-06, + "loss": 0.409, + "step": 652 + }, + { + "epoch": 1.282, + "grad_norm": 0.2094666063785553, + "learning_rate": 2.7371189908667604e-06, + "loss": 0.3523, + "step": 653 + }, + { + "epoch": 1.284, + "grad_norm": 0.25615018606185913, + "learning_rate": 2.722981726340487e-06, + "loss": 0.3496, + "step": 654 + }, + { + "epoch": 1.286, + "grad_norm": 0.2155938446521759, + "learning_rate": 2.708867391362948e-06, + "loss": 0.2099, + "step": 655 + }, + { + "epoch": 1.288, + "grad_norm": 0.2571382522583008, + "learning_rate": 2.694776128065345e-06, + "loss": 0.2505, + "step": 656 + }, + { + "epoch": 1.29, + "grad_norm": 0.25513583421707153, + "learning_rate": 2.6807080783465376e-06, + "loss": 0.3528, + "step": 657 + }, + { + "epoch": 1.292, + "grad_norm": 0.21190734207630157, + "learning_rate": 2.6666633838716317e-06, + "loss": 0.3892, + "step": 658 + }, + { + "epoch": 1.294, + "grad_norm": 0.2990153133869171, + "learning_rate": 2.6526421860705474e-06, + "loss": 0.3916, + "step": 659 + }, + { + "epoch": 1.296, + "grad_norm": 0.22129324078559875, + "learning_rate": 2.6386446261365874e-06, + "loss": 0.2596, + "step": 660 + }, + { + "epoch": 1.298, + "grad_norm": 0.2187465876340866, + "learning_rate": 2.6246708450250256e-06, + "loss": 0.3962, + "step": 661 + }, + { + "epoch": 1.3, + "grad_norm": 0.17136049270629883, + "learning_rate": 2.6107209834516857e-06, + "loss": 0.3483, + "step": 662 + }, + { + "epoch": 1.302, + "grad_norm": 0.25110378861427307, + "learning_rate": 2.5967951818915137e-06, + "loss": 0.4098, + "step": 663 + }, + { + "epoch": 1.304, + "grad_norm": 0.3335612118244171, + "learning_rate": 2.5828935805771804e-06, + "loss": 0.3407, + "step": 664 + }, + { + "epoch": 1.306, + "grad_norm": 0.23392237722873688, + "learning_rate": 2.5690163194976576e-06, + "loss": 0.3893, + "step": 665 + }, + { + "epoch": 1.308, + "grad_norm": 0.21025826036930084, + "learning_rate": 2.5551635383968063e-06, + "loss": 0.3047, + "step": 666 + }, + { + "epoch": 1.31, + "grad_norm": 0.20678383111953735, + "learning_rate": 2.5413353767719805e-06, + "loss": 0.3068, + "step": 667 + }, + { + "epoch": 1.312, + "grad_norm": 0.255937397480011, + "learning_rate": 2.527531973872617e-06, + "loss": 0.2963, + "step": 668 + }, + { + "epoch": 1.314, + "grad_norm": 0.3448125422000885, + "learning_rate": 2.5137534686988265e-06, + "loss": 0.3944, + "step": 669 + }, + { + "epoch": 1.316, + "grad_norm": 0.21276655793190002, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2955, + "step": 670 + }, + { + "epoch": 1.318, + "grad_norm": 0.2522459030151367, + "learning_rate": 2.486271706273421e-06, + "loss": 0.3536, + "step": 671 + }, + { + "epoch": 1.32, + "grad_norm": 0.2182285189628601, + "learning_rate": 2.4725687257628533e-06, + "loss": 0.3541, + "step": 672 + }, + { + "epoch": 1.322, + "grad_norm": 0.30204272270202637, + "learning_rate": 2.4588911964571557e-06, + "loss": 0.268, + "step": 673 + }, + { + "epoch": 1.324, + "grad_norm": 0.27727144956588745, + "learning_rate": 2.445239256088898e-06, + "loss": 0.3061, + "step": 674 + }, + { + "epoch": 1.326, + "grad_norm": 0.22263972461223602, + "learning_rate": 2.4316130421329696e-06, + "loss": 0.3317, + "step": 675 + }, + { + "epoch": 1.328, + "grad_norm": 0.23461495339870453, + "learning_rate": 2.418012691805191e-06, + "loss": 0.3153, + "step": 676 + }, + { + "epoch": 1.33, + "grad_norm": 0.1453184336423874, + "learning_rate": 2.404438342060941e-06, + "loss": 0.1933, + "step": 677 + }, + { + "epoch": 1.332, + "grad_norm": 0.20232437551021576, + "learning_rate": 2.3908901295937713e-06, + "loss": 0.1941, + "step": 678 + }, + { + "epoch": 1.334, + "grad_norm": 0.23894034326076508, + "learning_rate": 2.3773681908340284e-06, + "loss": 0.3198, + "step": 679 + }, + { + "epoch": 1.336, + "grad_norm": 0.3079819977283478, + "learning_rate": 2.363872661947488e-06, + "loss": 0.3761, + "step": 680 + }, + { + "epoch": 1.338, + "grad_norm": 0.20794443786144257, + "learning_rate": 2.3504036788339763e-06, + "loss": 0.3837, + "step": 681 + }, + { + "epoch": 1.34, + "grad_norm": 0.2881450057029724, + "learning_rate": 2.3369613771260006e-06, + "loss": 0.2904, + "step": 682 + }, + { + "epoch": 1.342, + "grad_norm": 0.20050355792045593, + "learning_rate": 2.323545892187393e-06, + "loss": 0.2323, + "step": 683 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.22167599201202393, + "learning_rate": 2.310157359111938e-06, + "loss": 0.2501, + "step": 684 + }, + { + "epoch": 1.346, + "grad_norm": 0.29652273654937744, + "learning_rate": 2.296795912722014e-06, + "loss": 0.3702, + "step": 685 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.20178988575935364, + "learning_rate": 2.2834616875672362e-06, + "loss": 0.2581, + "step": 686 + }, + { + "epoch": 1.35, + "grad_norm": 0.25368136167526245, + "learning_rate": 2.2701548179231048e-06, + "loss": 0.3034, + "step": 687 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.20186640322208405, + "learning_rate": 2.2568754377896516e-06, + "loss": 0.2991, + "step": 688 + }, + { + "epoch": 1.354, + "grad_norm": 0.2289544939994812, + "learning_rate": 2.2436236808900846e-06, + "loss": 0.3188, + "step": 689 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.2351309210062027, + "learning_rate": 2.230399680669449e-06, + "loss": 0.2942, + "step": 690 + }, + { + "epoch": 1.358, + "grad_norm": 0.19411875307559967, + "learning_rate": 2.2172035702932828e-06, + "loss": 0.3415, + "step": 691 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.23344936966896057, + "learning_rate": 2.204035482646267e-06, + "loss": 0.2904, + "step": 692 + }, + { + "epoch": 1.362, + "grad_norm": 0.17623913288116455, + "learning_rate": 2.190895550330899e-06, + "loss": 0.1493, + "step": 693 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 0.22438128292560577, + "learning_rate": 2.1777839056661555e-06, + "loss": 0.3669, + "step": 694 + }, + { + "epoch": 1.366, + "grad_norm": 0.25720444321632385, + "learning_rate": 2.1647006806861472e-06, + "loss": 0.4394, + "step": 695 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.17176856100559235, + "learning_rate": 2.1516460071388062e-06, + "loss": 0.2309, + "step": 696 + }, + { + "epoch": 1.37, + "grad_norm": 0.26110807061195374, + "learning_rate": 2.1386200164845527e-06, + "loss": 0.4329, + "step": 697 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 0.24240969121456146, + "learning_rate": 2.125622839894964e-06, + "loss": 0.2596, + "step": 698 + }, + { + "epoch": 1.374, + "grad_norm": 0.202704519033432, + "learning_rate": 2.1126546082514665e-06, + "loss": 0.2737, + "step": 699 + }, + { + "epoch": 1.376, + "grad_norm": 0.20342108607292175, + "learning_rate": 2.09971545214401e-06, + "loss": 0.2692, + "step": 700 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 0.3197811543941498, + "learning_rate": 2.086805501869749e-06, + "loss": 0.3117, + "step": 701 + }, + { + "epoch": 1.38, + "grad_norm": 0.29925206303596497, + "learning_rate": 2.073924887431744e-06, + "loss": 0.2391, + "step": 702 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 0.2412380427122116, + "learning_rate": 2.061073738537635e-06, + "loss": 0.2434, + "step": 703 + }, + { + "epoch": 1.384, + "grad_norm": 0.25253570079803467, + "learning_rate": 2.0482521845983522e-06, + "loss": 0.3284, + "step": 704 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 0.18548652529716492, + "learning_rate": 2.0354603547267985e-06, + "loss": 0.2562, + "step": 705 + }, + { + "epoch": 1.388, + "grad_norm": 0.2307010442018509, + "learning_rate": 2.0226983777365604e-06, + "loss": 0.2445, + "step": 706 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 0.1840142160654068, + "learning_rate": 2.009966382140606e-06, + "loss": 0.3521, + "step": 707 + }, + { + "epoch": 1.392, + "grad_norm": 0.2078990340232849, + "learning_rate": 1.9972644961499853e-06, + "loss": 0.2887, + "step": 708 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 0.20442235469818115, + "learning_rate": 1.9845928476725522e-06, + "loss": 0.3453, + "step": 709 + }, + { + "epoch": 1.396, + "grad_norm": 0.1933489441871643, + "learning_rate": 1.971951564311668e-06, + "loss": 0.3581, + "step": 710 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 0.19691258668899536, + "learning_rate": 1.959340773364911e-06, + "loss": 0.2933, + "step": 711 + }, + { + "epoch": 1.4, + "grad_norm": 0.1842382252216339, + "learning_rate": 1.946760601822809e-06, + "loss": 0.2894, + "step": 712 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 0.35139110684394836, + "learning_rate": 1.9342111763675512e-06, + "loss": 0.3405, + "step": 713 + }, + { + "epoch": 1.404, + "grad_norm": 0.19070106744766235, + "learning_rate": 1.9216926233717087e-06, + "loss": 0.213, + "step": 714 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 0.20061296224594116, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.2858, + "step": 715 + }, + { + "epoch": 1.408, + "grad_norm": 0.30167287588119507, + "learning_rate": 1.8967486386928819e-06, + "loss": 0.4004, + "step": 716 + }, + { + "epoch": 1.41, + "grad_norm": 0.21128444373607635, + "learning_rate": 1.8843234581955444e-06, + "loss": 0.2326, + "step": 717 + }, + { + "epoch": 1.412, + "grad_norm": 0.23791776597499847, + "learning_rate": 1.8719296525263925e-06, + "loss": 0.2337, + "step": 718 + }, + { + "epoch": 1.414, + "grad_norm": 0.27308812737464905, + "learning_rate": 1.859567346490913e-06, + "loss": 0.2667, + "step": 719 + }, + { + "epoch": 1.416, + "grad_norm": 0.19012384116649628, + "learning_rate": 1.8472366645773892e-06, + "loss": 0.2042, + "step": 720 + }, + { + "epoch": 1.418, + "grad_norm": 0.2819920480251312, + "learning_rate": 1.8349377309556487e-06, + "loss": 0.3546, + "step": 721 + }, + { + "epoch": 1.42, + "grad_norm": 0.16963627934455872, + "learning_rate": 1.8226706694758194e-06, + "loss": 0.2087, + "step": 722 + }, + { + "epoch": 1.422, + "grad_norm": 0.222882941365242, + "learning_rate": 1.810435603667075e-06, + "loss": 0.3519, + "step": 723 + }, + { + "epoch": 1.424, + "grad_norm": 0.200264573097229, + "learning_rate": 1.798232656736389e-06, + "loss": 0.2172, + "step": 724 + }, + { + "epoch": 1.426, + "grad_norm": 0.25277942419052124, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.3984, + "step": 725 + }, + { + "epoch": 1.428, + "grad_norm": 0.24608227610588074, + "learning_rate": 1.7739236107186858e-06, + "loss": 0.2575, + "step": 726 + }, + { + "epoch": 1.43, + "grad_norm": 0.30379989743232727, + "learning_rate": 1.7618177564234907e-06, + "loss": 0.2949, + "step": 727 + }, + { + "epoch": 1.432, + "grad_norm": 0.15659303963184357, + "learning_rate": 1.7497445105875377e-06, + "loss": 0.1913, + "step": 728 + }, + { + "epoch": 1.434, + "grad_norm": 0.2043537199497223, + "learning_rate": 1.7377039947882802e-06, + "loss": 0.2716, + "step": 729 + }, + { + "epoch": 1.436, + "grad_norm": 0.20367324352264404, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.2358, + "step": 730 + }, + { + "epoch": 1.438, + "grad_norm": 0.28134340047836304, + "learning_rate": 1.7137216379604727e-06, + "loss": 0.2814, + "step": 731 + }, + { + "epoch": 1.44, + "grad_norm": 0.2837545871734619, + "learning_rate": 1.7017800384339928e-06, + "loss": 0.3792, + "step": 732 + }, + { + "epoch": 1.442, + "grad_norm": 0.22841040790081024, + "learning_rate": 1.6898716519459074e-06, + "loss": 0.2819, + "step": 733 + }, + { + "epoch": 1.444, + "grad_norm": 0.21164868772029877, + "learning_rate": 1.6779965984135376e-06, + "loss": 0.2676, + "step": 734 + }, + { + "epoch": 1.446, + "grad_norm": 0.2656158208847046, + "learning_rate": 1.6661549974185426e-06, + "loss": 0.284, + "step": 735 + }, + { + "epoch": 1.448, + "grad_norm": 0.2675846815109253, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.3098, + "step": 736 + }, + { + "epoch": 1.45, + "grad_norm": 0.2900715172290802, + "learning_rate": 1.6425726296817634e-06, + "loss": 0.3378, + "step": 737 + }, + { + "epoch": 1.452, + "grad_norm": 0.27534744143486023, + "learning_rate": 1.6308321004141609e-06, + "loss": 0.3497, + "step": 738 + }, + { + "epoch": 1.454, + "grad_norm": 0.30499523878097534, + "learning_rate": 1.6191254986299044e-06, + "loss": 0.3271, + "step": 739 + }, + { + "epoch": 1.456, + "grad_norm": 0.1775362193584442, + "learning_rate": 1.6074529422143398e-06, + "loss": 0.1754, + "step": 740 + }, + { + "epoch": 1.458, + "grad_norm": 0.25734683871269226, + "learning_rate": 1.5958145487099829e-06, + "loss": 0.3568, + "step": 741 + }, + { + "epoch": 1.46, + "grad_norm": 0.22716552019119263, + "learning_rate": 1.5842104353153286e-06, + "loss": 0.2856, + "step": 742 + }, + { + "epoch": 1.462, + "grad_norm": 0.2042451947927475, + "learning_rate": 1.5726407188836672e-06, + "loss": 0.2623, + "step": 743 + }, + { + "epoch": 1.464, + "grad_norm": 0.26923978328704834, + "learning_rate": 1.561105515921915e-06, + "loss": 0.4326, + "step": 744 + }, + { + "epoch": 1.466, + "grad_norm": 0.22442659735679626, + "learning_rate": 1.549604942589441e-06, + "loss": 0.2867, + "step": 745 + }, + { + "epoch": 1.468, + "grad_norm": 0.16880613565444946, + "learning_rate": 1.5381391146968866e-06, + "loss": 0.1821, + "step": 746 + }, + { + "epoch": 1.47, + "grad_norm": 0.24349483847618103, + "learning_rate": 1.5267081477050132e-06, + "loss": 0.2753, + "step": 747 + }, + { + "epoch": 1.472, + "grad_norm": 0.27072674036026, + "learning_rate": 1.5153121567235334e-06, + "loss": 0.2222, + "step": 748 + }, + { + "epoch": 1.474, + "grad_norm": 0.291255921125412, + "learning_rate": 1.5039512565099468e-06, + "loss": 0.3485, + "step": 749 + }, + { + "epoch": 1.476, + "grad_norm": 0.20078301429748535, + "learning_rate": 1.4926255614683931e-06, + "loss": 0.2959, + "step": 750 + }, + { + "epoch": 1.476, + "eval_loss": 0.2654268741607666, + "eval_runtime": 76.2376, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.905, + "step": 750 + }, + { + "epoch": 1.478, + "grad_norm": 0.2795911431312561, + "learning_rate": 1.4813351856484981e-06, + "loss": 0.1859, + "step": 751 + }, + { + "epoch": 1.48, + "grad_norm": 0.35663336515426636, + "learning_rate": 1.470080242744218e-06, + "loss": 0.3358, + "step": 752 + }, + { + "epoch": 1.482, + "grad_norm": 0.23237483203411102, + "learning_rate": 1.458860846092705e-06, + "loss": 0.2874, + "step": 753 + }, + { + "epoch": 1.484, + "grad_norm": 0.19958510994911194, + "learning_rate": 1.4476771086731567e-06, + "loss": 0.3507, + "step": 754 + }, + { + "epoch": 1.486, + "grad_norm": 0.22077733278274536, + "learning_rate": 1.4365291431056871e-06, + "loss": 0.3085, + "step": 755 + }, + { + "epoch": 1.488, + "grad_norm": 0.31041693687438965, + "learning_rate": 1.4254170616501828e-06, + "loss": 0.3724, + "step": 756 + }, + { + "epoch": 1.49, + "grad_norm": 0.18345925211906433, + "learning_rate": 1.4143409762051829e-06, + "loss": 0.1957, + "step": 757 + }, + { + "epoch": 1.492, + "grad_norm": 0.1973162293434143, + "learning_rate": 1.4033009983067454e-06, + "loss": 0.2304, + "step": 758 + }, + { + "epoch": 1.494, + "grad_norm": 0.2636561095714569, + "learning_rate": 1.3922972391273226e-06, + "loss": 0.3215, + "step": 759 + }, + { + "epoch": 1.496, + "grad_norm": 0.22231453657150269, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.2346, + "step": 760 + }, + { + "epoch": 1.498, + "grad_norm": 0.21096548438072205, + "learning_rate": 1.3703988197906209e-06, + "loss": 0.297, + "step": 761 + }, + { + "epoch": 1.5, + "grad_norm": 0.29171353578567505, + "learning_rate": 1.3595043801501794e-06, + "loss": 0.362, + "step": 762 + }, + { + "epoch": 1.502, + "grad_norm": 0.2302405834197998, + "learning_rate": 1.3486466002602133e-06, + "loss": 0.3468, + "step": 763 + }, + { + "epoch": 1.504, + "grad_norm": 0.1669236272573471, + "learning_rate": 1.3378255894584463e-06, + "loss": 0.2525, + "step": 764 + }, + { + "epoch": 1.506, + "grad_norm": 0.22917306423187256, + "learning_rate": 1.3270414567123342e-06, + "loss": 0.34, + "step": 765 + }, + { + "epoch": 1.508, + "grad_norm": 0.22837324440479279, + "learning_rate": 1.3162943106179748e-06, + "loss": 0.516, + "step": 766 + }, + { + "epoch": 1.51, + "grad_norm": 0.1973070204257965, + "learning_rate": 1.305584259399013e-06, + "loss": 0.2083, + "step": 767 + }, + { + "epoch": 1.512, + "grad_norm": 0.25936761498451233, + "learning_rate": 1.2949114109055417e-06, + "loss": 0.4483, + "step": 768 + }, + { + "epoch": 1.514, + "grad_norm": 0.23405812680721283, + "learning_rate": 1.2842758726130283e-06, + "loss": 0.3334, + "step": 769 + }, + { + "epoch": 1.516, + "grad_norm": 0.2227783501148224, + "learning_rate": 1.2736777516212267e-06, + "loss": 0.3724, + "step": 770 + }, + { + "epoch": 1.518, + "grad_norm": 0.23398268222808838, + "learning_rate": 1.263117154653097e-06, + "loss": 0.2008, + "step": 771 + }, + { + "epoch": 1.52, + "grad_norm": 0.16665144264698029, + "learning_rate": 1.2525941880537307e-06, + "loss": 0.2177, + "step": 772 + }, + { + "epoch": 1.522, + "grad_norm": 0.21703177690505981, + "learning_rate": 1.242108957789287e-06, + "loss": 0.2668, + "step": 773 + }, + { + "epoch": 1.524, + "grad_norm": 0.3440599739551544, + "learning_rate": 1.2316615694459188e-06, + "loss": 0.3352, + "step": 774 + }, + { + "epoch": 1.526, + "grad_norm": 0.2005206048488617, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.2719, + "step": 775 + }, + { + "epoch": 1.528, + "grad_norm": 0.2054724395275116, + "learning_rate": 1.210880738960616e-06, + "loss": 0.3181, + "step": 776 + }, + { + "epoch": 1.53, + "grad_norm": 0.2903349995613098, + "learning_rate": 1.200547506081416e-06, + "loss": 0.3382, + "step": 777 + }, + { + "epoch": 1.532, + "grad_norm": 0.22862407565116882, + "learning_rate": 1.1902525336466465e-06, + "loss": 0.2544, + "step": 778 + }, + { + "epoch": 1.534, + "grad_norm": 0.20812873542308807, + "learning_rate": 1.1799959253265668e-06, + "loss": 0.3118, + "step": 779 + }, + { + "epoch": 1.536, + "grad_norm": 0.2820591330528259, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3646, + "step": 780 + }, + { + "epoch": 1.538, + "grad_norm": 0.21943072974681854, + "learning_rate": 1.1595982137788403e-06, + "loss": 0.1957, + "step": 781 + }, + { + "epoch": 1.54, + "grad_norm": 0.1949055939912796, + "learning_rate": 1.1494573159559214e-06, + "loss": 0.253, + "step": 782 + }, + { + "epoch": 1.542, + "grad_norm": 0.20829080045223236, + "learning_rate": 1.1393551930550828e-06, + "loss": 0.2558, + "step": 783 + }, + { + "epoch": 1.544, + "grad_norm": 0.20741114020347595, + "learning_rate": 1.1292919468045876e-06, + "loss": 0.2221, + "step": 784 + }, + { + "epoch": 1.546, + "grad_norm": 0.24327073991298676, + "learning_rate": 1.1192676785412154e-06, + "loss": 0.2616, + "step": 785 + }, + { + "epoch": 1.548, + "grad_norm": 0.2541949152946472, + "learning_rate": 1.1092824892092375e-06, + "loss": 0.2435, + "step": 786 + }, + { + "epoch": 1.55, + "grad_norm": 0.2096426635980606, + "learning_rate": 1.099336479359398e-06, + "loss": 0.2448, + "step": 787 + }, + { + "epoch": 1.552, + "grad_norm": 0.24535740911960602, + "learning_rate": 1.0894297491479044e-06, + "loss": 0.2892, + "step": 788 + }, + { + "epoch": 1.554, + "grad_norm": 0.2067105919122696, + "learning_rate": 1.0795623983354214e-06, + "loss": 0.2584, + "step": 789 + }, + { + "epoch": 1.556, + "grad_norm": 0.2478252500295639, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.3474, + "step": 790 + }, + { + "epoch": 1.558, + "grad_norm": 0.17269453406333923, + "learning_rate": 1.0599462319663906e-06, + "loss": 0.2793, + "step": 791 + }, + { + "epoch": 1.56, + "grad_norm": 0.2102997750043869, + "learning_rate": 1.0501976139444191e-06, + "loss": 0.3124, + "step": 792 + }, + { + "epoch": 1.562, + "grad_norm": 0.29494714736938477, + "learning_rate": 1.0404887703886252e-06, + "loss": 0.2693, + "step": 793 + }, + { + "epoch": 1.564, + "grad_norm": 0.19094854593276978, + "learning_rate": 1.0308197990669538e-06, + "loss": 0.3593, + "step": 794 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 0.20082080364227295, + "learning_rate": 1.0211907973458391e-06, + "loss": 0.2296, + "step": 795 + }, + { + "epoch": 1.568, + "grad_norm": 0.24483440816402435, + "learning_rate": 1.0116018621892237e-06, + "loss": 0.344, + "step": 796 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 0.21700353920459747, + "learning_rate": 1.0020530901575754e-06, + "loss": 0.2562, + "step": 797 + }, + { + "epoch": 1.572, + "grad_norm": 0.18885864317417145, + "learning_rate": 9.925445774069232e-07, + "loss": 0.2155, + "step": 798 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 0.2546456754207611, + "learning_rate": 9.830764196878872e-07, + "loss": 0.3539, + "step": 799 + }, + { + "epoch": 1.576, + "grad_norm": 0.20347674190998077, + "learning_rate": 9.73648712344707e-07, + "loss": 0.2864, + "step": 800 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 0.3315930962562561, + "learning_rate": 9.642615503142927e-07, + "loss": 0.3753, + "step": 801 + }, + { + "epoch": 1.58, + "grad_norm": 0.18244577944278717, + "learning_rate": 9.549150281252633e-07, + "loss": 0.2116, + "step": 802 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 0.24047374725341797, + "learning_rate": 9.456092398969902e-07, + "loss": 0.3352, + "step": 803 + }, + { + "epoch": 1.584, + "grad_norm": 0.2712211012840271, + "learning_rate": 9.363442793386606e-07, + "loss": 0.4647, + "step": 804 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 0.15284787118434906, + "learning_rate": 9.271202397483214e-07, + "loss": 0.2296, + "step": 805 + }, + { + "epoch": 1.588, + "grad_norm": 0.2665194571018219, + "learning_rate": 9.179372140119524e-07, + "loss": 0.353, + "step": 806 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 0.2965538799762726, + "learning_rate": 9.087952946025175e-07, + "loss": 0.2863, + "step": 807 + }, + { + "epoch": 1.592, + "grad_norm": 0.19379866123199463, + "learning_rate": 8.996945735790447e-07, + "loss": 0.3056, + "step": 808 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 0.2339809238910675, + "learning_rate": 8.906351425856952e-07, + "loss": 0.3741, + "step": 809 + }, + { + "epoch": 1.596, + "grad_norm": 0.2753208577632904, + "learning_rate": 8.816170928508367e-07, + "loss": 0.2715, + "step": 810 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 0.2367635816335678, + "learning_rate": 8.7264051518613e-07, + "loss": 0.3268, + "step": 811 + }, + { + "epoch": 1.6, + "grad_norm": 0.2004977911710739, + "learning_rate": 8.637054999856148e-07, + "loss": 0.2217, + "step": 812 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 0.3549105226993561, + "learning_rate": 8.54812137224792e-07, + "loss": 0.3371, + "step": 813 + }, + { + "epoch": 1.604, + "grad_norm": 0.27921661734580994, + "learning_rate": 8.459605164597268e-07, + "loss": 0.3983, + "step": 814 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 0.2014499306678772, + "learning_rate": 8.371507268261436e-07, + "loss": 0.2413, + "step": 815 + }, + { + "epoch": 1.608, + "grad_norm": 0.20690080523490906, + "learning_rate": 8.283828570385239e-07, + "loss": 0.2012, + "step": 816 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 0.21998871862888336, + "learning_rate": 8.196569953892202e-07, + "loss": 0.3298, + "step": 817 + }, + { + "epoch": 1.612, + "grad_norm": 0.3980468511581421, + "learning_rate": 8.109732297475637e-07, + "loss": 0.3194, + "step": 818 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 0.20355728268623352, + "learning_rate": 8.023316475589754e-07, + "loss": 0.1823, + "step": 819 + }, + { + "epoch": 1.616, + "grad_norm": 0.17916588485240936, + "learning_rate": 7.937323358440935e-07, + "loss": 0.2189, + "step": 820 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 0.3024926781654358, + "learning_rate": 7.851753811978924e-07, + "loss": 0.3149, + "step": 821 + }, + { + "epoch": 1.62, + "grad_norm": 0.20770519971847534, + "learning_rate": 7.766608697888095e-07, + "loss": 0.2967, + "step": 822 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 0.2985385060310364, + "learning_rate": 7.681888873578786e-07, + "loss": 0.3245, + "step": 823 + }, + { + "epoch": 1.624, + "grad_norm": 0.238825723528862, + "learning_rate": 7.597595192178702e-07, + "loss": 0.2024, + "step": 824 + }, + { + "epoch": 1.626, + "grad_norm": 0.24210689961910248, + "learning_rate": 7.513728502524286e-07, + "loss": 0.3364, + "step": 825 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 0.2465432733297348, + "learning_rate": 7.430289649152156e-07, + "loss": 0.3643, + "step": 826 + }, + { + "epoch": 1.63, + "grad_norm": 0.37851664423942566, + "learning_rate": 7.347279472290647e-07, + "loss": 0.4549, + "step": 827 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.29046836495399475, + "learning_rate": 7.264698807851328e-07, + "loss": 0.3777, + "step": 828 + }, + { + "epoch": 1.634, + "grad_norm": 0.17954066395759583, + "learning_rate": 7.182548487420555e-07, + "loss": 0.1817, + "step": 829 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 0.21587719023227692, + "learning_rate": 7.100829338251147e-07, + "loss": 0.3208, + "step": 830 + }, + { + "epoch": 1.638, + "grad_norm": 0.24211935698986053, + "learning_rate": 7.019542183254047e-07, + "loss": 0.302, + "step": 831 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.3430536389350891, + "learning_rate": 6.938687840989972e-07, + "loss": 0.3358, + "step": 832 + }, + { + "epoch": 1.642, + "grad_norm": 0.26358646154403687, + "learning_rate": 6.858267125661272e-07, + "loss": 0.3329, + "step": 833 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.21013550460338593, + "learning_rate": 6.778280847103668e-07, + "loss": 0.247, + "step": 834 + }, + { + "epoch": 1.646, + "grad_norm": 0.17694292962551117, + "learning_rate": 6.698729810778065e-07, + "loss": 0.2205, + "step": 835 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.15793128311634064, + "learning_rate": 6.619614817762537e-07, + "loss": 0.1541, + "step": 836 + }, + { + "epoch": 1.65, + "grad_norm": 0.18143923580646515, + "learning_rate": 6.540936664744197e-07, + "loss": 0.2367, + "step": 837 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.21212640404701233, + "learning_rate": 6.462696144011149e-07, + "loss": 0.3049, + "step": 838 + }, + { + "epoch": 1.654, + "grad_norm": 0.21567395329475403, + "learning_rate": 6.384894043444568e-07, + "loss": 0.2519, + "step": 839 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.17464697360992432, + "learning_rate": 6.307531146510754e-07, + "loss": 0.1692, + "step": 840 + }, + { + "epoch": 1.658, + "grad_norm": 0.23152326047420502, + "learning_rate": 6.230608232253227e-07, + "loss": 0.2823, + "step": 841 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.3341864049434662, + "learning_rate": 6.154126075284855e-07, + "loss": 0.2823, + "step": 842 + }, + { + "epoch": 1.662, + "grad_norm": 0.24136964976787567, + "learning_rate": 6.07808544578013e-07, + "loss": 0.3713, + "step": 843 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.21439406275749207, + "learning_rate": 6.002487109467347e-07, + "loss": 0.2631, + "step": 844 + }, + { + "epoch": 1.666, + "grad_norm": 0.3102458715438843, + "learning_rate": 5.927331827620902e-07, + "loss": 0.3513, + "step": 845 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 0.20326466858386993, + "learning_rate": 5.852620357053651e-07, + "loss": 0.2738, + "step": 846 + }, + { + "epoch": 1.67, + "grad_norm": 0.185090109705925, + "learning_rate": 5.778353450109286e-07, + "loss": 0.2665, + "step": 847 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.17061105370521545, + "learning_rate": 5.704531854654721e-07, + "loss": 0.2018, + "step": 848 + }, + { + "epoch": 1.674, + "grad_norm": 0.18026676774024963, + "learning_rate": 5.631156314072605e-07, + "loss": 0.2182, + "step": 849 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 0.24431855976581573, + "learning_rate": 5.558227567253832e-07, + "loss": 0.3036, + "step": 850 + }, + { + "epoch": 1.678, + "grad_norm": 0.1817561835050583, + "learning_rate": 5.485746348590048e-07, + "loss": 0.2786, + "step": 851 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.20034758746623993, + "learning_rate": 5.413713387966329e-07, + "loss": 0.2073, + "step": 852 + }, + { + "epoch": 1.682, + "grad_norm": 0.23046346008777618, + "learning_rate": 5.34212941075381e-07, + "loss": 0.2456, + "step": 853 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 0.28231683373451233, + "learning_rate": 5.270995137802315e-07, + "loss": 0.2962, + "step": 854 + }, + { + "epoch": 1.686, + "grad_norm": 0.20535282790660858, + "learning_rate": 5.200311285433213e-07, + "loss": 0.2003, + "step": 855 + }, + { + "epoch": 1.688, + "grad_norm": 0.27334460616111755, + "learning_rate": 5.130078565432089e-07, + "loss": 0.2784, + "step": 856 + }, + { + "epoch": 1.69, + "grad_norm": 0.2541443109512329, + "learning_rate": 5.06029768504166e-07, + "loss": 0.3575, + "step": 857 + }, + { + "epoch": 1.692, + "grad_norm": 0.20568181574344635, + "learning_rate": 4.990969346954611e-07, + "loss": 0.3116, + "step": 858 + }, + { + "epoch": 1.694, + "grad_norm": 0.2725497782230377, + "learning_rate": 4.922094249306559e-07, + "loss": 0.2698, + "step": 859 + }, + { + "epoch": 1.696, + "grad_norm": 0.2767050862312317, + "learning_rate": 4.853673085668947e-07, + "loss": 0.3246, + "step": 860 + }, + { + "epoch": 1.698, + "grad_norm": 0.27081194519996643, + "learning_rate": 4.785706545042141e-07, + "loss": 0.3067, + "step": 861 + }, + { + "epoch": 1.7, + "grad_norm": 0.2148142009973526, + "learning_rate": 4.7181953118484556e-07, + "loss": 0.335, + "step": 862 + }, + { + "epoch": 1.702, + "grad_norm": 0.20924992859363556, + "learning_rate": 4.651140065925269e-07, + "loss": 0.2473, + "step": 863 + }, + { + "epoch": 1.704, + "grad_norm": 0.1969323456287384, + "learning_rate": 4.58454148251814e-07, + "loss": 0.2384, + "step": 864 + }, + { + "epoch": 1.706, + "grad_norm": 0.21272586286067963, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.1894, + "step": 865 + }, + { + "epoch": 1.708, + "grad_norm": 0.22230306267738342, + "learning_rate": 4.4527169812347446e-07, + "loss": 0.2878, + "step": 866 + }, + { + "epoch": 1.71, + "grad_norm": 0.23957069218158722, + "learning_rate": 4.387492390829734e-07, + "loss": 0.2608, + "step": 867 + }, + { + "epoch": 1.712, + "grad_norm": 0.19603803753852844, + "learning_rate": 4.322727117869951e-07, + "loss": 0.2291, + "step": 868 + }, + { + "epoch": 1.714, + "grad_norm": 0.19814668595790863, + "learning_rate": 4.2584218145409916e-07, + "loss": 0.2933, + "step": 869 + }, + { + "epoch": 1.716, + "grad_norm": 0.2840145230293274, + "learning_rate": 4.194577128396521e-07, + "loss": 0.2678, + "step": 870 + }, + { + "epoch": 1.718, + "grad_norm": 0.3841419816017151, + "learning_rate": 4.131193702351827e-07, + "loss": 0.4492, + "step": 871 + }, + { + "epoch": 1.72, + "grad_norm": 0.1749158352613449, + "learning_rate": 4.0682721746773346e-07, + "loss": 0.2205, + "step": 872 + }, + { + "epoch": 1.722, + "grad_norm": 0.22776730358600616, + "learning_rate": 4.005813178992091e-07, + "loss": 0.2634, + "step": 873 + }, + { + "epoch": 1.724, + "grad_norm": 0.20322760939598083, + "learning_rate": 3.9438173442575e-07, + "loss": 0.3125, + "step": 874 + }, + { + "epoch": 1.726, + "grad_norm": 0.24371430277824402, + "learning_rate": 3.882285294770938e-07, + "loss": 0.3223, + "step": 875 + }, + { + "epoch": 1.726, + "eval_loss": 0.26352861523628235, + "eval_runtime": 76.577, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.901, + "step": 875 + }, + { + "epoch": 1.728, + "grad_norm": 0.2777194678783417, + "learning_rate": 3.821217650159453e-07, + "loss": 0.3117, + "step": 876 + }, + { + "epoch": 1.73, + "grad_norm": 0.21060119569301605, + "learning_rate": 3.760615025373543e-07, + "loss": 0.2444, + "step": 877 + }, + { + "epoch": 1.732, + "grad_norm": 0.19364982843399048, + "learning_rate": 3.7004780306809873e-07, + "loss": 0.2534, + "step": 878 + }, + { + "epoch": 1.734, + "grad_norm": 0.2388126105070114, + "learning_rate": 3.6408072716606346e-07, + "loss": 0.5307, + "step": 879 + }, + { + "epoch": 1.736, + "grad_norm": 0.21501779556274414, + "learning_rate": 3.581603349196372e-07, + "loss": 0.299, + "step": 880 + }, + { + "epoch": 1.738, + "grad_norm": 0.2748852074146271, + "learning_rate": 3.522866859471047e-07, + "loss": 0.4626, + "step": 881 + }, + { + "epoch": 1.74, + "grad_norm": 0.2657471299171448, + "learning_rate": 3.46459839396045e-07, + "loss": 0.2947, + "step": 882 + }, + { + "epoch": 1.742, + "grad_norm": 0.1825701743364334, + "learning_rate": 3.406798539427386e-07, + "loss": 0.2525, + "step": 883 + }, + { + "epoch": 1.744, + "grad_norm": 0.18898171186447144, + "learning_rate": 3.3494678779157464e-07, + "loss": 0.2188, + "step": 884 + }, + { + "epoch": 1.746, + "grad_norm": 0.2019154280424118, + "learning_rate": 3.2926069867446673e-07, + "loss": 0.2575, + "step": 885 + }, + { + "epoch": 1.748, + "grad_norm": 0.26931118965148926, + "learning_rate": 3.2362164385026704e-07, + "loss": 0.2867, + "step": 886 + }, + { + "epoch": 1.75, + "grad_norm": 0.25869134068489075, + "learning_rate": 3.180296801041971e-07, + "loss": 0.4233, + "step": 887 + }, + { + "epoch": 1.752, + "grad_norm": 0.24689964950084686, + "learning_rate": 3.1248486374726884e-07, + "loss": 0.3778, + "step": 888 + }, + { + "epoch": 1.754, + "grad_norm": 0.2961515486240387, + "learning_rate": 3.069872506157212e-07, + "loss": 0.3767, + "step": 889 + }, + { + "epoch": 1.756, + "grad_norm": 0.2758214473724365, + "learning_rate": 3.015368960704584e-07, + "loss": 0.4107, + "step": 890 + }, + { + "epoch": 1.758, + "grad_norm": 0.19258597493171692, + "learning_rate": 2.9613385499648926e-07, + "loss": 0.2285, + "step": 891 + }, + { + "epoch": 1.76, + "grad_norm": 0.21885156631469727, + "learning_rate": 2.9077818180237693e-07, + "loss": 0.2726, + "step": 892 + }, + { + "epoch": 1.762, + "grad_norm": 0.20850767195224762, + "learning_rate": 2.8546993041969173e-07, + "loss": 0.3443, + "step": 893 + }, + { + "epoch": 1.764, + "grad_norm": 0.22747254371643066, + "learning_rate": 2.802091543024671e-07, + "loss": 0.2785, + "step": 894 + }, + { + "epoch": 1.766, + "grad_norm": 0.18733809888362885, + "learning_rate": 2.7499590642665773e-07, + "loss": 0.2047, + "step": 895 + }, + { + "epoch": 1.768, + "grad_norm": 0.230934277176857, + "learning_rate": 2.6983023928961406e-07, + "loss": 0.2994, + "step": 896 + }, + { + "epoch": 1.77, + "grad_norm": 0.1833610087633133, + "learning_rate": 2.647122049095463e-07, + "loss": 0.2064, + "step": 897 + }, + { + "epoch": 1.772, + "grad_norm": 0.2077609896659851, + "learning_rate": 2.596418548250029e-07, + "loss": 0.2537, + "step": 898 + }, + { + "epoch": 1.774, + "grad_norm": 0.163072407245636, + "learning_rate": 2.546192400943537e-07, + "loss": 0.194, + "step": 899 + }, + { + "epoch": 1.776, + "grad_norm": 0.1943567395210266, + "learning_rate": 2.4964441129527337e-07, + "loss": 0.2519, + "step": 900 + }, + { + "epoch": 1.778, + "grad_norm": 0.18382684886455536, + "learning_rate": 2.447174185242324e-07, + "loss": 0.1944, + "step": 901 + }, + { + "epoch": 1.78, + "grad_norm": 0.20981475710868835, + "learning_rate": 2.398383113959929e-07, + "loss": 0.173, + "step": 902 + }, + { + "epoch": 1.782, + "grad_norm": 0.1996649205684662, + "learning_rate": 2.3500713904311023e-07, + "loss": 0.2536, + "step": 903 + }, + { + "epoch": 1.784, + "grad_norm": 0.2560986578464508, + "learning_rate": 2.3022395011543687e-07, + "loss": 0.374, + "step": 904 + }, + { + "epoch": 1.786, + "grad_norm": 0.20811672508716583, + "learning_rate": 2.2548879277963065e-07, + "loss": 0.3225, + "step": 905 + }, + { + "epoch": 1.788, + "grad_norm": 0.1996699571609497, + "learning_rate": 2.2080171471867362e-07, + "loss": 0.2632, + "step": 906 + }, + { + "epoch": 1.79, + "grad_norm": 0.20678700506687164, + "learning_rate": 2.161627631313923e-07, + "loss": 0.3513, + "step": 907 + }, + { + "epoch": 1.792, + "grad_norm": 0.20172181725502014, + "learning_rate": 2.1157198473197417e-07, + "loss": 0.2117, + "step": 908 + }, + { + "epoch": 1.794, + "grad_norm": 0.16854679584503174, + "learning_rate": 2.0702942574950812e-07, + "loss": 0.3006, + "step": 909 + }, + { + "epoch": 1.796, + "grad_norm": 0.1959567815065384, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.2695, + "step": 910 + }, + { + "epoch": 1.798, + "grad_norm": 0.1726803481578827, + "learning_rate": 1.9808914852347817e-07, + "loss": 0.2635, + "step": 911 + }, + { + "epoch": 1.8, + "grad_norm": 0.22450147569179535, + "learning_rate": 1.9369152030840553e-07, + "loss": 0.2598, + "step": 912 + }, + { + "epoch": 1.802, + "grad_norm": 0.26783040165901184, + "learning_rate": 1.8934229156636453e-07, + "loss": 0.2029, + "step": 913 + }, + { + "epoch": 1.804, + "grad_norm": 0.2690034508705139, + "learning_rate": 1.8504150609403858e-07, + "loss": 0.2446, + "step": 914 + }, + { + "epoch": 1.806, + "grad_norm": 0.23306065797805786, + "learning_rate": 1.807892072002898e-07, + "loss": 0.3264, + "step": 915 + }, + { + "epoch": 1.808, + "grad_norm": 0.2681446075439453, + "learning_rate": 1.765854377057219e-07, + "loss": 0.302, + "step": 916 + }, + { + "epoch": 1.81, + "grad_norm": 0.19500699639320374, + "learning_rate": 1.724302399422456e-07, + "loss": 0.2066, + "step": 917 + }, + { + "epoch": 1.812, + "grad_norm": 0.2524206340312958, + "learning_rate": 1.6832365575265742e-07, + "loss": 0.3334, + "step": 918 + }, + { + "epoch": 1.814, + "grad_norm": 0.2076834887266159, + "learning_rate": 1.6426572649021477e-07, + "loss": 0.2737, + "step": 919 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.28093916177749634, + "learning_rate": 1.6025649301821877e-07, + "loss": 0.3558, + "step": 920 + }, + { + "epoch": 1.818, + "grad_norm": 0.24566200375556946, + "learning_rate": 1.562959957096072e-07, + "loss": 0.3636, + "step": 921 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.2996765077114105, + "learning_rate": 1.5238427444654368e-07, + "loss": 0.3945, + "step": 922 + }, + { + "epoch": 1.822, + "grad_norm": 0.24855782091617584, + "learning_rate": 1.4852136862001766e-07, + "loss": 0.1894, + "step": 923 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.2089153230190277, + "learning_rate": 1.4470731712944885e-07, + "loss": 0.3297, + "step": 924 + }, + { + "epoch": 1.826, + "grad_norm": 0.3130733072757721, + "learning_rate": 1.4094215838229176e-07, + "loss": 0.4001, + "step": 925 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 0.2722707688808441, + "learning_rate": 1.372259302936546e-07, + "loss": 0.356, + "step": 926 + }, + { + "epoch": 1.83, + "grad_norm": 0.15767575800418854, + "learning_rate": 1.3355867028591209e-07, + "loss": 0.2161, + "step": 927 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.18771317601203918, + "learning_rate": 1.2994041528833267e-07, + "loss": 0.1912, + "step": 928 + }, + { + "epoch": 1.834, + "grad_norm": 0.15640737116336823, + "learning_rate": 1.263712017367036e-07, + "loss": 0.2173, + "step": 929 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 0.2588789463043213, + "learning_rate": 1.2285106557296479e-07, + "loss": 0.3506, + "step": 930 + }, + { + "epoch": 1.838, + "grad_norm": 0.21290963888168335, + "learning_rate": 1.193800422448499e-07, + "loss": 0.2377, + "step": 931 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.198676198720932, + "learning_rate": 1.1595816670552429e-07, + "loss": 0.1823, + "step": 932 + }, + { + "epoch": 1.842, + "grad_norm": 0.23629765212535858, + "learning_rate": 1.12585473413237e-07, + "loss": 0.2565, + "step": 933 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 0.23395268619060516, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.2184, + "step": 934 + }, + { + "epoch": 1.846, + "grad_norm": 0.2589554190635681, + "learning_rate": 1.0598776892610685e-07, + "loss": 0.369, + "step": 935 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.22093115746974945, + "learning_rate": 1.0276282417007399e-07, + "loss": 0.3437, + "step": 936 + }, + { + "epoch": 1.85, + "grad_norm": 0.23697194457054138, + "learning_rate": 9.958719453803278e-08, + "loss": 0.3288, + "step": 937 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 0.22383596003055573, + "learning_rate": 9.646091200853802e-08, + "loss": 0.4897, + "step": 938 + }, + { + "epoch": 1.854, + "grad_norm": 0.20475724339485168, + "learning_rate": 9.338400806321979e-08, + "loss": 0.257, + "step": 939 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.263615220785141, + "learning_rate": 9.035651368646647e-08, + "loss": 0.4592, + "step": 940 + }, + { + "epoch": 1.858, + "grad_norm": 0.24478185176849365, + "learning_rate": 8.737845936511335e-08, + "loss": 0.4337, + "step": 941 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.2436402142047882, + "learning_rate": 8.444987508813451e-08, + "loss": 0.3344, + "step": 942 + }, + { + "epoch": 1.862, + "grad_norm": 0.23337677121162415, + "learning_rate": 8.157079034633974e-08, + "loss": 0.2967, + "step": 943 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.20073962211608887, + "learning_rate": 7.874123413208145e-08, + "loss": 0.1952, + "step": 944 + }, + { + "epoch": 1.866, + "grad_norm": 0.2582467496395111, + "learning_rate": 7.59612349389599e-08, + "loss": 0.372, + "step": 945 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 0.2121819704771042, + "learning_rate": 7.32308207615351e-08, + "loss": 0.2619, + "step": 946 + }, + { + "epoch": 1.87, + "grad_norm": 0.16836410760879517, + "learning_rate": 7.055001909504755e-08, + "loss": 0.293, + "step": 947 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.18819768726825714, + "learning_rate": 6.791885693514134e-08, + "loss": 0.2476, + "step": 948 + }, + { + "epoch": 1.874, + "grad_norm": 0.2157561331987381, + "learning_rate": 6.533736077758868e-08, + "loss": 0.2615, + "step": 949 + }, + { + "epoch": 1.876, + "grad_norm": 0.24670301377773285, + "learning_rate": 6.280555661802857e-08, + "loss": 0.371, + "step": 950 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 0.21483668684959412, + "learning_rate": 6.032346995169968e-08, + "loss": 0.2231, + "step": 951 + }, + { + "epoch": 1.88, + "grad_norm": 0.1763847917318344, + "learning_rate": 5.7891125773187896e-08, + "loss": 0.2074, + "step": 952 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 0.20190970599651337, + "learning_rate": 5.550854857617194e-08, + "loss": 0.3226, + "step": 953 + }, + { + "epoch": 1.884, + "grad_norm": 0.23266001045703888, + "learning_rate": 5.3175762353177563e-08, + "loss": 0.3055, + "step": 954 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 0.26426488161087036, + "learning_rate": 5.089279059533658e-08, + "loss": 0.3319, + "step": 955 + }, + { + "epoch": 1.888, + "grad_norm": 0.24322916567325592, + "learning_rate": 4.865965629214819e-08, + "loss": 0.2372, + "step": 956 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 0.23628686368465424, + "learning_rate": 4.6476381931251366e-08, + "loss": 0.3808, + "step": 957 + }, + { + "epoch": 1.892, + "grad_norm": 0.16934725642204285, + "learning_rate": 4.434298949819449e-08, + "loss": 0.1737, + "step": 958 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 0.30660754442214966, + "learning_rate": 4.225950047621441e-08, + "loss": 0.3483, + "step": 959 + }, + { + "epoch": 1.896, + "grad_norm": 0.27640894055366516, + "learning_rate": 4.02259358460233e-08, + "loss": 0.3264, + "step": 960 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 0.2123912125825882, + "learning_rate": 3.8242316085594923e-08, + "loss": 0.3876, + "step": 961 + }, + { + "epoch": 1.9, + "grad_norm": 0.2987152636051178, + "learning_rate": 3.630866116995757e-08, + "loss": 0.4525, + "step": 962 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 0.22001074254512787, + "learning_rate": 3.44249905709948e-08, + "loss": 0.1842, + "step": 963 + }, + { + "epoch": 1.904, + "grad_norm": 0.20775096118450165, + "learning_rate": 3.25913232572489e-08, + "loss": 0.3012, + "step": 964 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 0.19180834293365479, + "learning_rate": 3.080767769372939e-08, + "loss": 0.2681, + "step": 965 + }, + { + "epoch": 1.908, + "grad_norm": 0.22222468256950378, + "learning_rate": 2.907407184172706e-08, + "loss": 0.1809, + "step": 966 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 0.20555076003074646, + "learning_rate": 2.7390523158633552e-08, + "loss": 0.1482, + "step": 967 + }, + { + "epoch": 1.912, + "grad_norm": 0.29668375849723816, + "learning_rate": 2.57570485977654e-08, + "loss": 0.2179, + "step": 968 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 0.19830183684825897, + "learning_rate": 2.4173664608193592e-08, + "loss": 0.2677, + "step": 969 + }, + { + "epoch": 1.916, + "grad_norm": 0.23050029575824738, + "learning_rate": 2.264038713457706e-08, + "loss": 0.3348, + "step": 970 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 0.36921679973602295, + "learning_rate": 2.1157231617002783e-08, + "loss": 0.4821, + "step": 971 + }, + { + "epoch": 1.92, + "grad_norm": 0.16172367334365845, + "learning_rate": 1.9724212990830938e-08, + "loss": 0.2348, + "step": 972 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 0.18016183376312256, + "learning_rate": 1.834134568654333e-08, + "loss": 0.2486, + "step": 973 + }, + { + "epoch": 1.924, + "grad_norm": 0.32527899742126465, + "learning_rate": 1.7008643629596866e-08, + "loss": 0.3623, + "step": 974 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 0.21802493929862976, + "learning_rate": 1.5726120240288632e-08, + "loss": 0.2155, + "step": 975 + }, + { + "epoch": 1.928, + "grad_norm": 0.23393763601779938, + "learning_rate": 1.449378843361271e-08, + "loss": 0.284, + "step": 976 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 0.2498655915260315, + "learning_rate": 1.3311660619138578e-08, + "loss": 0.2816, + "step": 977 + }, + { + "epoch": 1.932, + "grad_norm": 0.20273719727993011, + "learning_rate": 1.2179748700879013e-08, + "loss": 0.2945, + "step": 978 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 0.16979333758354187, + "learning_rate": 1.109806407717462e-08, + "loss": 0.1949, + "step": 979 + }, + { + "epoch": 1.936, + "grad_norm": 0.18881943821907043, + "learning_rate": 1.006661764057837e-08, + "loss": 0.2681, + "step": 980 + }, + { + "epoch": 1.938, + "grad_norm": 0.23016507923603058, + "learning_rate": 9.085419777743465e-09, + "loss": 0.4162, + "step": 981 + }, + { + "epoch": 1.94, + "grad_norm": 0.21829769015312195, + "learning_rate": 8.15448036932176e-09, + "loss": 0.3911, + "step": 982 + }, + { + "epoch": 1.942, + "grad_norm": 0.192356139421463, + "learning_rate": 7.273808789862724e-09, + "loss": 0.3076, + "step": 983 + }, + { + "epoch": 1.944, + "grad_norm": 0.20806097984313965, + "learning_rate": 6.4434139077201865e-09, + "loss": 0.2808, + "step": 984 + }, + { + "epoch": 1.946, + "grad_norm": 0.2533554434776306, + "learning_rate": 5.6633040849601865e-09, + "loss": 0.264, + "step": 985 + }, + { + "epoch": 1.948, + "grad_norm": 0.25440603494644165, + "learning_rate": 4.933487177280483e-09, + "loss": 0.386, + "step": 986 + }, + { + "epoch": 1.95, + "grad_norm": 0.2403300553560257, + "learning_rate": 4.253970533929508e-09, + "loss": 0.2665, + "step": 987 + }, + { + "epoch": 1.952, + "grad_norm": 0.18095187842845917, + "learning_rate": 3.6247609976319818e-09, + "loss": 0.2414, + "step": 988 + }, + { + "epoch": 1.954, + "grad_norm": 0.43698740005493164, + "learning_rate": 3.0458649045211897e-09, + "loss": 0.4131, + "step": 989 + }, + { + "epoch": 1.956, + "grad_norm": 0.2908496856689453, + "learning_rate": 2.5172880840745873e-09, + "loss": 0.2955, + "step": 990 + }, + { + "epoch": 1.958, + "grad_norm": 0.19435322284698486, + "learning_rate": 2.0390358590538507e-09, + "loss": 0.1839, + "step": 991 + }, + { + "epoch": 1.96, + "grad_norm": 0.20639224350452423, + "learning_rate": 1.61111304545436e-09, + "loss": 0.336, + "step": 992 + }, + { + "epoch": 1.962, + "grad_norm": 0.18591168522834778, + "learning_rate": 1.2335239524541298e-09, + "loss": 0.2653, + "step": 993 + }, + { + "epoch": 1.964, + "grad_norm": 0.2295517921447754, + "learning_rate": 9.062723823710651e-10, + "loss": 0.3478, + "step": 994 + }, + { + "epoch": 1.966, + "grad_norm": 0.2810915410518646, + "learning_rate": 6.293616306246586e-10, + "loss": 0.3266, + "step": 995 + }, + { + "epoch": 1.968, + "grad_norm": 0.19316555559635162, + "learning_rate": 4.027944857032395e-10, + "loss": 0.2753, + "step": 996 + }, + { + "epoch": 1.97, + "grad_norm": 0.24243375658988953, + "learning_rate": 2.265732291356626e-10, + "loss": 0.2786, + "step": 997 + }, + { + "epoch": 1.972, + "grad_norm": 0.27688726782798767, + "learning_rate": 1.0069963546743833e-10, + "loss": 0.2615, + "step": 998 + }, + { + "epoch": 1.974, + "grad_norm": 0.18696589767932892, + "learning_rate": 2.5174972244634834e-11, + "loss": 0.2866, + "step": 999 + }, + { + "epoch": 1.976, + "grad_norm": 0.21791526675224304, + "learning_rate": 0.0, + "loss": 0.2074, + "step": 1000 + }, + { + "epoch": 1.976, + "eval_loss": 0.26330506801605225, + "eval_runtime": 76.7272, + "eval_samples_per_second": 7.194, + "eval_steps_per_second": 0.899, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.531674674724864e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}