diff --git "a/checkpoint-5250/trainer_state.json" "b/checkpoint-5250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5250/trainer_state.json" @@ -0,0 +1,21328 @@ +{ + "best_global_step": 5250, + "best_metric": 4.961794376373291, + "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-BPE/checkpoint-5250", + "epoch": 1.4893617021276595, + "eval_steps": 125, + "global_step": 5250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005673758865248227, + "grad_norm": 1275.0146484375, + "loss": 281.4781, + "lr": 2e-06, + "step": 2, + "tokens_trained": 0.000192256 + }, + { + "epoch": 0.0011347517730496454, + "grad_norm": 1437.579833984375, + "loss": 267.2211, + "lr": 6e-06, + "step": 4, + "tokens_trained": 0.000382024 + }, + { + "epoch": 0.001702127659574468, + "grad_norm": 1719.271484375, + "loss": 219.3822, + "lr": 1e-05, + "step": 6, + "tokens_trained": 0.00057072 + }, + { + "epoch": 0.0022695035460992908, + "grad_norm": 1444.94970703125, + "loss": 133.8172, + "lr": 1.4e-05, + "step": 8, + "tokens_trained": 0.000761336 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 238.9689178466797, + "loss": 90.8177, + "lr": 1.8e-05, + "step": 10, + "tokens_trained": 0.000953248 + }, + { + "epoch": 0.003404255319148936, + "grad_norm": 158.53497314453125, + "loss": 84.6922, + "lr": 2.2e-05, + "step": 12, + "tokens_trained": 0.00114424 + }, + { + "epoch": 0.003971631205673759, + "grad_norm": 146.10595703125, + "loss": 76.7055, + "lr": 2.6e-05, + "step": 14, + "tokens_trained": 0.001334104 + }, + { + "epoch": 0.0045390070921985815, + "grad_norm": 140.69964599609375, + "loss": 67.9952, + "lr": 3e-05, + "step": 16, + "tokens_trained": 0.00152392 + }, + { + "epoch": 0.005106382978723404, + "grad_norm": 108.80303192138672, + "loss": 57.8088, + "lr": 3.4000000000000007e-05, + "step": 18, + "tokens_trained": 0.001713872 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 106.82334899902344, + "loss": 48.6585, + "lr": 3.8e-05, + "step": 20, + "tokens_trained": 0.001903976 + }, + { + "epoch": 0.00624113475177305, + "grad_norm": 93.58769989013672, + "loss": 41.7984, + "lr": 4.2000000000000004e-05, + "step": 22, + "tokens_trained": 0.002094288 + }, + { + "epoch": 0.006808510638297872, + "grad_norm": 87.5854721069336, + "loss": 37.6201, + "lr": 4.6e-05, + "step": 24, + "tokens_trained": 0.002282496 + }, + { + "epoch": 0.007375886524822695, + "grad_norm": 84.12794494628906, + "loss": 35.0091, + "lr": 5e-05, + "step": 26, + "tokens_trained": 0.00247068 + }, + { + "epoch": 0.007943262411347518, + "grad_norm": 79.77535247802734, + "loss": 33.2253, + "lr": 5.4e-05, + "step": 28, + "tokens_trained": 0.002662888 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 66.42157745361328, + "loss": 32.0682, + "lr": 5.800000000000001e-05, + "step": 30, + "tokens_trained": 0.002851968 + }, + { + "epoch": 0.009078014184397163, + "grad_norm": 87.52485656738281, + "loss": 30.893, + "lr": 6.2e-05, + "step": 32, + "tokens_trained": 0.003041384 + }, + { + "epoch": 0.009645390070921986, + "grad_norm": 58.33614730834961, + "loss": 30.0513, + "lr": 6.6e-05, + "step": 34, + "tokens_trained": 0.003232872 + }, + { + "epoch": 0.010212765957446808, + "grad_norm": 54.629329681396484, + "loss": 29.0115, + "lr": 7.000000000000001e-05, + "step": 36, + "tokens_trained": 0.003423824 + }, + { + "epoch": 0.01078014184397163, + "grad_norm": 52.79097366333008, + "loss": 28.2084, + "lr": 7.4e-05, + "step": 38, + "tokens_trained": 0.003613232 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 54.481224060058594, + "loss": 27.4345, + "lr": 7.8e-05, + "step": 40, + "tokens_trained": 0.003800952 + }, + { + "epoch": 0.011914893617021277, + "grad_norm": 58.7069091796875, + "loss": 26.5936, + "lr": 8.2e-05, + "step": 42, + "tokens_trained": 0.003991512 + }, + { + "epoch": 0.0124822695035461, + "grad_norm": 49.30760955810547, + "loss": 26.0608, + "lr": 8.599999999999999e-05, + "step": 44, + "tokens_trained": 0.004180648 + }, + { + "epoch": 0.013049645390070922, + "grad_norm": 61.902587890625, + "loss": 25.5363, + "lr": 8.999999999999999e-05, + "step": 46, + "tokens_trained": 0.00437148 + }, + { + "epoch": 0.013617021276595745, + "grad_norm": 46.76111602783203, + "loss": 24.9599, + "lr": 9.400000000000001e-05, + "step": 48, + "tokens_trained": 0.004559344 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 57.06416702270508, + "loss": 24.4087, + "lr": 9.800000000000001e-05, + "step": 50, + "tokens_trained": 0.004749256 + }, + { + "epoch": 0.01475177304964539, + "grad_norm": 44.798736572265625, + "loss": 24.1444, + "lr": 0.000102, + "step": 52, + "tokens_trained": 0.004940192 + }, + { + "epoch": 0.015319148936170212, + "grad_norm": 40.29296875, + "loss": 23.6011, + "lr": 0.000106, + "step": 54, + "tokens_trained": 0.005130304 + }, + { + "epoch": 0.015886524822695036, + "grad_norm": 38.75099563598633, + "loss": 23.1781, + "lr": 0.00011, + "step": 56, + "tokens_trained": 0.005322864 + }, + { + "epoch": 0.016453900709219857, + "grad_norm": 37.470706939697266, + "loss": 22.9136, + "lr": 0.000114, + "step": 58, + "tokens_trained": 0.00551392 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 35.1894645690918, + "loss": 22.6336, + "lr": 0.000118, + "step": 60, + "tokens_trained": 0.005703096 + }, + { + "epoch": 0.017588652482269502, + "grad_norm": 35.136573791503906, + "loss": 22.2998, + "lr": 0.000122, + "step": 62, + "tokens_trained": 0.005892448 + }, + { + "epoch": 0.018156028368794326, + "grad_norm": 38.05111312866211, + "loss": 21.9401, + "lr": 0.000126, + "step": 64, + "tokens_trained": 0.006081656 + }, + { + "epoch": 0.01872340425531915, + "grad_norm": 35.63850021362305, + "loss": 21.7206, + "lr": 0.00013000000000000002, + "step": 66, + "tokens_trained": 0.006273032 + }, + { + "epoch": 0.01929078014184397, + "grad_norm": 34.327667236328125, + "loss": 21.4051, + "lr": 0.000134, + "step": 68, + "tokens_trained": 0.00646304 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 31.457059860229492, + "loss": 21.0774, + "lr": 0.00013800000000000002, + "step": 70, + "tokens_trained": 0.006652832 + }, + { + "epoch": 0.020425531914893616, + "grad_norm": 34.91672897338867, + "loss": 20.8718, + "lr": 0.00014199999999999998, + "step": 72, + "tokens_trained": 0.006843512 + }, + { + "epoch": 0.02099290780141844, + "grad_norm": 27.959579467773438, + "loss": 20.6932, + "lr": 0.000146, + "step": 74, + "tokens_trained": 0.007033584 + }, + { + "epoch": 0.02156028368794326, + "grad_norm": 26.569866180419922, + "loss": 20.4072, + "lr": 0.00015, + "step": 76, + "tokens_trained": 0.007224032 + }, + { + "epoch": 0.022127659574468085, + "grad_norm": 28.009904861450195, + "loss": 20.2229, + "lr": 0.000154, + "step": 78, + "tokens_trained": 0.00741368 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 28.892959594726562, + "loss": 20.0528, + "lr": 0.000158, + "step": 80, + "tokens_trained": 0.00760416 + }, + { + "epoch": 0.02326241134751773, + "grad_norm": 31.58131980895996, + "loss": 19.8016, + "lr": 0.000162, + "step": 82, + "tokens_trained": 0.007793952 + }, + { + "epoch": 0.023829787234042554, + "grad_norm": 31.01254653930664, + "loss": 19.634, + "lr": 0.00016600000000000002, + "step": 84, + "tokens_trained": 0.007980792 + }, + { + "epoch": 0.024397163120567375, + "grad_norm": 28.732515335083008, + "loss": 19.3777, + "lr": 0.00017, + "step": 86, + "tokens_trained": 0.008171968 + }, + { + "epoch": 0.0249645390070922, + "grad_norm": 24.31264877319336, + "loss": 19.1346, + "lr": 0.000174, + "step": 88, + "tokens_trained": 0.008361632 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 26.557010650634766, + "loss": 19.0014, + "lr": 0.000178, + "step": 90, + "tokens_trained": 0.008552328 + }, + { + "epoch": 0.026099290780141844, + "grad_norm": 21.156103134155273, + "loss": 18.7032, + "lr": 0.000182, + "step": 92, + "tokens_trained": 0.008743136 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 25.7484188079834, + "loss": 18.4836, + "lr": 0.000186, + "step": 94, + "tokens_trained": 0.008932056 + }, + { + "epoch": 0.02723404255319149, + "grad_norm": 22.27949333190918, + "loss": 18.2233, + "lr": 0.00019, + "step": 96, + "tokens_trained": 0.009121608 + }, + { + "epoch": 0.027801418439716313, + "grad_norm": 24.9247989654541, + "loss": 17.9867, + "lr": 0.000194, + "step": 98, + "tokens_trained": 0.009311008 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 24.302066802978516, + "loss": 17.8016, + "lr": 0.00019800000000000002, + "step": 100, + "tokens_trained": 0.009501456 + }, + { + "epoch": 0.02893617021276596, + "grad_norm": 23.458459854125977, + "loss": 17.6295, + "lr": 0.000202, + "step": 102, + "tokens_trained": 0.009693952 + }, + { + "epoch": 0.02950354609929078, + "grad_norm": 24.092350006103516, + "loss": 17.4593, + "lr": 0.000206, + "step": 104, + "tokens_trained": 0.009883328 + }, + { + "epoch": 0.030070921985815603, + "grad_norm": 22.54726219177246, + "loss": 17.2141, + "lr": 0.00021, + "step": 106, + "tokens_trained": 0.01007316 + }, + { + "epoch": 0.030638297872340424, + "grad_norm": 21.334760665893555, + "loss": 17.044, + "lr": 0.000214, + "step": 108, + "tokens_trained": 0.010266504 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 20.584287643432617, + "loss": 16.8919, + "lr": 0.000218, + "step": 110, + "tokens_trained": 0.010455736 + }, + { + "epoch": 0.03177304964539007, + "grad_norm": 23.51676368713379, + "loss": 16.751, + "lr": 0.000222, + "step": 112, + "tokens_trained": 0.010645208 + }, + { + "epoch": 0.03234042553191489, + "grad_norm": 23.278276443481445, + "loss": 16.5997, + "lr": 0.00022600000000000002, + "step": 114, + "tokens_trained": 0.010838928 + }, + { + "epoch": 0.032907801418439714, + "grad_norm": 25.4830265045166, + "loss": 16.3416, + "lr": 0.00023, + "step": 116, + "tokens_trained": 0.011027792 + }, + { + "epoch": 0.03347517730496454, + "grad_norm": 29.442413330078125, + "loss": 16.24, + "lr": 0.00023400000000000002, + "step": 118, + "tokens_trained": 0.011217456 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 21.77578353881836, + "loss": 16.1922, + "lr": 0.00023799999999999998, + "step": 120, + "tokens_trained": 0.01140804 + }, + { + "epoch": 0.03460992907801418, + "grad_norm": 27.040719985961914, + "loss": 15.9059, + "lr": 0.000242, + "step": 122, + "tokens_trained": 0.011597816 + }, + { + "epoch": 0.035177304964539004, + "grad_norm": 24.74480628967285, + "loss": 15.7818, + "lr": 0.000246, + "step": 124, + "tokens_trained": 0.011785624 + }, + { + "epoch": 0.03546099290780142, + "eval_loss": 15.553059577941895, + "eval_runtime": 23.5485, + "step": 125, + "tokens_trained": 0.011880832 + }, + { + "epoch": 0.03574468085106383, + "grad_norm": 23.13482666015625, + "loss": 15.5739, + "lr": 0.00025, + "step": 126, + "tokens_trained": 0.011975976 + }, + { + "epoch": 0.03631205673758865, + "grad_norm": 22.8618106842041, + "loss": 15.4302, + "lr": 0.000254, + "step": 128, + "tokens_trained": 0.012166744 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 26.804859161376953, + "loss": 15.3623, + "lr": 0.00025800000000000004, + "step": 130, + "tokens_trained": 0.01235436 + }, + { + "epoch": 0.0374468085106383, + "grad_norm": 21.826601028442383, + "loss": 15.1465, + "lr": 0.000262, + "step": 132, + "tokens_trained": 0.012544976 + }, + { + "epoch": 0.03801418439716312, + "grad_norm": 39.447086334228516, + "loss": 15.0137, + "lr": 0.000266, + "step": 134, + "tokens_trained": 0.012736352 + }, + { + "epoch": 0.03858156028368794, + "grad_norm": 23.44275665283203, + "loss": 14.9355, + "lr": 0.00027, + "step": 136, + "tokens_trained": 0.012925008 + }, + { + "epoch": 0.03914893617021276, + "grad_norm": 21.631427764892578, + "loss": 14.6825, + "lr": 0.00027400000000000005, + "step": 138, + "tokens_trained": 0.013114672 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 23.674650192260742, + "loss": 14.5194, + "lr": 0.00027800000000000004, + "step": 140, + "tokens_trained": 0.013304016 + }, + { + "epoch": 0.04028368794326241, + "grad_norm": 23.974796295166016, + "loss": 14.4829, + "lr": 0.00028199999999999997, + "step": 142, + "tokens_trained": 0.013496696 + }, + { + "epoch": 0.04085106382978723, + "grad_norm": 26.112201690673828, + "loss": 14.3027, + "lr": 0.00028599999999999996, + "step": 144, + "tokens_trained": 0.013684816 + }, + { + "epoch": 0.04141843971631206, + "grad_norm": 20.67386817932129, + "loss": 14.1499, + "lr": 0.00029, + "step": 146, + "tokens_trained": 0.013874832 + }, + { + "epoch": 0.04198581560283688, + "grad_norm": 24.253408432006836, + "loss": 13.9378, + "lr": 0.000294, + "step": 148, + "tokens_trained": 0.014065056 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 35.716087341308594, + "loss": 14.0562, + "lr": 0.000298, + "step": 150, + "tokens_trained": 0.014256784 + }, + { + "epoch": 0.04312056737588652, + "grad_norm": 29.414331436157227, + "loss": 14.0462, + "lr": 0.000302, + "step": 152, + "tokens_trained": 0.014446312 + }, + { + "epoch": 0.04368794326241135, + "grad_norm": 30.687482833862305, + "loss": 13.7603, + "lr": 0.000306, + "step": 154, + "tokens_trained": 0.014639872 + }, + { + "epoch": 0.04425531914893617, + "grad_norm": 29.806455612182617, + "loss": 13.708, + "lr": 0.00031, + "step": 156, + "tokens_trained": 0.014831112 + }, + { + "epoch": 0.04482269503546099, + "grad_norm": 24.900897979736328, + "loss": 13.548, + "lr": 0.000314, + "step": 158, + "tokens_trained": 0.015021288 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 24.29252815246582, + "loss": 13.3119, + "lr": 0.00031800000000000003, + "step": 160, + "tokens_trained": 0.01521228 + }, + { + "epoch": 0.04595744680851064, + "grad_norm": 20.68342399597168, + "loss": 13.1829, + "lr": 0.000322, + "step": 162, + "tokens_trained": 0.015403688 + }, + { + "epoch": 0.04652482269503546, + "grad_norm": 20.822795867919922, + "loss": 12.9044, + "lr": 0.000326, + "step": 164, + "tokens_trained": 0.015593416 + }, + { + "epoch": 0.04709219858156028, + "grad_norm": 21.689916610717773, + "loss": 12.6862, + "lr": 0.00033, + "step": 166, + "tokens_trained": 0.015784408 + }, + { + "epoch": 0.04765957446808511, + "grad_norm": 17.873889923095703, + "loss": 12.5502, + "lr": 0.00033400000000000004, + "step": 168, + "tokens_trained": 0.0159744 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 18.951616287231445, + "loss": 12.308, + "lr": 0.00033800000000000003, + "step": 170, + "tokens_trained": 0.016163736 + }, + { + "epoch": 0.04879432624113475, + "grad_norm": 15.146363258361816, + "loss": 12.1558, + "lr": 0.000342, + "step": 172, + "tokens_trained": 0.016353832 + }, + { + "epoch": 0.04936170212765958, + "grad_norm": 18.336984634399414, + "loss": 12.0386, + "lr": 0.000346, + "step": 174, + "tokens_trained": 0.016545088 + }, + { + "epoch": 0.0499290780141844, + "grad_norm": 17.221126556396484, + "loss": 11.8791, + "lr": 0.00035, + "step": 176, + "tokens_trained": 0.016735704 + }, + { + "epoch": 0.05049645390070922, + "grad_norm": 19.362564086914062, + "loss": 11.7224, + "lr": 0.000354, + "step": 178, + "tokens_trained": 0.016927944 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 15.564507484436035, + "loss": 11.6448, + "lr": 0.000358, + "step": 180, + "tokens_trained": 0.017116096 + }, + { + "epoch": 0.05163120567375887, + "grad_norm": 20.711383819580078, + "loss": 11.4398, + "lr": 0.000362, + "step": 182, + "tokens_trained": 0.01730564 + }, + { + "epoch": 0.05219858156028369, + "grad_norm": 18.627403259277344, + "loss": 11.3377, + "lr": 0.000366, + "step": 184, + "tokens_trained": 0.017495864 + }, + { + "epoch": 0.05276595744680851, + "grad_norm": 15.00942325592041, + "loss": 11.1416, + "lr": 0.00037, + "step": 186, + "tokens_trained": 0.017686464 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 17.070598602294922, + "loss": 11.0148, + "lr": 0.000374, + "step": 188, + "tokens_trained": 0.017879488 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 16.101457595825195, + "loss": 10.8874, + "lr": 0.000378, + "step": 190, + "tokens_trained": 0.018068312 + }, + { + "epoch": 0.05446808510638298, + "grad_norm": 15.613334655761719, + "loss": 10.7055, + "lr": 0.000382, + "step": 192, + "tokens_trained": 0.018255752 + }, + { + "epoch": 0.0550354609929078, + "grad_norm": 17.671857833862305, + "loss": 10.5706, + "lr": 0.000386, + "step": 194, + "tokens_trained": 0.018447096 + }, + { + "epoch": 0.05560283687943263, + "grad_norm": 16.080909729003906, + "loss": 10.4476, + "lr": 0.00039000000000000005, + "step": 196, + "tokens_trained": 0.018637264 + }, + { + "epoch": 0.05617021276595745, + "grad_norm": 15.02849292755127, + "loss": 10.2962, + "lr": 0.00039400000000000004, + "step": 198, + "tokens_trained": 0.018827552 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 14.990167617797852, + "loss": 10.1912, + "lr": 0.000398, + "step": 200, + "tokens_trained": 0.019018 + }, + { + "epoch": 0.05730496453900709, + "grad_norm": 15.390633583068848, + "loss": 10.0442, + "lr": 0.000402, + "step": 202, + "tokens_trained": 0.019209864 + }, + { + "epoch": 0.05787234042553192, + "grad_norm": 16.871570587158203, + "loss": 9.9685, + "lr": 0.00040600000000000006, + "step": 204, + "tokens_trained": 0.019400176 + }, + { + "epoch": 0.05843971631205674, + "grad_norm": 20.16544532775879, + "loss": 9.8531, + "lr": 0.00041, + "step": 206, + "tokens_trained": 0.019589424 + }, + { + "epoch": 0.05900709219858156, + "grad_norm": 16.825023651123047, + "loss": 9.7777, + "lr": 0.000414, + "step": 208, + "tokens_trained": 0.019779112 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 16.43510627746582, + "loss": 9.6122, + "lr": 0.00041799999999999997, + "step": 210, + "tokens_trained": 0.019970048 + }, + { + "epoch": 0.060141843971631206, + "grad_norm": 17.340473175048828, + "loss": 9.4859, + "lr": 0.000422, + "step": 212, + "tokens_trained": 0.020160968 + }, + { + "epoch": 0.06070921985815603, + "grad_norm": 15.019119262695312, + "loss": 9.3656, + "lr": 0.000426, + "step": 214, + "tokens_trained": 0.020349664 + }, + { + "epoch": 0.06127659574468085, + "grad_norm": 13.379194259643555, + "loss": 9.2348, + "lr": 0.00043, + "step": 216, + "tokens_trained": 0.020538192 + }, + { + "epoch": 0.061843971631205676, + "grad_norm": 16.71472930908203, + "loss": 9.2258, + "lr": 0.00043400000000000003, + "step": 218, + "tokens_trained": 0.020728936 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 12.743139266967773, + "loss": 9.0569, + "lr": 0.000438, + "step": 220, + "tokens_trained": 0.020917472 + }, + { + "epoch": 0.06297872340425532, + "grad_norm": 15.739934921264648, + "loss": 8.9623, + "lr": 0.000442, + "step": 222, + "tokens_trained": 0.02110928 + }, + { + "epoch": 0.06354609929078014, + "grad_norm": 14.23620891571045, + "loss": 8.8201, + "lr": 0.000446, + "step": 224, + "tokens_trained": 0.021300168 + }, + { + "epoch": 0.06411347517730497, + "grad_norm": 13.005538940429688, + "loss": 8.7235, + "lr": 0.00045000000000000004, + "step": 226, + "tokens_trained": 0.021490272 + }, + { + "epoch": 0.06468085106382979, + "grad_norm": 17.17629051208496, + "loss": 8.6907, + "lr": 0.00045400000000000003, + "step": 228, + "tokens_trained": 0.021681552 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 14.430739402770996, + "loss": 8.6196, + "lr": 0.000458, + "step": 230, + "tokens_trained": 0.02187236 + }, + { + "epoch": 0.06581560283687943, + "grad_norm": 14.575714111328125, + "loss": 8.4741, + "lr": 0.000462, + "step": 232, + "tokens_trained": 0.022061976 + }, + { + "epoch": 0.06638297872340425, + "grad_norm": 13.892754554748535, + "loss": 8.4118, + "lr": 0.00046600000000000005, + "step": 234, + "tokens_trained": 0.022252008 + }, + { + "epoch": 0.06695035460992908, + "grad_norm": 11.58240795135498, + "loss": 8.2781, + "lr": 0.00047, + "step": 236, + "tokens_trained": 0.02244284 + }, + { + "epoch": 0.0675177304964539, + "grad_norm": 13.022644996643066, + "loss": 8.2139, + "lr": 0.000474, + "step": 238, + "tokens_trained": 0.022631152 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 11.844677925109863, + "loss": 8.1134, + "lr": 0.00047799999999999996, + "step": 240, + "tokens_trained": 0.022821096 + }, + { + "epoch": 0.06865248226950355, + "grad_norm": 13.878067016601562, + "loss": 8.0221, + "lr": 0.000482, + "step": 242, + "tokens_trained": 0.023011656 + }, + { + "epoch": 0.06921985815602837, + "grad_norm": 12.34648323059082, + "loss": 7.9755, + "lr": 0.000486, + "step": 244, + "tokens_trained": 0.023201 + }, + { + "epoch": 0.06978723404255319, + "grad_norm": 14.238297462463379, + "loss": 7.8969, + "lr": 0.00049, + "step": 246, + "tokens_trained": 0.023391128 + }, + { + "epoch": 0.07035460992907801, + "grad_norm": 14.386019706726074, + "loss": 7.8627, + "lr": 0.000494, + "step": 248, + "tokens_trained": 0.023581768 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 13.623086929321289, + "loss": 7.7568, + "lr": 0.000498, + "step": 250, + "tokens_trained": 0.023771248 + }, + { + "epoch": 0.07092198581560284, + "eval_loss": 7.70297384262085, + "eval_runtime": 21.3853, + "step": 250, + "tokens_trained": 0.023771248 + }, + { + "epoch": 0.07148936170212766, + "grad_norm": 14.347646713256836, + "loss": 7.6842, + "lr": 0.0005020000000000001, + "step": 252, + "tokens_trained": 0.023961056 + }, + { + "epoch": 0.07205673758865248, + "grad_norm": 12.5592041015625, + "loss": 7.6516, + "lr": 0.000506, + "step": 254, + "tokens_trained": 0.024150968 + }, + { + "epoch": 0.0726241134751773, + "grad_norm": 13.219141960144043, + "loss": 7.5789, + "lr": 0.00051, + "step": 256, + "tokens_trained": 0.024340072 + }, + { + "epoch": 0.07319148936170213, + "grad_norm": 12.654081344604492, + "loss": 7.5369, + "lr": 0.000514, + "step": 258, + "tokens_trained": 0.024529296 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 13.136971473693848, + "loss": 7.4949, + "lr": 0.000518, + "step": 260, + "tokens_trained": 0.024719688 + }, + { + "epoch": 0.07432624113475177, + "grad_norm": 12.680288314819336, + "loss": 7.3904, + "lr": 0.000522, + "step": 262, + "tokens_trained": 0.024909632 + }, + { + "epoch": 0.0748936170212766, + "grad_norm": 12.754518508911133, + "loss": 7.3514, + "lr": 0.000526, + "step": 264, + "tokens_trained": 0.025098416 + }, + { + "epoch": 0.07546099290780142, + "grad_norm": 13.22311019897461, + "loss": 7.2951, + "lr": 0.0005300000000000001, + "step": 266, + "tokens_trained": 0.025287344 + }, + { + "epoch": 0.07602836879432624, + "grad_norm": 12.11903190612793, + "loss": 7.2229, + "lr": 0.0005340000000000001, + "step": 268, + "tokens_trained": 0.025477152 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 13.771833419799805, + "loss": 7.1815, + "lr": 0.0005380000000000001, + "step": 270, + "tokens_trained": 0.025668288 + }, + { + "epoch": 0.07716312056737588, + "grad_norm": 11.756864547729492, + "loss": 7.1669, + "lr": 0.0005420000000000001, + "step": 272, + "tokens_trained": 0.025858528 + }, + { + "epoch": 0.0777304964539007, + "grad_norm": 13.613094329833984, + "loss": 7.1079, + "lr": 0.000546, + "step": 274, + "tokens_trained": 0.026048616 + }, + { + "epoch": 0.07829787234042553, + "grad_norm": 10.001923561096191, + "loss": 7.0508, + "lr": 0.00055, + "step": 276, + "tokens_trained": 0.026236944 + }, + { + "epoch": 0.07886524822695036, + "grad_norm": 14.262083053588867, + "loss": 6.9955, + "lr": 0.000554, + "step": 278, + "tokens_trained": 0.026426848 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 12.381136894226074, + "loss": 6.9831, + "lr": 0.000558, + "step": 280, + "tokens_trained": 0.026616784 + }, + { + "epoch": 0.08, + "grad_norm": 9.815845489501953, + "loss": 6.917, + "lr": 0.0005620000000000001, + "step": 282, + "tokens_trained": 0.026805176 + }, + { + "epoch": 0.08056737588652482, + "grad_norm": 11.669997215270996, + "loss": 6.8999, + "lr": 0.000566, + "step": 284, + "tokens_trained": 0.02699488 + }, + { + "epoch": 0.08113475177304964, + "grad_norm": 12.770941734313965, + "loss": 6.8998, + "lr": 0.00057, + "step": 286, + "tokens_trained": 0.027185784 + }, + { + "epoch": 0.08170212765957446, + "grad_norm": 15.572457313537598, + "loss": 6.841, + "lr": 0.000574, + "step": 288, + "tokens_trained": 0.027375896 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 10.980833053588867, + "loss": 6.8545, + "lr": 0.000578, + "step": 290, + "tokens_trained": 0.02756588 + }, + { + "epoch": 0.08283687943262412, + "grad_norm": 11.678337097167969, + "loss": 6.7853, + "lr": 0.0005819999999999999, + "step": 292, + "tokens_trained": 0.02775456 + }, + { + "epoch": 0.08340425531914894, + "grad_norm": 9.77885913848877, + "loss": 6.7465, + "lr": 0.0005859999999999999, + "step": 294, + "tokens_trained": 0.027942856 + }, + { + "epoch": 0.08397163120567376, + "grad_norm": 13.62730884552002, + "loss": 6.7276, + "lr": 0.00059, + "step": 296, + "tokens_trained": 0.028133152 + }, + { + "epoch": 0.08453900709219858, + "grad_norm": 10.644404411315918, + "loss": 6.6802, + "lr": 0.000594, + "step": 298, + "tokens_trained": 0.028322192 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 11.130610466003418, + "loss": 6.6548, + "lr": 0.000598, + "step": 300, + "tokens_trained": 0.0285122 + }, + { + "epoch": 0.08567375886524822, + "grad_norm": 11.557455062866211, + "loss": 6.6155, + "lr": 0.000602, + "step": 302, + "tokens_trained": 0.028699792 + }, + { + "epoch": 0.08624113475177304, + "grad_norm": 9.276884078979492, + "loss": 6.5989, + "lr": 0.000606, + "step": 304, + "tokens_trained": 0.028889896 + }, + { + "epoch": 0.08680851063829788, + "grad_norm": 9.616179466247559, + "loss": 6.5773, + "lr": 0.00061, + "step": 306, + "tokens_trained": 0.029082272 + }, + { + "epoch": 0.0873758865248227, + "grad_norm": 10.575953483581543, + "loss": 6.5358, + "lr": 0.000614, + "step": 308, + "tokens_trained": 0.029273352 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 9.089850425720215, + "loss": 6.5088, + "lr": 0.0006180000000000001, + "step": 310, + "tokens_trained": 0.029463848 + }, + { + "epoch": 0.08851063829787234, + "grad_norm": 9.090002059936523, + "loss": 6.4849, + "lr": 0.000622, + "step": 312, + "tokens_trained": 0.029653272 + }, + { + "epoch": 0.08907801418439716, + "grad_norm": 12.038308143615723, + "loss": 6.4624, + "lr": 0.000626, + "step": 314, + "tokens_trained": 0.029841928 + }, + { + "epoch": 0.08964539007092198, + "grad_norm": 9.073866844177246, + "loss": 6.4515, + "lr": 0.00063, + "step": 316, + "tokens_trained": 0.030029808 + }, + { + "epoch": 0.0902127659574468, + "grad_norm": 8.727197647094727, + "loss": 6.43, + "lr": 0.000634, + "step": 318, + "tokens_trained": 0.030221288 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 14.558151245117188, + "loss": 6.4487, + "lr": 0.000638, + "step": 320, + "tokens_trained": 0.030410872 + }, + { + "epoch": 0.09134751773049646, + "grad_norm": 9.98914623260498, + "loss": 6.4279, + "lr": 0.000642, + "step": 322, + "tokens_trained": 0.030602376 + }, + { + "epoch": 0.09191489361702128, + "grad_norm": 10.395442962646484, + "loss": 6.4311, + "lr": 0.000646, + "step": 324, + "tokens_trained": 0.030792968 + }, + { + "epoch": 0.0924822695035461, + "grad_norm": 10.8250093460083, + "loss": 6.3726, + "lr": 0.0006500000000000001, + "step": 326, + "tokens_trained": 0.030982944 + }, + { + "epoch": 0.09304964539007092, + "grad_norm": 9.73416805267334, + "loss": 6.34, + "lr": 0.0006540000000000001, + "step": 328, + "tokens_trained": 0.031174928 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 8.596503257751465, + "loss": 6.3322, + "lr": 0.0006580000000000001, + "step": 330, + "tokens_trained": 0.031364288 + }, + { + "epoch": 0.09418439716312056, + "grad_norm": 8.49472427368164, + "loss": 6.3096, + "lr": 0.000662, + "step": 332, + "tokens_trained": 0.03155376 + }, + { + "epoch": 0.0947517730496454, + "grad_norm": 7.857503414154053, + "loss": 6.2368, + "lr": 0.000666, + "step": 334, + "tokens_trained": 0.031744368 + }, + { + "epoch": 0.09531914893617022, + "grad_norm": 9.007513999938965, + "loss": 6.198, + "lr": 0.00067, + "step": 336, + "tokens_trained": 0.031934136 + }, + { + "epoch": 0.09588652482269504, + "grad_norm": 8.185524940490723, + "loss": 6.2328, + "lr": 0.000674, + "step": 338, + "tokens_trained": 0.032124984 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 8.784396171569824, + "loss": 6.1945, + "lr": 0.0006780000000000001, + "step": 340, + "tokens_trained": 0.032316016 + }, + { + "epoch": 0.09702127659574468, + "grad_norm": 8.642311096191406, + "loss": 6.218, + "lr": 0.0006820000000000001, + "step": 342, + "tokens_trained": 0.032506224 + }, + { + "epoch": 0.0975886524822695, + "grad_norm": 8.493780136108398, + "loss": 6.194, + "lr": 0.0006860000000000001, + "step": 344, + "tokens_trained": 0.032696152 + }, + { + "epoch": 0.09815602836879432, + "grad_norm": 9.120508193969727, + "loss": 6.2241, + "lr": 0.00069, + "step": 346, + "tokens_trained": 0.032885688 + }, + { + "epoch": 0.09872340425531916, + "grad_norm": 9.34500503540039, + "loss": 6.1548, + "lr": 0.000694, + "step": 348, + "tokens_trained": 0.03307568 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 7.483356952667236, + "loss": 6.1282, + "lr": 0.0006979999999999999, + "step": 350, + "tokens_trained": 0.033267208 + }, + { + "epoch": 0.0998581560283688, + "grad_norm": 7.974069118499756, + "loss": 6.1032, + "lr": 0.0007019999999999999, + "step": 352, + "tokens_trained": 0.033458144 + }, + { + "epoch": 0.10042553191489362, + "grad_norm": 8.247384071350098, + "loss": 6.1698, + "lr": 0.0007059999999999999, + "step": 354, + "tokens_trained": 0.033650352 + }, + { + "epoch": 0.10099290780141844, + "grad_norm": 8.554885864257812, + "loss": 6.1429, + "lr": 0.00071, + "step": 356, + "tokens_trained": 0.033840232 + }, + { + "epoch": 0.10156028368794326, + "grad_norm": 7.209281921386719, + "loss": 6.0997, + "lr": 0.000714, + "step": 358, + "tokens_trained": 0.034030032 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 8.660383224487305, + "loss": 6.1497, + "lr": 0.000718, + "step": 360, + "tokens_trained": 0.034218592 + }, + { + "epoch": 0.10269503546099291, + "grad_norm": 9.382761001586914, + "loss": 6.0665, + "lr": 0.000722, + "step": 362, + "tokens_trained": 0.034408408 + }, + { + "epoch": 0.10326241134751774, + "grad_norm": 6.915714263916016, + "loss": 6.0636, + "lr": 0.000726, + "step": 364, + "tokens_trained": 0.034600016 + }, + { + "epoch": 0.10382978723404256, + "grad_norm": 7.8990631103515625, + "loss": 6.0975, + "lr": 0.00073, + "step": 366, + "tokens_trained": 0.034790792 + }, + { + "epoch": 0.10439716312056738, + "grad_norm": 8.859809875488281, + "loss": 6.0754, + "lr": 0.000734, + "step": 368, + "tokens_trained": 0.034981304 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 7.392801761627197, + "loss": 6.039, + "lr": 0.000738, + "step": 370, + "tokens_trained": 0.03516956 + }, + { + "epoch": 0.10553191489361702, + "grad_norm": 9.427324295043945, + "loss": 6.084, + "lr": 0.000742, + "step": 372, + "tokens_trained": 0.035358816 + }, + { + "epoch": 0.10609929078014184, + "grad_norm": 7.168910503387451, + "loss": 6.0498, + "lr": 0.000746, + "step": 374, + "tokens_trained": 0.035548016 + }, + { + "epoch": 0.10638297872340426, + "eval_loss": 6.038269996643066, + "eval_runtime": 21.3445, + "step": 375, + "tokens_trained": 0.035644104 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.899259567260742, + "loss": 6.0345, + "lr": 0.00075, + "step": 376, + "tokens_trained": 0.035739856 + }, + { + "epoch": 0.1072340425531915, + "grad_norm": 8.91533374786377, + "loss": 6.0386, + "lr": 0.000754, + "step": 378, + "tokens_trained": 0.035930264 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 6.998043060302734, + "loss": 6.0294, + "lr": 0.000758, + "step": 380, + "tokens_trained": 0.036119616 + }, + { + "epoch": 0.10836879432624114, + "grad_norm": 7.343894958496094, + "loss": 6.0116, + "lr": 0.000762, + "step": 382, + "tokens_trained": 0.036308416 + }, + { + "epoch": 0.10893617021276596, + "grad_norm": 8.182528495788574, + "loss": 5.9904, + "lr": 0.0007660000000000001, + "step": 384, + "tokens_trained": 0.036497264 + }, + { + "epoch": 0.10950354609929078, + "grad_norm": 7.927818775177002, + "loss": 6.0345, + "lr": 0.0007700000000000001, + "step": 386, + "tokens_trained": 0.036688192 + }, + { + "epoch": 0.1100709219858156, + "grad_norm": 8.07447338104248, + "loss": 5.9685, + "lr": 0.0007740000000000001, + "step": 388, + "tokens_trained": 0.036878256 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 7.281871318817139, + "loss": 6.0125, + "lr": 0.000778, + "step": 390, + "tokens_trained": 0.037068272 + }, + { + "epoch": 0.11120567375886525, + "grad_norm": 8.298929214477539, + "loss": 6.0071, + "lr": 0.000782, + "step": 392, + "tokens_trained": 0.037259464 + }, + { + "epoch": 0.11177304964539007, + "grad_norm": 7.546716690063477, + "loss": 5.9721, + "lr": 0.000786, + "step": 394, + "tokens_trained": 0.037449696 + }, + { + "epoch": 0.1123404255319149, + "grad_norm": 8.28548526763916, + "loss": 5.9819, + "lr": 0.00079, + "step": 396, + "tokens_trained": 0.037639672 + }, + { + "epoch": 0.11290780141843972, + "grad_norm": 7.064655303955078, + "loss": 5.9873, + "lr": 0.0007940000000000001, + "step": 398, + "tokens_trained": 0.03782712 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 7.743175506591797, + "loss": 5.9528, + "lr": 0.0007980000000000001, + "step": 400, + "tokens_trained": 0.03801792 + }, + { + "epoch": 0.11404255319148936, + "grad_norm": 7.00898551940918, + "loss": 5.9504, + "lr": 0.0008020000000000001, + "step": 402, + "tokens_trained": 0.038209176 + }, + { + "epoch": 0.11460992907801418, + "grad_norm": 7.9350409507751465, + "loss": 5.9555, + "lr": 0.0008060000000000001, + "step": 404, + "tokens_trained": 0.03839824 + }, + { + "epoch": 0.11517730496453901, + "grad_norm": 7.048569679260254, + "loss": 5.9787, + "lr": 0.0008100000000000001, + "step": 406, + "tokens_trained": 0.03858732 + }, + { + "epoch": 0.11574468085106383, + "grad_norm": 7.088194370269775, + "loss": 5.928, + "lr": 0.0008139999999999999, + "step": 408, + "tokens_trained": 0.038777712 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 8.230712890625, + "loss": 5.9716, + "lr": 0.0008179999999999999, + "step": 410, + "tokens_trained": 0.038969464 + }, + { + "epoch": 0.11687943262411347, + "grad_norm": 8.076972007751465, + "loss": 5.9624, + "lr": 0.0008219999999999999, + "step": 412, + "tokens_trained": 0.039162064 + }, + { + "epoch": 0.1174468085106383, + "grad_norm": 8.065289497375488, + "loss": 5.9937, + "lr": 0.000826, + "step": 414, + "tokens_trained": 0.039348688 + }, + { + "epoch": 0.11801418439716312, + "grad_norm": 6.393420696258545, + "loss": 5.9278, + "lr": 0.00083, + "step": 416, + "tokens_trained": 0.03953732 + }, + { + "epoch": 0.11858156028368794, + "grad_norm": 7.384702682495117, + "loss": 5.931, + "lr": 0.000834, + "step": 418, + "tokens_trained": 0.039729808 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 7.007425308227539, + "loss": 5.93, + "lr": 0.000838, + "step": 420, + "tokens_trained": 0.039921096 + }, + { + "epoch": 0.11971631205673759, + "grad_norm": 7.112692832946777, + "loss": 5.9625, + "lr": 0.000842, + "step": 422, + "tokens_trained": 0.040110856 + }, + { + "epoch": 0.12028368794326241, + "grad_norm": 8.484418869018555, + "loss": 5.9848, + "lr": 0.000846, + "step": 424, + "tokens_trained": 0.040300504 + }, + { + "epoch": 0.12085106382978723, + "grad_norm": 6.633459091186523, + "loss": 6.0226, + "lr": 0.00085, + "step": 426, + "tokens_trained": 0.04049056 + }, + { + "epoch": 0.12141843971631205, + "grad_norm": 7.796964168548584, + "loss": 5.9152, + "lr": 0.000854, + "step": 428, + "tokens_trained": 0.040680544 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 7.833578586578369, + "loss": 5.924, + "lr": 0.000858, + "step": 430, + "tokens_trained": 0.040873128 + }, + { + "epoch": 0.1225531914893617, + "grad_norm": 6.7470550537109375, + "loss": 5.9318, + "lr": 0.000862, + "step": 432, + "tokens_trained": 0.041063488 + }, + { + "epoch": 0.12312056737588653, + "grad_norm": 6.066318988800049, + "loss": 5.9569, + "lr": 0.000866, + "step": 434, + "tokens_trained": 0.041254368 + }, + { + "epoch": 0.12368794326241135, + "grad_norm": 6.753541469573975, + "loss": 5.8851, + "lr": 0.00087, + "step": 436, + "tokens_trained": 0.04144516 + }, + { + "epoch": 0.12425531914893617, + "grad_norm": 6.471331596374512, + "loss": 5.864, + "lr": 0.000874, + "step": 438, + "tokens_trained": 0.041636912 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 6.129056930541992, + "loss": 5.8965, + "lr": 0.000878, + "step": 440, + "tokens_trained": 0.041828104 + }, + { + "epoch": 0.1253900709219858, + "grad_norm": 6.478890895843506, + "loss": 5.8817, + "lr": 0.000882, + "step": 442, + "tokens_trained": 0.04201808 + }, + { + "epoch": 0.12595744680851065, + "grad_norm": 6.014713287353516, + "loss": 5.8268, + "lr": 0.0008860000000000001, + "step": 444, + "tokens_trained": 0.042207328 + }, + { + "epoch": 0.12652482269503545, + "grad_norm": 5.505755424499512, + "loss": 5.8684, + "lr": 0.0008900000000000001, + "step": 446, + "tokens_trained": 0.042398152 + }, + { + "epoch": 0.1270921985815603, + "grad_norm": 10.096606254577637, + "loss": 5.8608, + "lr": 0.000894, + "step": 448, + "tokens_trained": 0.042588984 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 6.388499736785889, + "loss": 5.8766, + "lr": 0.000898, + "step": 450, + "tokens_trained": 0.042778592 + }, + { + "epoch": 0.12822695035460993, + "grad_norm": 7.145125865936279, + "loss": 5.8571, + "lr": 0.000902, + "step": 452, + "tokens_trained": 0.042967176 + }, + { + "epoch": 0.12879432624113477, + "grad_norm": 6.826383113861084, + "loss": 5.8655, + "lr": 0.000906, + "step": 454, + "tokens_trained": 0.043158952 + }, + { + "epoch": 0.12936170212765957, + "grad_norm": 6.036892414093018, + "loss": 5.8775, + "lr": 0.00091, + "step": 456, + "tokens_trained": 0.043349288 + }, + { + "epoch": 0.1299290780141844, + "grad_norm": 6.36528205871582, + "loss": 5.8908, + "lr": 0.0009140000000000001, + "step": 458, + "tokens_trained": 0.043539888 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 6.317558288574219, + "loss": 5.8702, + "lr": 0.0009180000000000001, + "step": 460, + "tokens_trained": 0.04373232 + }, + { + "epoch": 0.13106382978723405, + "grad_norm": 6.427131175994873, + "loss": 5.8399, + "lr": 0.0009220000000000001, + "step": 462, + "tokens_trained": 0.043922744 + }, + { + "epoch": 0.13163120567375886, + "grad_norm": 5.666539669036865, + "loss": 5.7899, + "lr": 0.0009260000000000001, + "step": 464, + "tokens_trained": 0.044112888 + }, + { + "epoch": 0.1321985815602837, + "grad_norm": 5.241824150085449, + "loss": 5.8203, + "lr": 0.00093, + "step": 466, + "tokens_trained": 0.04430244 + }, + { + "epoch": 0.1327659574468085, + "grad_norm": 6.072646141052246, + "loss": 5.8367, + "lr": 0.000934, + "step": 468, + "tokens_trained": 0.044493528 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 6.414418697357178, + "loss": 5.8236, + "lr": 0.0009379999999999999, + "step": 470, + "tokens_trained": 0.044682328 + }, + { + "epoch": 0.13390070921985817, + "grad_norm": 6.958801746368408, + "loss": 5.8179, + "lr": 0.000942, + "step": 472, + "tokens_trained": 0.044874256 + }, + { + "epoch": 0.13446808510638297, + "grad_norm": 5.787843227386475, + "loss": 5.8478, + "lr": 0.000946, + "step": 474, + "tokens_trained": 0.045065616 + }, + { + "epoch": 0.1350354609929078, + "grad_norm": 5.5841240882873535, + "loss": 5.8307, + "lr": 0.00095, + "step": 476, + "tokens_trained": 0.045257024 + }, + { + "epoch": 0.13560283687943261, + "grad_norm": 6.607712745666504, + "loss": 5.8512, + "lr": 0.000954, + "step": 478, + "tokens_trained": 0.045446432 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 5.473597049713135, + "loss": 5.8174, + "lr": 0.000958, + "step": 480, + "tokens_trained": 0.045636392 + }, + { + "epoch": 0.13673758865248226, + "grad_norm": 5.435728549957275, + "loss": 5.8308, + "lr": 0.000962, + "step": 482, + "tokens_trained": 0.045823784 + }, + { + "epoch": 0.1373049645390071, + "grad_norm": 6.049300670623779, + "loss": 5.8293, + "lr": 0.000966, + "step": 484, + "tokens_trained": 0.046013408 + }, + { + "epoch": 0.13787234042553193, + "grad_norm": 6.311764717102051, + "loss": 5.8086, + "lr": 0.0009699999999999999, + "step": 486, + "tokens_trained": 0.046202528 + }, + { + "epoch": 0.13843971631205673, + "grad_norm": 5.886009216308594, + "loss": 5.7986, + "lr": 0.000974, + "step": 488, + "tokens_trained": 0.04639404 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 5.438202381134033, + "loss": 5.8473, + "lr": 0.000978, + "step": 490, + "tokens_trained": 0.046586512 + }, + { + "epoch": 0.13957446808510637, + "grad_norm": 5.08393669128418, + "loss": 5.7613, + "lr": 0.000982, + "step": 492, + "tokens_trained": 0.046777448 + }, + { + "epoch": 0.1401418439716312, + "grad_norm": 5.645389080047607, + "loss": 5.7723, + "lr": 0.0009860000000000001, + "step": 494, + "tokens_trained": 0.046966096 + }, + { + "epoch": 0.14070921985815601, + "grad_norm": 6.320916652679443, + "loss": 5.7772, + "lr": 0.00099, + "step": 496, + "tokens_trained": 0.047155152 + }, + { + "epoch": 0.14127659574468085, + "grad_norm": 5.573540210723877, + "loss": 5.7412, + "lr": 0.000994, + "step": 498, + "tokens_trained": 0.047345352 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 4.939594745635986, + "loss": 5.8208, + "lr": 0.000998, + "step": 500, + "tokens_trained": 0.047535016 + }, + { + "epoch": 0.14184397163120568, + "eval_loss": 5.799490928649902, + "eval_runtime": 20.8575, + "step": 500, + "tokens_trained": 0.047535016 + }, + { + "epoch": 0.1424113475177305, + "grad_norm": 5.805343151092529, + "loss": 5.7734, + "lr": 0.00099986013986014, + "step": 502, + "tokens_trained": 0.047724216 + }, + { + "epoch": 0.14297872340425533, + "grad_norm": 5.831176280975342, + "loss": 5.8044, + "lr": 0.0009995804195804196, + "step": 504, + "tokens_trained": 0.047914328 + }, + { + "epoch": 0.14354609929078013, + "grad_norm": 5.045091152191162, + "loss": 5.8133, + "lr": 0.0009993006993006994, + "step": 506, + "tokens_trained": 0.048105032 + }, + { + "epoch": 0.14411347517730497, + "grad_norm": 5.276819705963135, + "loss": 5.7555, + "lr": 0.000999020979020979, + "step": 508, + "tokens_trained": 0.048293104 + }, + { + "epoch": 0.14468085106382977, + "grad_norm": 5.710324287414551, + "loss": 5.7619, + "lr": 0.0009987412587412587, + "step": 510, + "tokens_trained": 0.048483888 + }, + { + "epoch": 0.1452482269503546, + "grad_norm": 4.9472527503967285, + "loss": 5.767, + "lr": 0.0009984615384615386, + "step": 512, + "tokens_trained": 0.04867336 + }, + { + "epoch": 0.14581560283687944, + "grad_norm": 5.410078525543213, + "loss": 5.7238, + "lr": 0.0009981818181818182, + "step": 514, + "tokens_trained": 0.048863104 + }, + { + "epoch": 0.14638297872340425, + "grad_norm": 6.025843143463135, + "loss": 5.7664, + "lr": 0.000997902097902098, + "step": 516, + "tokens_trained": 0.049053856 + }, + { + "epoch": 0.14695035460992908, + "grad_norm": 5.3211669921875, + "loss": 5.747, + "lr": 0.0009976223776223777, + "step": 518, + "tokens_trained": 0.049245104 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 6.059483051300049, + "loss": 5.7611, + "lr": 0.0009973426573426573, + "step": 520, + "tokens_trained": 0.049434368 + }, + { + "epoch": 0.14808510638297873, + "grad_norm": 5.362505912780762, + "loss": 5.7607, + "lr": 0.000997062937062937, + "step": 522, + "tokens_trained": 0.049622648 + }, + { + "epoch": 0.14865248226950353, + "grad_norm": 5.391371726989746, + "loss": 5.7857, + "lr": 0.0009967832167832168, + "step": 524, + "tokens_trained": 0.049812304 + }, + { + "epoch": 0.14921985815602837, + "grad_norm": 4.3839030265808105, + "loss": 5.7334, + "lr": 0.0009965034965034964, + "step": 526, + "tokens_trained": 0.05000356 + }, + { + "epoch": 0.1497872340425532, + "grad_norm": 5.008530616760254, + "loss": 5.7475, + "lr": 0.0009962237762237763, + "step": 528, + "tokens_trained": 0.050193304 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 5.068671226501465, + "loss": 5.7866, + "lr": 0.000995944055944056, + "step": 530, + "tokens_trained": 0.050382856 + }, + { + "epoch": 0.15092198581560284, + "grad_norm": 5.399240493774414, + "loss": 5.6857, + "lr": 0.0009956643356643356, + "step": 532, + "tokens_trained": 0.050570864 + }, + { + "epoch": 0.15148936170212765, + "grad_norm": 5.689481735229492, + "loss": 5.7586, + "lr": 0.0009953846153846154, + "step": 534, + "tokens_trained": 0.050760384 + }, + { + "epoch": 0.15205673758865249, + "grad_norm": 4.652275562286377, + "loss": 5.7866, + "lr": 0.000995104895104895, + "step": 536, + "tokens_trained": 0.050952712 + }, + { + "epoch": 0.1526241134751773, + "grad_norm": 4.126920223236084, + "loss": 5.7261, + "lr": 0.000994825174825175, + "step": 538, + "tokens_trained": 0.051141656 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 4.233098030090332, + "loss": 5.6903, + "lr": 0.0009945454545454546, + "step": 540, + "tokens_trained": 0.051331256 + }, + { + "epoch": 0.15375886524822696, + "grad_norm": 4.271973133087158, + "loss": 5.7293, + "lr": 0.0009942657342657344, + "step": 542, + "tokens_trained": 0.051522072 + }, + { + "epoch": 0.15432624113475177, + "grad_norm": 4.653008937835693, + "loss": 5.7133, + "lr": 0.000993986013986014, + "step": 544, + "tokens_trained": 0.051711624 + }, + { + "epoch": 0.1548936170212766, + "grad_norm": 4.192624092102051, + "loss": 5.6876, + "lr": 0.0009937062937062937, + "step": 546, + "tokens_trained": 0.051901744 + }, + { + "epoch": 0.1554609929078014, + "grad_norm": 5.497848033905029, + "loss": 5.7378, + "lr": 0.0009934265734265735, + "step": 548, + "tokens_trained": 0.052092872 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 4.350259780883789, + "loss": 5.6533, + "lr": 0.0009931468531468532, + "step": 550, + "tokens_trained": 0.052281768 + }, + { + "epoch": 0.15659574468085105, + "grad_norm": 4.515641689300537, + "loss": 5.7492, + "lr": 0.000992867132867133, + "step": 552, + "tokens_trained": 0.052471848 + }, + { + "epoch": 0.15716312056737589, + "grad_norm": 4.628066539764404, + "loss": 5.7113, + "lr": 0.0009925874125874127, + "step": 554, + "tokens_trained": 0.052660168 + }, + { + "epoch": 0.15773049645390072, + "grad_norm": 4.8322930335998535, + "loss": 5.6696, + "lr": 0.0009923076923076923, + "step": 556, + "tokens_trained": 0.05284776 + }, + { + "epoch": 0.15829787234042553, + "grad_norm": 3.999706506729126, + "loss": 5.7296, + "lr": 0.000992027972027972, + "step": 558, + "tokens_trained": 0.053037344 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 4.332971572875977, + "loss": 5.7362, + "lr": 0.0009917482517482518, + "step": 560, + "tokens_trained": 0.053228168 + }, + { + "epoch": 0.15943262411347517, + "grad_norm": 4.500301361083984, + "loss": 5.6982, + "lr": 0.0009914685314685314, + "step": 562, + "tokens_trained": 0.05341856 + }, + { + "epoch": 0.16, + "grad_norm": 4.721808910369873, + "loss": 5.7166, + "lr": 0.0009911888111888113, + "step": 564, + "tokens_trained": 0.053608824 + }, + { + "epoch": 0.1605673758865248, + "grad_norm": 5.265316009521484, + "loss": 5.7069, + "lr": 0.000990909090909091, + "step": 566, + "tokens_trained": 0.053799728 + }, + { + "epoch": 0.16113475177304964, + "grad_norm": 5.024131774902344, + "loss": 5.7113, + "lr": 0.0009906293706293705, + "step": 568, + "tokens_trained": 0.05398944 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 4.063276767730713, + "loss": 5.6251, + "lr": 0.0009903496503496504, + "step": 570, + "tokens_trained": 0.054176512 + }, + { + "epoch": 0.1622695035460993, + "grad_norm": 4.15974760055542, + "loss": 5.6912, + "lr": 0.00099006993006993, + "step": 572, + "tokens_trained": 0.054367072 + }, + { + "epoch": 0.16283687943262412, + "grad_norm": 4.338894844055176, + "loss": 5.6807, + "lr": 0.0009897902097902099, + "step": 574, + "tokens_trained": 0.054559184 + }, + { + "epoch": 0.16340425531914893, + "grad_norm": 5.535487174987793, + "loss": 5.6765, + "lr": 0.0009895104895104895, + "step": 576, + "tokens_trained": 0.054748904 + }, + { + "epoch": 0.16397163120567376, + "grad_norm": 4.379040241241455, + "loss": 5.6884, + "lr": 0.0009892307692307694, + "step": 578, + "tokens_trained": 0.054936136 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 4.746179103851318, + "loss": 5.6885, + "lr": 0.000988951048951049, + "step": 580, + "tokens_trained": 0.055125584 + }, + { + "epoch": 0.1651063829787234, + "grad_norm": 4.949806213378906, + "loss": 5.7061, + "lr": 0.0009886713286713286, + "step": 582, + "tokens_trained": 0.055314608 + }, + { + "epoch": 0.16567375886524824, + "grad_norm": 4.507448196411133, + "loss": 5.6339, + "lr": 0.0009883916083916085, + "step": 584, + "tokens_trained": 0.055503992 + }, + { + "epoch": 0.16624113475177305, + "grad_norm": 4.131013870239258, + "loss": 5.7122, + "lr": 0.0009881118881118881, + "step": 586, + "tokens_trained": 0.055693376 + }, + { + "epoch": 0.16680851063829788, + "grad_norm": 5.32897424697876, + "loss": 5.7192, + "lr": 0.000987832167832168, + "step": 588, + "tokens_trained": 0.05588452 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 4.166877746582031, + "loss": 5.6666, + "lr": 0.0009875524475524476, + "step": 590, + "tokens_trained": 0.056073936 + }, + { + "epoch": 0.16794326241134752, + "grad_norm": 4.393389701843262, + "loss": 5.6113, + "lr": 0.0009872727272727273, + "step": 592, + "tokens_trained": 0.056262224 + }, + { + "epoch": 0.16851063829787233, + "grad_norm": 4.466696739196777, + "loss": 5.6466, + "lr": 0.000986993006993007, + "step": 594, + "tokens_trained": 0.056454008 + }, + { + "epoch": 0.16907801418439716, + "grad_norm": 3.9413373470306396, + "loss": 5.6838, + "lr": 0.0009867132867132867, + "step": 596, + "tokens_trained": 0.05664444 + }, + { + "epoch": 0.169645390070922, + "grad_norm": 3.594649314880371, + "loss": 5.6684, + "lr": 0.0009864335664335664, + "step": 598, + "tokens_trained": 0.056833864 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.5969483852386475, + "loss": 5.6619, + "lr": 0.0009861538461538462, + "step": 600, + "tokens_trained": 0.05702332 + }, + { + "epoch": 0.17078014184397164, + "grad_norm": 3.845414638519287, + "loss": 5.5855, + "lr": 0.0009858741258741259, + "step": 602, + "tokens_trained": 0.057212776 + }, + { + "epoch": 0.17134751773049645, + "grad_norm": 3.9198834896087646, + "loss": 5.6551, + "lr": 0.0009855944055944055, + "step": 604, + "tokens_trained": 0.05740152 + }, + { + "epoch": 0.17191489361702128, + "grad_norm": 3.6764986515045166, + "loss": 5.6228, + "lr": 0.0009853146853146854, + "step": 606, + "tokens_trained": 0.057595616 + }, + { + "epoch": 0.1724822695035461, + "grad_norm": 3.8210043907165527, + "loss": 5.6557, + "lr": 0.000985034965034965, + "step": 608, + "tokens_trained": 0.057783968 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.893644094467163, + "loss": 5.6675, + "lr": 0.0009847552447552449, + "step": 610, + "tokens_trained": 0.057974832 + }, + { + "epoch": 0.17361702127659576, + "grad_norm": 3.280839681625366, + "loss": 5.6442, + "lr": 0.0009844755244755245, + "step": 612, + "tokens_trained": 0.058166272 + }, + { + "epoch": 0.17418439716312056, + "grad_norm": 3.4350404739379883, + "loss": 5.6555, + "lr": 0.0009841958041958043, + "step": 614, + "tokens_trained": 0.058356008 + }, + { + "epoch": 0.1747517730496454, + "grad_norm": 3.7700448036193848, + "loss": 5.6138, + "lr": 0.000983916083916084, + "step": 616, + "tokens_trained": 0.058546792 + }, + { + "epoch": 0.1753191489361702, + "grad_norm": 3.8182730674743652, + "loss": 5.6931, + "lr": 0.0009836363636363636, + "step": 618, + "tokens_trained": 0.058736296 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.9105372428894043, + "loss": 5.6431, + "lr": 0.0009833566433566435, + "step": 620, + "tokens_trained": 0.058927576 + }, + { + "epoch": 0.17645390070921985, + "grad_norm": 3.8897712230682373, + "loss": 5.6203, + "lr": 0.000983076923076923, + "step": 622, + "tokens_trained": 0.059118416 + }, + { + "epoch": 0.17702127659574468, + "grad_norm": 3.512194871902466, + "loss": 5.6292, + "lr": 0.000982797202797203, + "step": 624, + "tokens_trained": 0.059308568 + }, + { + "epoch": 0.1773049645390071, + "eval_loss": 5.630118370056152, + "eval_runtime": 21.1591, + "step": 625, + "tokens_trained": 0.059404056 + }, + { + "epoch": 0.17758865248226952, + "grad_norm": 2.990100383758545, + "loss": 5.622, + "lr": 0.0009825174825174826, + "step": 626, + "tokens_trained": 0.059499776 + }, + { + "epoch": 0.17815602836879432, + "grad_norm": 3.0487334728240967, + "loss": 5.6629, + "lr": 0.0009822377622377622, + "step": 628, + "tokens_trained": 0.059690208 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 3.6905510425567627, + "loss": 5.6345, + "lr": 0.0009819580419580419, + "step": 630, + "tokens_trained": 0.059881352 + }, + { + "epoch": 0.17929078014184396, + "grad_norm": 3.302255630493164, + "loss": 5.6733, + "lr": 0.0009816783216783217, + "step": 632, + "tokens_trained": 0.060071896 + }, + { + "epoch": 0.1798581560283688, + "grad_norm": 3.6833834648132324, + "loss": 5.5868, + "lr": 0.0009813986013986014, + "step": 634, + "tokens_trained": 0.060260504 + }, + { + "epoch": 0.1804255319148936, + "grad_norm": 3.1528804302215576, + "loss": 5.6128, + "lr": 0.0009811188811188812, + "step": 636, + "tokens_trained": 0.060450584 + }, + { + "epoch": 0.18099290780141844, + "grad_norm": 3.788860559463501, + "loss": 5.6235, + "lr": 0.0009808391608391608, + "step": 638, + "tokens_trained": 0.060640872 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 3.192462682723999, + "loss": 5.545, + "lr": 0.0009805594405594405, + "step": 640, + "tokens_trained": 0.060832776 + }, + { + "epoch": 0.18212765957446808, + "grad_norm": 3.505732774734497, + "loss": 5.5801, + "lr": 0.0009802797202797203, + "step": 642, + "tokens_trained": 0.06102204 + }, + { + "epoch": 0.18269503546099292, + "grad_norm": 3.9589102268218994, + "loss": 5.6091, + "lr": 0.00098, + "step": 644, + "tokens_trained": 0.061209744 + }, + { + "epoch": 0.18326241134751772, + "grad_norm": 3.4410059452056885, + "loss": 5.6279, + "lr": 0.0009797202797202798, + "step": 646, + "tokens_trained": 0.061400392 + }, + { + "epoch": 0.18382978723404256, + "grad_norm": 3.7746005058288574, + "loss": 5.6124, + "lr": 0.0009794405594405595, + "step": 648, + "tokens_trained": 0.061592232 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 3.75022292137146, + "loss": 5.5826, + "lr": 0.000979160839160839, + "step": 650, + "tokens_trained": 0.061781824 + }, + { + "epoch": 0.1849645390070922, + "grad_norm": 3.7629313468933105, + "loss": 5.555, + "lr": 0.000978881118881119, + "step": 652, + "tokens_trained": 0.061972744 + }, + { + "epoch": 0.18553191489361703, + "grad_norm": 4.5046820640563965, + "loss": 5.5972, + "lr": 0.0009786013986013986, + "step": 654, + "tokens_trained": 0.062163456 + }, + { + "epoch": 0.18609929078014184, + "grad_norm": 3.443138599395752, + "loss": 5.6061, + "lr": 0.0009783216783216782, + "step": 656, + "tokens_trained": 0.06235208 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 3.2661828994750977, + "loss": 5.5479, + "lr": 0.000978041958041958, + "step": 658, + "tokens_trained": 0.062544416 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.9571003913879395, + "loss": 5.6069, + "lr": 0.000977762237762238, + "step": 660, + "tokens_trained": 0.062733992 + }, + { + "epoch": 0.18780141843971632, + "grad_norm": 3.705880641937256, + "loss": 5.5915, + "lr": 0.0009774825174825176, + "step": 662, + "tokens_trained": 0.062922536 + }, + { + "epoch": 0.18836879432624112, + "grad_norm": 4.066433429718018, + "loss": 5.6031, + "lr": 0.0009772027972027972, + "step": 664, + "tokens_trained": 0.063114224 + }, + { + "epoch": 0.18893617021276596, + "grad_norm": 3.356651782989502, + "loss": 5.6045, + "lr": 0.0009769230769230768, + "step": 666, + "tokens_trained": 0.063304616 + }, + { + "epoch": 0.1895035460992908, + "grad_norm": 3.8084938526153564, + "loss": 5.6138, + "lr": 0.0009766433566433567, + "step": 668, + "tokens_trained": 0.06349476 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 4.282619953155518, + "loss": 5.5704, + "lr": 0.0009763636363636363, + "step": 670, + "tokens_trained": 0.063684848 + }, + { + "epoch": 0.19063829787234043, + "grad_norm": 3.045057773590088, + "loss": 5.6427, + "lr": 0.0009760839160839161, + "step": 672, + "tokens_trained": 0.063875192 + }, + { + "epoch": 0.19120567375886524, + "grad_norm": 3.360164165496826, + "loss": 5.5778, + "lr": 0.0009758041958041958, + "step": 674, + "tokens_trained": 0.06406636 + }, + { + "epoch": 0.19177304964539008, + "grad_norm": 3.5778472423553467, + "loss": 5.5389, + "lr": 0.0009755244755244756, + "step": 676, + "tokens_trained": 0.064254376 + }, + { + "epoch": 0.19234042553191488, + "grad_norm": 3.34869384765625, + "loss": 5.5894, + "lr": 0.0009752447552447553, + "step": 678, + "tokens_trained": 0.0644448 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 3.083582878112793, + "loss": 5.5776, + "lr": 0.0009749650349650349, + "step": 680, + "tokens_trained": 0.064633712 + }, + { + "epoch": 0.19347517730496455, + "grad_norm": 3.345973491668701, + "loss": 5.5987, + "lr": 0.0009746853146853148, + "step": 682, + "tokens_trained": 0.064824808 + }, + { + "epoch": 0.19404255319148936, + "grad_norm": 3.9262702465057373, + "loss": 5.64, + "lr": 0.0009744055944055944, + "step": 684, + "tokens_trained": 0.065016224 + }, + { + "epoch": 0.1946099290780142, + "grad_norm": 3.298543930053711, + "loss": 5.587, + "lr": 0.0009741258741258742, + "step": 686, + "tokens_trained": 0.065204216 + }, + { + "epoch": 0.195177304964539, + "grad_norm": 3.118626832962036, + "loss": 5.5864, + "lr": 0.0009738461538461538, + "step": 688, + "tokens_trained": 0.065393256 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.983548402786255, + "loss": 5.5506, + "lr": 0.0009735664335664336, + "step": 690, + "tokens_trained": 0.06558324 + }, + { + "epoch": 0.19631205673758864, + "grad_norm": 3.5204527378082275, + "loss": 5.5336, + "lr": 0.0009732867132867133, + "step": 692, + "tokens_trained": 0.065775624 + }, + { + "epoch": 0.19687943262411348, + "grad_norm": 3.138550281524658, + "loss": 5.5677, + "lr": 0.000973006993006993, + "step": 694, + "tokens_trained": 0.0659666 + }, + { + "epoch": 0.1974468085106383, + "grad_norm": 3.0961053371429443, + "loss": 5.5714, + "lr": 0.0009727272727272728, + "step": 696, + "tokens_trained": 0.066155512 + }, + { + "epoch": 0.19801418439716312, + "grad_norm": 3.4929685592651367, + "loss": 5.5829, + "lr": 0.0009724475524475524, + "step": 698, + "tokens_trained": 0.06634576 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.1820616722106934, + "loss": 5.6108, + "lr": 0.0009721678321678323, + "step": 700, + "tokens_trained": 0.066537016 + }, + { + "epoch": 0.19914893617021276, + "grad_norm": 3.4244654178619385, + "loss": 5.6025, + "lr": 0.0009718881118881119, + "step": 702, + "tokens_trained": 0.066727856 + }, + { + "epoch": 0.1997163120567376, + "grad_norm": 3.258605480194092, + "loss": 5.5581, + "lr": 0.0009716083916083917, + "step": 704, + "tokens_trained": 0.066916672 + }, + { + "epoch": 0.2002836879432624, + "grad_norm": 2.7159688472747803, + "loss": 5.5478, + "lr": 0.0009713286713286713, + "step": 706, + "tokens_trained": 0.067107704 + }, + { + "epoch": 0.20085106382978724, + "grad_norm": 3.1941912174224854, + "loss": 5.6126, + "lr": 0.000971048951048951, + "step": 708, + "tokens_trained": 0.067297896 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 3.20470929145813, + "loss": 5.5628, + "lr": 0.0009707692307692308, + "step": 710, + "tokens_trained": 0.06748608 + }, + { + "epoch": 0.20198581560283688, + "grad_norm": 3.6400153636932373, + "loss": 5.5758, + "lr": 0.0009704895104895105, + "step": 712, + "tokens_trained": 0.0676766 + }, + { + "epoch": 0.2025531914893617, + "grad_norm": 2.881639003753662, + "loss": 5.5512, + "lr": 0.0009702097902097903, + "step": 714, + "tokens_trained": 0.067865848 + }, + { + "epoch": 0.20312056737588652, + "grad_norm": 3.1113905906677246, + "loss": 5.5396, + "lr": 0.0009699300699300699, + "step": 716, + "tokens_trained": 0.068055368 + }, + { + "epoch": 0.20368794326241135, + "grad_norm": 3.135014772415161, + "loss": 5.5763, + "lr": 0.0009696503496503498, + "step": 718, + "tokens_trained": 0.068248544 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 3.1870718002319336, + "loss": 5.5903, + "lr": 0.0009693706293706294, + "step": 720, + "tokens_trained": 0.068436944 + }, + { + "epoch": 0.204822695035461, + "grad_norm": 3.125596523284912, + "loss": 5.6033, + "lr": 0.0009690909090909091, + "step": 722, + "tokens_trained": 0.06862548 + }, + { + "epoch": 0.20539007092198583, + "grad_norm": 2.897671699523926, + "loss": 5.5946, + "lr": 0.0009688111888111888, + "step": 724, + "tokens_trained": 0.068815232 + }, + { + "epoch": 0.20595744680851064, + "grad_norm": 2.855313539505005, + "loss": 5.5731, + "lr": 0.0009685314685314685, + "step": 726, + "tokens_trained": 0.06900692 + }, + { + "epoch": 0.20652482269503547, + "grad_norm": 2.7760672569274902, + "loss": 5.4949, + "lr": 0.0009682517482517483, + "step": 728, + "tokens_trained": 0.069195376 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 2.9300007820129395, + "loss": 5.5491, + "lr": 0.000967972027972028, + "step": 730, + "tokens_trained": 0.069385512 + }, + { + "epoch": 0.2076595744680851, + "grad_norm": 3.299860954284668, + "loss": 5.5405, + "lr": 0.0009676923076923078, + "step": 732, + "tokens_trained": 0.069573304 + }, + { + "epoch": 0.20822695035460992, + "grad_norm": 3.300189256668091, + "loss": 5.5797, + "lr": 0.0009674125874125874, + "step": 734, + "tokens_trained": 0.069764248 + }, + { + "epoch": 0.20879432624113475, + "grad_norm": 2.932995557785034, + "loss": 5.5556, + "lr": 0.0009671328671328672, + "step": 736, + "tokens_trained": 0.06995496 + }, + { + "epoch": 0.2093617021276596, + "grad_norm": 2.6711719036102295, + "loss": 5.48, + "lr": 0.0009668531468531469, + "step": 738, + "tokens_trained": 0.070142776 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 2.833314895629883, + "loss": 5.542, + "lr": 0.0009665734265734266, + "step": 740, + "tokens_trained": 0.070332064 + }, + { + "epoch": 0.21049645390070923, + "grad_norm": 2.899843215942383, + "loss": 5.5649, + "lr": 0.0009662937062937063, + "step": 742, + "tokens_trained": 0.070523448 + }, + { + "epoch": 0.21106382978723404, + "grad_norm": 2.96528697013855, + "loss": 5.5277, + "lr": 0.000966013986013986, + "step": 744, + "tokens_trained": 0.070713768 + }, + { + "epoch": 0.21163120567375887, + "grad_norm": 2.921109437942505, + "loss": 5.5646, + "lr": 0.0009657342657342657, + "step": 746, + "tokens_trained": 0.070905704 + }, + { + "epoch": 0.21219858156028368, + "grad_norm": 3.2725329399108887, + "loss": 5.4786, + "lr": 0.0009654545454545455, + "step": 748, + "tokens_trained": 0.071096008 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8296804428100586, + "loss": 5.573, + "lr": 0.0009651748251748252, + "step": 750, + "tokens_trained": 0.07128828 + }, + { + "epoch": 0.2127659574468085, + "eval_loss": 5.535472869873047, + "eval_runtime": 21.0109, + "step": 750, + "tokens_trained": 0.07128828 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 3.0509591102600098, + "loss": 5.6037, + "lr": 0.0009648951048951049, + "step": 752, + "tokens_trained": 0.071479496 + }, + { + "epoch": 0.21390070921985815, + "grad_norm": 2.6773571968078613, + "loss": 5.5266, + "lr": 0.0009646153846153846, + "step": 754, + "tokens_trained": 0.071668568 + }, + { + "epoch": 0.214468085106383, + "grad_norm": 2.9600210189819336, + "loss": 5.5362, + "lr": 0.0009643356643356644, + "step": 756, + "tokens_trained": 0.071860552 + }, + { + "epoch": 0.2150354609929078, + "grad_norm": 2.6674885749816895, + "loss": 5.5388, + "lr": 0.0009640559440559441, + "step": 758, + "tokens_trained": 0.07204912 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.50179386138916, + "loss": 5.5027, + "lr": 0.0009637762237762237, + "step": 760, + "tokens_trained": 0.072239952 + }, + { + "epoch": 0.21617021276595744, + "grad_norm": 2.843411684036255, + "loss": 5.5221, + "lr": 0.0009634965034965035, + "step": 762, + "tokens_trained": 0.07243076 + }, + { + "epoch": 0.21673758865248227, + "grad_norm": 2.8686277866363525, + "loss": 5.4896, + "lr": 0.0009632167832167832, + "step": 764, + "tokens_trained": 0.072623272 + }, + { + "epoch": 0.2173049645390071, + "grad_norm": 2.611424684524536, + "loss": 5.5557, + "lr": 0.000962937062937063, + "step": 766, + "tokens_trained": 0.07281408 + }, + { + "epoch": 0.2178723404255319, + "grad_norm": 3.013145685195923, + "loss": 5.4964, + "lr": 0.0009626573426573427, + "step": 768, + "tokens_trained": 0.073005016 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.8682022094726562, + "loss": 5.5232, + "lr": 0.0009623776223776224, + "step": 770, + "tokens_trained": 0.07319652 + }, + { + "epoch": 0.21900709219858155, + "grad_norm": 2.6478466987609863, + "loss": 5.5517, + "lr": 0.0009620979020979021, + "step": 772, + "tokens_trained": 0.073387048 + }, + { + "epoch": 0.2195744680851064, + "grad_norm": 2.7273097038269043, + "loss": 5.5572, + "lr": 0.0009618181818181818, + "step": 774, + "tokens_trained": 0.073577424 + }, + { + "epoch": 0.2201418439716312, + "grad_norm": 3.104907751083374, + "loss": 5.5081, + "lr": 0.0009615384615384616, + "step": 776, + "tokens_trained": 0.073766712 + }, + { + "epoch": 0.22070921985815603, + "grad_norm": 2.9616432189941406, + "loss": 5.5059, + "lr": 0.0009612587412587412, + "step": 778, + "tokens_trained": 0.073956272 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.330319881439209, + "loss": 5.4811, + "lr": 0.000960979020979021, + "step": 780, + "tokens_trained": 0.074144008 + }, + { + "epoch": 0.22184397163120567, + "grad_norm": 2.964371919631958, + "loss": 5.4763, + "lr": 0.0009606993006993007, + "step": 782, + "tokens_trained": 0.074333888 + }, + { + "epoch": 0.2224113475177305, + "grad_norm": 3.13899827003479, + "loss": 5.5262, + "lr": 0.0009604195804195805, + "step": 784, + "tokens_trained": 0.074523584 + }, + { + "epoch": 0.2229787234042553, + "grad_norm": 3.2576637268066406, + "loss": 5.4983, + "lr": 0.0009601398601398602, + "step": 786, + "tokens_trained": 0.074714128 + }, + { + "epoch": 0.22354609929078015, + "grad_norm": 2.916149616241455, + "loss": 5.504, + "lr": 0.0009598601398601398, + "step": 788, + "tokens_trained": 0.074905104 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.842733144760132, + "loss": 5.4997, + "lr": 0.0009595804195804196, + "step": 790, + "tokens_trained": 0.075096328 + }, + { + "epoch": 0.2246808510638298, + "grad_norm": 2.880695104598999, + "loss": 5.5131, + "lr": 0.0009593006993006993, + "step": 792, + "tokens_trained": 0.075286104 + }, + { + "epoch": 0.2252482269503546, + "grad_norm": 2.620516300201416, + "loss": 5.5291, + "lr": 0.0009590209790209791, + "step": 794, + "tokens_trained": 0.075477392 + }, + { + "epoch": 0.22581560283687943, + "grad_norm": 2.622455358505249, + "loss": 5.5433, + "lr": 0.0009587412587412587, + "step": 796, + "tokens_trained": 0.0756682 + }, + { + "epoch": 0.22638297872340427, + "grad_norm": 2.532047986984253, + "loss": 5.5169, + "lr": 0.0009584615384615385, + "step": 798, + "tokens_trained": 0.075856528 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 2.628110885620117, + "loss": 5.5369, + "lr": 0.0009581818181818182, + "step": 800, + "tokens_trained": 0.076046256 + }, + { + "epoch": 0.2275177304964539, + "grad_norm": 2.376600980758667, + "loss": 5.4888, + "lr": 0.000957902097902098, + "step": 802, + "tokens_trained": 0.076236016 + }, + { + "epoch": 0.22808510638297871, + "grad_norm": 2.433666706085205, + "loss": 5.5044, + "lr": 0.0009576223776223777, + "step": 804, + "tokens_trained": 0.07642324 + }, + { + "epoch": 0.22865248226950355, + "grad_norm": 2.3850929737091064, + "loss": 5.4941, + "lr": 0.0009573426573426573, + "step": 806, + "tokens_trained": 0.07661376 + }, + { + "epoch": 0.22921985815602836, + "grad_norm": 2.4664969444274902, + "loss": 5.5257, + "lr": 0.0009570629370629371, + "step": 808, + "tokens_trained": 0.076804952 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 2.8514602184295654, + "loss": 5.5335, + "lr": 0.0009567832167832168, + "step": 810, + "tokens_trained": 0.076995064 + }, + { + "epoch": 0.23035460992907802, + "grad_norm": 2.508887767791748, + "loss": 5.5093, + "lr": 0.0009565034965034966, + "step": 812, + "tokens_trained": 0.077185344 + }, + { + "epoch": 0.23092198581560283, + "grad_norm": 2.5842514038085938, + "loss": 5.5246, + "lr": 0.0009562237762237762, + "step": 814, + "tokens_trained": 0.077375232 + }, + { + "epoch": 0.23148936170212767, + "grad_norm": 2.621562957763672, + "loss": 5.4948, + "lr": 0.0009559440559440559, + "step": 816, + "tokens_trained": 0.07756528 + }, + { + "epoch": 0.23205673758865247, + "grad_norm": 2.3230698108673096, + "loss": 5.5367, + "lr": 0.0009556643356643357, + "step": 818, + "tokens_trained": 0.077754936 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 2.728039264678955, + "loss": 5.4548, + "lr": 0.0009553846153846154, + "step": 820, + "tokens_trained": 0.077944056 + }, + { + "epoch": 0.23319148936170211, + "grad_norm": 2.786271333694458, + "loss": 5.4701, + "lr": 0.0009551048951048952, + "step": 822, + "tokens_trained": 0.07813272 + }, + { + "epoch": 0.23375886524822695, + "grad_norm": 2.449995517730713, + "loss": 5.5505, + "lr": 0.0009548251748251748, + "step": 824, + "tokens_trained": 0.078321888 + }, + { + "epoch": 0.23432624113475178, + "grad_norm": 2.394447088241577, + "loss": 5.4709, + "lr": 0.0009545454545454546, + "step": 826, + "tokens_trained": 0.078510288 + }, + { + "epoch": 0.2348936170212766, + "grad_norm": 2.5857675075531006, + "loss": 5.4986, + "lr": 0.0009542657342657343, + "step": 828, + "tokens_trained": 0.078698032 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.728743314743042, + "loss": 5.4983, + "lr": 0.000953986013986014, + "step": 830, + "tokens_trained": 0.078890608 + }, + { + "epoch": 0.23602836879432623, + "grad_norm": 2.3619866371154785, + "loss": 5.4985, + "lr": 0.0009537062937062937, + "step": 832, + "tokens_trained": 0.079081968 + }, + { + "epoch": 0.23659574468085107, + "grad_norm": 2.6265158653259277, + "loss": 5.5088, + "lr": 0.0009534265734265734, + "step": 834, + "tokens_trained": 0.079270712 + }, + { + "epoch": 0.23716312056737587, + "grad_norm": 2.3731281757354736, + "loss": 5.4682, + "lr": 0.0009531468531468532, + "step": 836, + "tokens_trained": 0.079459912 + }, + { + "epoch": 0.2377304964539007, + "grad_norm": 2.375283718109131, + "loss": 5.4278, + "lr": 0.0009528671328671329, + "step": 838, + "tokens_trained": 0.079649408 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 2.6856729984283447, + "loss": 5.5277, + "lr": 0.0009525874125874127, + "step": 840, + "tokens_trained": 0.079839552 + }, + { + "epoch": 0.23886524822695035, + "grad_norm": 2.5037410259246826, + "loss": 5.5022, + "lr": 0.0009523076923076923, + "step": 842, + "tokens_trained": 0.08002732 + }, + { + "epoch": 0.23943262411347518, + "grad_norm": 2.25175404548645, + "loss": 5.4918, + "lr": 0.000952027972027972, + "step": 844, + "tokens_trained": 0.080216416 + }, + { + "epoch": 0.24, + "grad_norm": 2.3555264472961426, + "loss": 5.5134, + "lr": 0.0009517482517482518, + "step": 846, + "tokens_trained": 0.080406928 + }, + { + "epoch": 0.24056737588652483, + "grad_norm": 2.390998601913452, + "loss": 5.4721, + "lr": 0.0009514685314685315, + "step": 848, + "tokens_trained": 0.080596232 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 2.1585986614227295, + "loss": 5.4511, + "lr": 0.0009511888111888112, + "step": 850, + "tokens_trained": 0.080786848 + }, + { + "epoch": 0.24170212765957447, + "grad_norm": 2.7733986377716064, + "loss": 5.5269, + "lr": 0.0009509090909090909, + "step": 852, + "tokens_trained": 0.080978144 + }, + { + "epoch": 0.2422695035460993, + "grad_norm": 2.8021209239959717, + "loss": 5.4751, + "lr": 0.0009506293706293707, + "step": 854, + "tokens_trained": 0.081167712 + }, + { + "epoch": 0.2428368794326241, + "grad_norm": 2.5434224605560303, + "loss": 5.5154, + "lr": 0.0009503496503496504, + "step": 856, + "tokens_trained": 0.081357584 + }, + { + "epoch": 0.24340425531914894, + "grad_norm": 2.456421136856079, + "loss": 5.5459, + "lr": 0.0009500699300699301, + "step": 858, + "tokens_trained": 0.081545992 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 2.317312002182007, + "loss": 5.4644, + "lr": 0.0009497902097902098, + "step": 860, + "tokens_trained": 0.081735392 + }, + { + "epoch": 0.24453900709219858, + "grad_norm": 2.3580780029296875, + "loss": 5.4359, + "lr": 0.0009495104895104895, + "step": 862, + "tokens_trained": 0.081925608 + }, + { + "epoch": 0.2451063829787234, + "grad_norm": 2.6440224647521973, + "loss": 5.4757, + "lr": 0.0009492307692307693, + "step": 864, + "tokens_trained": 0.08211328 + }, + { + "epoch": 0.24567375886524823, + "grad_norm": 2.5468132495880127, + "loss": 5.4115, + "lr": 0.000948951048951049, + "step": 866, + "tokens_trained": 0.082303736 + }, + { + "epoch": 0.24624113475177306, + "grad_norm": 2.431992530822754, + "loss": 5.4655, + "lr": 0.0009486713286713286, + "step": 868, + "tokens_trained": 0.082492896 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 2.443335771560669, + "loss": 5.4684, + "lr": 0.0009483916083916084, + "step": 870, + "tokens_trained": 0.082684024 + }, + { + "epoch": 0.2473758865248227, + "grad_norm": 2.6467180252075195, + "loss": 5.5017, + "lr": 0.0009481118881118881, + "step": 872, + "tokens_trained": 0.08287444 + }, + { + "epoch": 0.2479432624113475, + "grad_norm": 2.6044974327087402, + "loss": 5.4637, + "lr": 0.0009478321678321679, + "step": 874, + "tokens_trained": 0.08306436 + }, + { + "epoch": 0.24822695035460993, + "eval_loss": 5.4816508293151855, + "eval_runtime": 20.9467, + "step": 875, + "tokens_trained": 0.083158888 + }, + { + "epoch": 0.24851063829787234, + "grad_norm": 2.6221189498901367, + "loss": 5.4785, + "lr": 0.0009475524475524476, + "step": 876, + "tokens_trained": 0.083253472 + }, + { + "epoch": 0.24907801418439715, + "grad_norm": 2.409327983856201, + "loss": 5.42, + "lr": 0.0009472727272727273, + "step": 878, + "tokens_trained": 0.08344528 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 2.2504723072052, + "loss": 5.399, + "lr": 0.000946993006993007, + "step": 880, + "tokens_trained": 0.083635752 + }, + { + "epoch": 0.2502127659574468, + "grad_norm": 2.3018665313720703, + "loss": 5.4512, + "lr": 0.0009467132867132868, + "step": 882, + "tokens_trained": 0.08382576 + }, + { + "epoch": 0.2507801418439716, + "grad_norm": 2.5774636268615723, + "loss": 5.4592, + "lr": 0.0009464335664335665, + "step": 884, + "tokens_trained": 0.084016232 + }, + { + "epoch": 0.25134751773049646, + "grad_norm": 2.614935874938965, + "loss": 5.4772, + "lr": 0.0009461538461538461, + "step": 886, + "tokens_trained": 0.084206992 + }, + { + "epoch": 0.2519148936170213, + "grad_norm": 2.4281506538391113, + "loss": 5.4972, + "lr": 0.0009458741258741259, + "step": 888, + "tokens_trained": 0.084395848 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.3668100833892822, + "loss": 5.4505, + "lr": 0.0009455944055944056, + "step": 890, + "tokens_trained": 0.084583704 + }, + { + "epoch": 0.2530496453900709, + "grad_norm": 2.1937146186828613, + "loss": 5.4981, + "lr": 0.0009453146853146854, + "step": 892, + "tokens_trained": 0.08477096 + }, + { + "epoch": 0.25361702127659574, + "grad_norm": 2.2917556762695312, + "loss": 5.4224, + "lr": 0.000945034965034965, + "step": 894, + "tokens_trained": 0.084961048 + }, + { + "epoch": 0.2541843971631206, + "grad_norm": 2.1254703998565674, + "loss": 5.4409, + "lr": 0.0009447552447552447, + "step": 896, + "tokens_trained": 0.085153256 + }, + { + "epoch": 0.2547517730496454, + "grad_norm": 2.267159938812256, + "loss": 5.4527, + "lr": 0.0009444755244755245, + "step": 898, + "tokens_trained": 0.085343128 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 2.1975555419921875, + "loss": 5.516, + "lr": 0.0009441958041958042, + "step": 900, + "tokens_trained": 0.085534024 + }, + { + "epoch": 0.255886524822695, + "grad_norm": 2.3459436893463135, + "loss": 5.4592, + "lr": 0.000943916083916084, + "step": 902, + "tokens_trained": 0.085725136 + }, + { + "epoch": 0.25645390070921986, + "grad_norm": 2.4788501262664795, + "loss": 5.3937, + "lr": 0.0009436363636363636, + "step": 904, + "tokens_trained": 0.08591548 + }, + { + "epoch": 0.2570212765957447, + "grad_norm": 2.415065288543701, + "loss": 5.3991, + "lr": 0.0009433566433566434, + "step": 906, + "tokens_trained": 0.086105008 + }, + { + "epoch": 0.25758865248226953, + "grad_norm": 2.1260058879852295, + "loss": 5.4122, + "lr": 0.0009430769230769231, + "step": 908, + "tokens_trained": 0.08629424 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.1759092807769775, + "loss": 5.4663, + "lr": 0.0009427972027972029, + "step": 910, + "tokens_trained": 0.086485784 + }, + { + "epoch": 0.25872340425531914, + "grad_norm": 2.3481245040893555, + "loss": 5.4398, + "lr": 0.0009425174825174825, + "step": 912, + "tokens_trained": 0.086676744 + }, + { + "epoch": 0.259290780141844, + "grad_norm": 2.312612533569336, + "loss": 5.4615, + "lr": 0.0009422377622377622, + "step": 914, + "tokens_trained": 0.086866424 + }, + { + "epoch": 0.2598581560283688, + "grad_norm": 2.4709548950195312, + "loss": 5.4062, + "lr": 0.000941958041958042, + "step": 916, + "tokens_trained": 0.087055824 + }, + { + "epoch": 0.2604255319148936, + "grad_norm": 2.3664543628692627, + "loss": 5.4696, + "lr": 0.0009416783216783217, + "step": 918, + "tokens_trained": 0.087244136 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 2.423687696456909, + "loss": 5.4762, + "lr": 0.0009413986013986015, + "step": 920, + "tokens_trained": 0.087432584 + }, + { + "epoch": 0.26156028368794326, + "grad_norm": 2.4002890586853027, + "loss": 5.4743, + "lr": 0.0009411188811188811, + "step": 922, + "tokens_trained": 0.087622248 + }, + { + "epoch": 0.2621276595744681, + "grad_norm": 2.107527494430542, + "loss": 5.4013, + "lr": 0.0009408391608391608, + "step": 924, + "tokens_trained": 0.087809888 + }, + { + "epoch": 0.26269503546099293, + "grad_norm": 2.05177640914917, + "loss": 5.4601, + "lr": 0.0009405594405594406, + "step": 926, + "tokens_trained": 0.088002704 + }, + { + "epoch": 0.2632624113475177, + "grad_norm": 2.303874969482422, + "loss": 5.456, + "lr": 0.0009402797202797203, + "step": 928, + "tokens_trained": 0.088191344 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.4369659423828125, + "loss": 5.4162, + "lr": 0.00094, + "step": 930, + "tokens_trained": 0.088380832 + }, + { + "epoch": 0.2643971631205674, + "grad_norm": 2.4750819206237793, + "loss": 5.455, + "lr": 0.0009397202797202797, + "step": 932, + "tokens_trained": 0.088569936 + }, + { + "epoch": 0.2649645390070922, + "grad_norm": 2.09557843208313, + "loss": 5.4273, + "lr": 0.0009394405594405595, + "step": 934, + "tokens_trained": 0.08876116 + }, + { + "epoch": 0.265531914893617, + "grad_norm": 2.0984373092651367, + "loss": 5.4342, + "lr": 0.0009391608391608392, + "step": 936, + "tokens_trained": 0.088951032 + }, + { + "epoch": 0.26609929078014183, + "grad_norm": 2.1150097846984863, + "loss": 5.4344, + "lr": 0.000938881118881119, + "step": 938, + "tokens_trained": 0.08914124 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.1577563285827637, + "loss": 5.455, + "lr": 0.0009386013986013986, + "step": 940, + "tokens_trained": 0.089330952 + }, + { + "epoch": 0.2672340425531915, + "grad_norm": 2.0483016967773438, + "loss": 5.413, + "lr": 0.0009383216783216783, + "step": 942, + "tokens_trained": 0.08952116 + }, + { + "epoch": 0.26780141843971633, + "grad_norm": 2.3116559982299805, + "loss": 5.455, + "lr": 0.0009380419580419581, + "step": 944, + "tokens_trained": 0.089712888 + }, + { + "epoch": 0.2683687943262411, + "grad_norm": 2.2459256649017334, + "loss": 5.3971, + "lr": 0.0009377622377622378, + "step": 946, + "tokens_trained": 0.089903936 + }, + { + "epoch": 0.26893617021276595, + "grad_norm": 2.3048787117004395, + "loss": 5.4454, + "lr": 0.0009374825174825175, + "step": 948, + "tokens_trained": 0.090095888 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.196735143661499, + "loss": 5.4101, + "lr": 0.0009372027972027972, + "step": 950, + "tokens_trained": 0.090287472 + }, + { + "epoch": 0.2700709219858156, + "grad_norm": 2.3908562660217285, + "loss": 5.4731, + "lr": 0.0009369230769230769, + "step": 952, + "tokens_trained": 0.090476568 + }, + { + "epoch": 0.27063829787234045, + "grad_norm": 2.154932975769043, + "loss": 5.4104, + "lr": 0.0009366433566433567, + "step": 954, + "tokens_trained": 0.090665592 + }, + { + "epoch": 0.27120567375886523, + "grad_norm": 2.340907096862793, + "loss": 5.3707, + "lr": 0.0009363636363636364, + "step": 956, + "tokens_trained": 0.090853232 + }, + { + "epoch": 0.27177304964539006, + "grad_norm": 2.1736438274383545, + "loss": 5.4484, + "lr": 0.0009360839160839161, + "step": 958, + "tokens_trained": 0.091043808 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 2.3518154621124268, + "loss": 5.4919, + "lr": 0.0009358041958041958, + "step": 960, + "tokens_trained": 0.09123384 + }, + { + "epoch": 0.27290780141843973, + "grad_norm": 2.6673426628112793, + "loss": 5.4008, + "lr": 0.0009355244755244755, + "step": 962, + "tokens_trained": 0.091422544 + }, + { + "epoch": 0.2734751773049645, + "grad_norm": 2.4755311012268066, + "loss": 5.4533, + "lr": 0.0009352447552447553, + "step": 964, + "tokens_trained": 0.09161544 + }, + { + "epoch": 0.27404255319148935, + "grad_norm": 2.338452100753784, + "loss": 5.4953, + "lr": 0.0009349650349650349, + "step": 966, + "tokens_trained": 0.091806344 + }, + { + "epoch": 0.2746099290780142, + "grad_norm": 2.170426845550537, + "loss": 5.4588, + "lr": 0.0009346853146853147, + "step": 968, + "tokens_trained": 0.091996648 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.2587599754333496, + "loss": 5.4547, + "lr": 0.0009344055944055944, + "step": 970, + "tokens_trained": 0.09218848 + }, + { + "epoch": 0.27574468085106385, + "grad_norm": 2.0009043216705322, + "loss": 5.4116, + "lr": 0.0009341258741258742, + "step": 972, + "tokens_trained": 0.092377984 + }, + { + "epoch": 0.27631205673758863, + "grad_norm": 2.0617294311523438, + "loss": 5.4541, + "lr": 0.0009338461538461539, + "step": 974, + "tokens_trained": 0.092569472 + }, + { + "epoch": 0.27687943262411346, + "grad_norm": 2.059300661087036, + "loss": 5.4414, + "lr": 0.0009335664335664336, + "step": 976, + "tokens_trained": 0.092758496 + }, + { + "epoch": 0.2774468085106383, + "grad_norm": 2.2815263271331787, + "loss": 5.4435, + "lr": 0.0009332867132867133, + "step": 978, + "tokens_trained": 0.092950368 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.1770365238189697, + "loss": 5.4481, + "lr": 0.0009330069930069929, + "step": 980, + "tokens_trained": 0.093140552 + }, + { + "epoch": 0.27858156028368797, + "grad_norm": 2.0089797973632812, + "loss": 5.4117, + "lr": 0.0009327272727272728, + "step": 982, + "tokens_trained": 0.093332312 + }, + { + "epoch": 0.27914893617021275, + "grad_norm": 2.2188286781311035, + "loss": 5.4594, + "lr": 0.0009324475524475524, + "step": 984, + "tokens_trained": 0.093520792 + }, + { + "epoch": 0.2797163120567376, + "grad_norm": 2.310481548309326, + "loss": 5.393, + "lr": 0.0009321678321678322, + "step": 986, + "tokens_trained": 0.093710608 + }, + { + "epoch": 0.2802836879432624, + "grad_norm": 2.3832972049713135, + "loss": 5.4277, + "lr": 0.0009318881118881119, + "step": 988, + "tokens_trained": 0.093900952 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 2.011126756668091, + "loss": 5.4097, + "lr": 0.0009316083916083917, + "step": 990, + "tokens_trained": 0.094091 + }, + { + "epoch": 0.28141843971631203, + "grad_norm": 2.2632968425750732, + "loss": 5.4388, + "lr": 0.0009313286713286714, + "step": 992, + "tokens_trained": 0.094281216 + }, + { + "epoch": 0.28198581560283686, + "grad_norm": 2.3477587699890137, + "loss": 5.3728, + "lr": 0.000931048951048951, + "step": 994, + "tokens_trained": 0.094470264 + }, + { + "epoch": 0.2825531914893617, + "grad_norm": 2.486196756362915, + "loss": 5.414, + "lr": 0.0009307692307692308, + "step": 996, + "tokens_trained": 0.094662816 + }, + { + "epoch": 0.28312056737588653, + "grad_norm": 2.5286316871643066, + "loss": 5.4063, + "lr": 0.0009304895104895104, + "step": 998, + "tokens_trained": 0.094852896 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.712406635284424, + "loss": 5.4274, + "lr": 0.0009302097902097903, + "step": 1000, + "tokens_trained": 0.09504336 + }, + { + "epoch": 0.28368794326241137, + "eval_loss": 5.434450626373291, + "eval_runtime": 21.0388, + "step": 1000, + "tokens_trained": 0.09504336 + }, + { + "epoch": 0.28425531914893615, + "grad_norm": 2.245316505432129, + "loss": 5.3551, + "lr": 0.0009299300699300699, + "step": 1002, + "tokens_trained": 0.095233944 + }, + { + "epoch": 0.284822695035461, + "grad_norm": 2.335533618927002, + "loss": 5.4608, + "lr": 0.0009296503496503497, + "step": 1004, + "tokens_trained": 0.095423184 + }, + { + "epoch": 0.2853900709219858, + "grad_norm": 2.232128858566284, + "loss": 5.4374, + "lr": 0.0009293706293706294, + "step": 1006, + "tokens_trained": 0.095612672 + }, + { + "epoch": 0.28595744680851065, + "grad_norm": 2.148329257965088, + "loss": 5.463, + "lr": 0.0009290909090909091, + "step": 1008, + "tokens_trained": 0.095802784 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 1.9580810070037842, + "loss": 5.291, + "lr": 0.0009288111888111889, + "step": 1010, + "tokens_trained": 0.095990776 + }, + { + "epoch": 0.28709219858156027, + "grad_norm": 1.9873988628387451, + "loss": 5.4103, + "lr": 0.0009285314685314685, + "step": 1012, + "tokens_trained": 0.096180648 + }, + { + "epoch": 0.2876595744680851, + "grad_norm": 2.0297746658325195, + "loss": 5.4078, + "lr": 0.0009282517482517483, + "step": 1014, + "tokens_trained": 0.09637224 + }, + { + "epoch": 0.28822695035460993, + "grad_norm": 1.928497076034546, + "loss": 5.3162, + "lr": 0.0009279720279720279, + "step": 1016, + "tokens_trained": 0.096561744 + }, + { + "epoch": 0.28879432624113477, + "grad_norm": 2.1219675540924072, + "loss": 5.4358, + "lr": 0.0009276923076923078, + "step": 1018, + "tokens_trained": 0.096752296 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 2.0021066665649414, + "loss": 5.4232, + "lr": 0.0009274125874125874, + "step": 1020, + "tokens_trained": 0.096943856 + }, + { + "epoch": 0.2899290780141844, + "grad_norm": 1.9920068979263306, + "loss": 5.407, + "lr": 0.0009271328671328671, + "step": 1022, + "tokens_trained": 0.097133632 + }, + { + "epoch": 0.2904964539007092, + "grad_norm": 1.8810361623764038, + "loss": 5.4293, + "lr": 0.0009268531468531469, + "step": 1024, + "tokens_trained": 0.097325976 + }, + { + "epoch": 0.29106382978723405, + "grad_norm": 1.8560134172439575, + "loss": 5.4236, + "lr": 0.0009265734265734266, + "step": 1026, + "tokens_trained": 0.0975142 + }, + { + "epoch": 0.2916312056737589, + "grad_norm": 2.1735010147094727, + "loss": 5.4252, + "lr": 0.0009262937062937064, + "step": 1028, + "tokens_trained": 0.0977042 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.2467288970947266, + "loss": 5.3756, + "lr": 0.000926013986013986, + "step": 1030, + "tokens_trained": 0.097893376 + }, + { + "epoch": 0.2927659574468085, + "grad_norm": 1.9609313011169434, + "loss": 5.4091, + "lr": 0.0009257342657342658, + "step": 1032, + "tokens_trained": 0.0980824 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 2.116384267807007, + "loss": 5.4001, + "lr": 0.0009254545454545454, + "step": 1034, + "tokens_trained": 0.098271304 + }, + { + "epoch": 0.29390070921985817, + "grad_norm": 2.1869800090789795, + "loss": 5.4102, + "lr": 0.0009251748251748252, + "step": 1036, + "tokens_trained": 0.098461528 + }, + { + "epoch": 0.294468085106383, + "grad_norm": 2.2882192134857178, + "loss": 5.4723, + "lr": 0.0009248951048951049, + "step": 1038, + "tokens_trained": 0.09865268 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 2.1590888500213623, + "loss": 5.3523, + "lr": 0.0009246153846153846, + "step": 1040, + "tokens_trained": 0.098842688 + }, + { + "epoch": 0.2956028368794326, + "grad_norm": 2.284207582473755, + "loss": 5.4647, + "lr": 0.0009243356643356644, + "step": 1042, + "tokens_trained": 0.099031544 + }, + { + "epoch": 0.29617021276595745, + "grad_norm": 2.333207845687866, + "loss": 5.4655, + "lr": 0.0009240559440559441, + "step": 1044, + "tokens_trained": 0.09922264 + }, + { + "epoch": 0.2967375886524823, + "grad_norm": 2.357572555541992, + "loss": 5.3909, + "lr": 0.0009237762237762239, + "step": 1046, + "tokens_trained": 0.099411416 + }, + { + "epoch": 0.29730496453900707, + "grad_norm": 1.88053297996521, + "loss": 5.4119, + "lr": 0.0009234965034965035, + "step": 1048, + "tokens_trained": 0.099602112 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.8860585689544678, + "loss": 5.3867, + "lr": 0.0009232167832167832, + "step": 1050, + "tokens_trained": 0.099792552 + }, + { + "epoch": 0.29843971631205674, + "grad_norm": 2.000173568725586, + "loss": 5.3773, + "lr": 0.0009229370629370629, + "step": 1052, + "tokens_trained": 0.099981752 + }, + { + "epoch": 0.29900709219858157, + "grad_norm": 2.015394926071167, + "loss": 5.3936, + "lr": 0.0009226573426573427, + "step": 1054, + "tokens_trained": 0.10017428 + }, + { + "epoch": 0.2995744680851064, + "grad_norm": 2.0050301551818848, + "loss": 5.3653, + "lr": 0.0009223776223776224, + "step": 1056, + "tokens_trained": 0.100364544 + }, + { + "epoch": 0.3001418439716312, + "grad_norm": 1.7397475242614746, + "loss": 5.3224, + "lr": 0.0009220979020979021, + "step": 1058, + "tokens_trained": 0.100555648 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 1.9808533191680908, + "loss": 5.3822, + "lr": 0.0009218181818181819, + "step": 1060, + "tokens_trained": 0.100744968 + }, + { + "epoch": 0.30127659574468085, + "grad_norm": 2.034329652786255, + "loss": 5.3961, + "lr": 0.0009215384615384616, + "step": 1062, + "tokens_trained": 0.100934176 + }, + { + "epoch": 0.3018439716312057, + "grad_norm": 2.1286778450012207, + "loss": 5.4061, + "lr": 0.0009212587412587413, + "step": 1064, + "tokens_trained": 0.101125216 + }, + { + "epoch": 0.3024113475177305, + "grad_norm": 2.131822347640991, + "loss": 5.3675, + "lr": 0.000920979020979021, + "step": 1066, + "tokens_trained": 0.101314504 + }, + { + "epoch": 0.3029787234042553, + "grad_norm": 2.162069320678711, + "loss": 5.4552, + "lr": 0.0009206993006993007, + "step": 1068, + "tokens_trained": 0.101503352 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 2.5730931758880615, + "loss": 5.3978, + "lr": 0.0009204195804195804, + "step": 1070, + "tokens_trained": 0.101691504 + }, + { + "epoch": 0.30411347517730497, + "grad_norm": 2.2053022384643555, + "loss": 5.3604, + "lr": 0.0009201398601398602, + "step": 1072, + "tokens_trained": 0.101883072 + }, + { + "epoch": 0.3046808510638298, + "grad_norm": 2.1578407287597656, + "loss": 5.4236, + "lr": 0.0009198601398601398, + "step": 1074, + "tokens_trained": 0.102075832 + }, + { + "epoch": 0.3052482269503546, + "grad_norm": 2.0061423778533936, + "loss": 5.3882, + "lr": 0.0009195804195804196, + "step": 1076, + "tokens_trained": 0.102266768 + }, + { + "epoch": 0.3058156028368794, + "grad_norm": 1.8915576934814453, + "loss": 5.3539, + "lr": 0.0009193006993006993, + "step": 1078, + "tokens_trained": 0.102457096 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 2.15053129196167, + "loss": 5.4222, + "lr": 0.0009190209790209791, + "step": 1080, + "tokens_trained": 0.102647544 + }, + { + "epoch": 0.3069503546099291, + "grad_norm": 2.241217851638794, + "loss": 5.4275, + "lr": 0.0009187412587412588, + "step": 1082, + "tokens_trained": 0.10283904 + }, + { + "epoch": 0.3075177304964539, + "grad_norm": 2.37854266166687, + "loss": 5.419, + "lr": 0.0009184615384615385, + "step": 1084, + "tokens_trained": 0.103028464 + }, + { + "epoch": 0.3080851063829787, + "grad_norm": 2.00118350982666, + "loss": 5.4225, + "lr": 0.0009181818181818182, + "step": 1086, + "tokens_trained": 0.10321896 + }, + { + "epoch": 0.30865248226950354, + "grad_norm": 2.2643723487854004, + "loss": 5.4487, + "lr": 0.0009179020979020978, + "step": 1088, + "tokens_trained": 0.103409256 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.4618585109710693, + "loss": 5.4211, + "lr": 0.0009176223776223777, + "step": 1090, + "tokens_trained": 0.103597272 + }, + { + "epoch": 0.3097872340425532, + "grad_norm": 2.141491174697876, + "loss": 5.3758, + "lr": 0.0009173426573426573, + "step": 1092, + "tokens_trained": 0.103786128 + }, + { + "epoch": 0.31035460992907804, + "grad_norm": 1.9777475595474243, + "loss": 5.4129, + "lr": 0.0009170629370629371, + "step": 1094, + "tokens_trained": 0.103974864 + }, + { + "epoch": 0.3109219858156028, + "grad_norm": 1.9153270721435547, + "loss": 5.3912, + "lr": 0.0009167832167832168, + "step": 1096, + "tokens_trained": 0.104163864 + }, + { + "epoch": 0.31148936170212765, + "grad_norm": 2.172558546066284, + "loss": 5.3425, + "lr": 0.0009165034965034966, + "step": 1098, + "tokens_trained": 0.104353136 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.049896717071533, + "loss": 5.3732, + "lr": 0.0009162237762237763, + "step": 1100, + "tokens_trained": 0.10454476 + }, + { + "epoch": 0.3126241134751773, + "grad_norm": 1.9415545463562012, + "loss": 5.3873, + "lr": 0.0009159440559440559, + "step": 1102, + "tokens_trained": 0.104734296 + }, + { + "epoch": 0.3131914893617021, + "grad_norm": 1.7280856370925903, + "loss": 5.3857, + "lr": 0.0009156643356643357, + "step": 1104, + "tokens_trained": 0.104925648 + }, + { + "epoch": 0.31375886524822694, + "grad_norm": 1.9120069742202759, + "loss": 5.3216, + "lr": 0.0009153846153846153, + "step": 1106, + "tokens_trained": 0.105115776 + }, + { + "epoch": 0.31432624113475177, + "grad_norm": 2.007101058959961, + "loss": 5.4074, + "lr": 0.0009151048951048952, + "step": 1108, + "tokens_trained": 0.105305656 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 1.9159268140792847, + "loss": 5.3625, + "lr": 0.0009148251748251748, + "step": 1110, + "tokens_trained": 0.105494632 + }, + { + "epoch": 0.31546099290780144, + "grad_norm": 1.9235239028930664, + "loss": 5.3362, + "lr": 0.0009145454545454546, + "step": 1112, + "tokens_trained": 0.105683536 + }, + { + "epoch": 0.3160283687943262, + "grad_norm": 1.8954299688339233, + "loss": 5.3531, + "lr": 0.0009142657342657343, + "step": 1114, + "tokens_trained": 0.105873176 + }, + { + "epoch": 0.31659574468085105, + "grad_norm": 2.026578426361084, + "loss": 5.408, + "lr": 0.000913986013986014, + "step": 1116, + "tokens_trained": 0.10606276 + }, + { + "epoch": 0.3171631205673759, + "grad_norm": 1.9014806747436523, + "loss": 5.363, + "lr": 0.0009137062937062938, + "step": 1118, + "tokens_trained": 0.106254616 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 1.849649429321289, + "loss": 5.3811, + "lr": 0.0009134265734265734, + "step": 1120, + "tokens_trained": 0.106445376 + }, + { + "epoch": 0.31829787234042556, + "grad_norm": 1.7405186891555786, + "loss": 5.3504, + "lr": 0.0009131468531468532, + "step": 1122, + "tokens_trained": 0.106636072 + }, + { + "epoch": 0.31886524822695034, + "grad_norm": 1.867285966873169, + "loss": 5.3675, + "lr": 0.0009128671328671328, + "step": 1124, + "tokens_trained": 0.106827896 + }, + { + "epoch": 0.3191489361702128, + "eval_loss": 5.3796281814575195, + "eval_runtime": 20.7444, + "step": 1125, + "tokens_trained": 0.106922416 + }, + { + "epoch": 0.31943262411347517, + "grad_norm": 1.8044356107711792, + "loss": 5.3717, + "lr": 0.0009125874125874127, + "step": 1126, + "tokens_trained": 0.107016056 + }, + { + "epoch": 0.32, + "grad_norm": 1.6348600387573242, + "loss": 5.4676, + "lr": 0.0009123076923076923, + "step": 1128, + "tokens_trained": 0.107203992 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 1.7802475690841675, + "loss": 5.3304, + "lr": 0.000912027972027972, + "step": 1130, + "tokens_trained": 0.107394224 + }, + { + "epoch": 0.3211347517730496, + "grad_norm": 1.7695430517196655, + "loss": 5.3611, + "lr": 0.0009117482517482518, + "step": 1132, + "tokens_trained": 0.107583464 + }, + { + "epoch": 0.32170212765957445, + "grad_norm": 2.026853322982788, + "loss": 5.363, + "lr": 0.0009114685314685315, + "step": 1134, + "tokens_trained": 0.107776088 + }, + { + "epoch": 0.3222695035460993, + "grad_norm": 1.803208589553833, + "loss": 5.3801, + "lr": 0.0009111888111888113, + "step": 1136, + "tokens_trained": 0.107964416 + }, + { + "epoch": 0.3228368794326241, + "grad_norm": 1.812386155128479, + "loss": 5.3721, + "lr": 0.0009109090909090909, + "step": 1138, + "tokens_trained": 0.108153104 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 1.605839490890503, + "loss": 5.3339, + "lr": 0.0009106293706293707, + "step": 1140, + "tokens_trained": 0.108341408 + }, + { + "epoch": 0.32397163120567374, + "grad_norm": 1.7169313430786133, + "loss": 5.4051, + "lr": 0.0009103496503496503, + "step": 1142, + "tokens_trained": 0.108532312 + }, + { + "epoch": 0.3245390070921986, + "grad_norm": 2.0499444007873535, + "loss": 5.2992, + "lr": 0.0009100699300699301, + "step": 1144, + "tokens_trained": 0.108721864 + }, + { + "epoch": 0.3251063829787234, + "grad_norm": 1.988674521446228, + "loss": 5.3862, + "lr": 0.0009097902097902098, + "step": 1146, + "tokens_trained": 0.108912352 + }, + { + "epoch": 0.32567375886524824, + "grad_norm": 1.8733936548233032, + "loss": 5.3627, + "lr": 0.0009095104895104895, + "step": 1148, + "tokens_trained": 0.109101952 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 1.978704810142517, + "loss": 5.3668, + "lr": 0.0009092307692307692, + "step": 1150, + "tokens_trained": 0.109292712 + }, + { + "epoch": 0.32680851063829786, + "grad_norm": 1.9723341464996338, + "loss": 5.3545, + "lr": 0.000908951048951049, + "step": 1152, + "tokens_trained": 0.109484992 + }, + { + "epoch": 0.3273758865248227, + "grad_norm": 2.165728807449341, + "loss": 5.3731, + "lr": 0.0009086713286713288, + "step": 1154, + "tokens_trained": 0.109674336 + }, + { + "epoch": 0.3279432624113475, + "grad_norm": 1.9241019487380981, + "loss": 5.3456, + "lr": 0.0009083916083916084, + "step": 1156, + "tokens_trained": 0.109863368 + }, + { + "epoch": 0.32851063829787236, + "grad_norm": 1.9442275762557983, + "loss": 5.4065, + "lr": 0.0009081118881118881, + "step": 1158, + "tokens_trained": 0.110051744 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 1.7714000940322876, + "loss": 5.3888, + "lr": 0.0009078321678321678, + "step": 1160, + "tokens_trained": 0.11024344 + }, + { + "epoch": 0.329645390070922, + "grad_norm": 2.043646812438965, + "loss": 5.3835, + "lr": 0.0009075524475524476, + "step": 1162, + "tokens_trained": 0.11043488 + }, + { + "epoch": 0.3302127659574468, + "grad_norm": 1.837196946144104, + "loss": 5.3554, + "lr": 0.0009072727272727273, + "step": 1164, + "tokens_trained": 0.110626104 + }, + { + "epoch": 0.33078014184397164, + "grad_norm": 1.874135971069336, + "loss": 5.3457, + "lr": 0.000906993006993007, + "step": 1166, + "tokens_trained": 0.110814768 + }, + { + "epoch": 0.3313475177304965, + "grad_norm": 1.6493511199951172, + "loss": 5.3118, + "lr": 0.0009067132867132866, + "step": 1168, + "tokens_trained": 0.111004104 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 1.8386362791061401, + "loss": 5.3422, + "lr": 0.0009064335664335665, + "step": 1170, + "tokens_trained": 0.11119544 + }, + { + "epoch": 0.3324822695035461, + "grad_norm": 2.020859718322754, + "loss": 5.3565, + "lr": 0.0009061538461538462, + "step": 1172, + "tokens_trained": 0.111384384 + }, + { + "epoch": 0.3330496453900709, + "grad_norm": 2.049401044845581, + "loss": 5.3358, + "lr": 0.0009058741258741259, + "step": 1174, + "tokens_trained": 0.111573944 + }, + { + "epoch": 0.33361702127659576, + "grad_norm": 1.965345025062561, + "loss": 5.3431, + "lr": 0.0009055944055944056, + "step": 1176, + "tokens_trained": 0.111763504 + }, + { + "epoch": 0.3341843971631206, + "grad_norm": 1.9792066812515259, + "loss": 5.3579, + "lr": 0.0009053146853146853, + "step": 1178, + "tokens_trained": 0.111953664 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 1.7790883779525757, + "loss": 5.3499, + "lr": 0.0009050349650349651, + "step": 1180, + "tokens_trained": 0.11214324 + }, + { + "epoch": 0.3353191489361702, + "grad_norm": 1.6504682302474976, + "loss": 5.3415, + "lr": 0.0009047552447552448, + "step": 1182, + "tokens_trained": 0.112331256 + }, + { + "epoch": 0.33588652482269504, + "grad_norm": 1.9687312841415405, + "loss": 5.3565, + "lr": 0.0009044755244755245, + "step": 1184, + "tokens_trained": 0.11252208 + }, + { + "epoch": 0.3364539007092199, + "grad_norm": 1.7077507972717285, + "loss": 5.3568, + "lr": 0.0009041958041958041, + "step": 1186, + "tokens_trained": 0.112714272 + }, + { + "epoch": 0.33702127659574466, + "grad_norm": 1.6311697959899902, + "loss": 5.3345, + "lr": 0.000903916083916084, + "step": 1188, + "tokens_trained": 0.11290428 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 1.975233793258667, + "loss": 5.4161, + "lr": 0.0009036363636363637, + "step": 1190, + "tokens_trained": 0.113093984 + }, + { + "epoch": 0.3381560283687943, + "grad_norm": 1.7567362785339355, + "loss": 5.3481, + "lr": 0.0009033566433566434, + "step": 1192, + "tokens_trained": 0.113284904 + }, + { + "epoch": 0.33872340425531916, + "grad_norm": 2.121367931365967, + "loss": 5.3729, + "lr": 0.0009030769230769231, + "step": 1194, + "tokens_trained": 0.113477952 + }, + { + "epoch": 0.339290780141844, + "grad_norm": 2.143253803253174, + "loss": 5.3866, + "lr": 0.0009027972027972027, + "step": 1196, + "tokens_trained": 0.11366872 + }, + { + "epoch": 0.3398581560283688, + "grad_norm": 2.1118557453155518, + "loss": 5.3501, + "lr": 0.0009025174825174826, + "step": 1198, + "tokens_trained": 0.113861552 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 1.8132637739181519, + "loss": 5.3325, + "lr": 0.0009022377622377622, + "step": 1200, + "tokens_trained": 0.114051176 + }, + { + "epoch": 0.34099290780141844, + "grad_norm": 1.761227011680603, + "loss": 5.3629, + "lr": 0.000901958041958042, + "step": 1202, + "tokens_trained": 0.114240808 + }, + { + "epoch": 0.3415602836879433, + "grad_norm": 1.8358371257781982, + "loss": 5.3644, + "lr": 0.0009016783216783216, + "step": 1204, + "tokens_trained": 0.114430968 + }, + { + "epoch": 0.3421276595744681, + "grad_norm": 2.0768542289733887, + "loss": 5.3705, + "lr": 0.0009013986013986014, + "step": 1206, + "tokens_trained": 0.114620544 + }, + { + "epoch": 0.3426950354609929, + "grad_norm": 1.6928143501281738, + "loss": 5.2534, + "lr": 0.0009011188811188812, + "step": 1208, + "tokens_trained": 0.114811928 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 1.8634029626846313, + "loss": 5.3679, + "lr": 0.0009008391608391609, + "step": 1210, + "tokens_trained": 0.115002912 + }, + { + "epoch": 0.34382978723404256, + "grad_norm": 1.8048954010009766, + "loss": 5.3049, + "lr": 0.0009005594405594406, + "step": 1212, + "tokens_trained": 0.115192544 + }, + { + "epoch": 0.3443971631205674, + "grad_norm": 1.9170348644256592, + "loss": 5.2457, + "lr": 0.0009002797202797202, + "step": 1214, + "tokens_trained": 0.115383248 + }, + { + "epoch": 0.3449645390070922, + "grad_norm": 1.788751482963562, + "loss": 5.3678, + "lr": 0.0009000000000000001, + "step": 1216, + "tokens_trained": 0.115574304 + }, + { + "epoch": 0.345531914893617, + "grad_norm": 1.9751293659210205, + "loss": 5.3352, + "lr": 0.0008997202797202797, + "step": 1218, + "tokens_trained": 0.115766008 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 1.8202649354934692, + "loss": 5.37, + "lr": 0.0008994405594405595, + "step": 1220, + "tokens_trained": 0.11595804 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.656063199043274, + "loss": 5.3664, + "lr": 0.0008991608391608391, + "step": 1222, + "tokens_trained": 0.116146344 + }, + { + "epoch": 0.3472340425531915, + "grad_norm": 1.7509667873382568, + "loss": 5.3636, + "lr": 0.0008988811188811188, + "step": 1224, + "tokens_trained": 0.116334568 + }, + { + "epoch": 0.3478014184397163, + "grad_norm": 1.8556638956069946, + "loss": 5.3577, + "lr": 0.0008986013986013987, + "step": 1226, + "tokens_trained": 0.116525704 + }, + { + "epoch": 0.3483687943262411, + "grad_norm": 2.026033639907837, + "loss": 5.3657, + "lr": 0.0008983216783216783, + "step": 1228, + "tokens_trained": 0.116716032 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 1.6965924501419067, + "loss": 5.304, + "lr": 0.0008980419580419581, + "step": 1230, + "tokens_trained": 0.116904832 + }, + { + "epoch": 0.3495035460992908, + "grad_norm": 1.8144174814224243, + "loss": 5.3759, + "lr": 0.0008977622377622377, + "step": 1232, + "tokens_trained": 0.117095536 + }, + { + "epoch": 0.3500709219858156, + "grad_norm": 1.7229580879211426, + "loss": 5.3244, + "lr": 0.0008974825174825176, + "step": 1234, + "tokens_trained": 0.117285952 + }, + { + "epoch": 0.3506382978723404, + "grad_norm": 1.722578525543213, + "loss": 5.3442, + "lr": 0.0008972027972027972, + "step": 1236, + "tokens_trained": 0.117477488 + }, + { + "epoch": 0.35120567375886524, + "grad_norm": 1.8006796836853027, + "loss": 5.3624, + "lr": 0.000896923076923077, + "step": 1238, + "tokens_trained": 0.117667352 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 1.7172250747680664, + "loss": 5.3002, + "lr": 0.0008966433566433566, + "step": 1240, + "tokens_trained": 0.117856504 + }, + { + "epoch": 0.3523404255319149, + "grad_norm": 1.8281760215759277, + "loss": 5.3311, + "lr": 0.0008963636363636363, + "step": 1242, + "tokens_trained": 0.11804676 + }, + { + "epoch": 0.3529078014184397, + "grad_norm": 1.7666652202606201, + "loss": 5.3847, + "lr": 0.0008960839160839162, + "step": 1244, + "tokens_trained": 0.118235688 + }, + { + "epoch": 0.3534751773049645, + "grad_norm": 1.7723621129989624, + "loss": 5.3506, + "lr": 0.0008958041958041958, + "step": 1246, + "tokens_trained": 0.11842632 + }, + { + "epoch": 0.35404255319148936, + "grad_norm": 1.7779643535614014, + "loss": 5.3066, + "lr": 0.0008955244755244756, + "step": 1248, + "tokens_trained": 0.118616536 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 1.746245265007019, + "loss": 5.2993, + "lr": 0.0008952447552447552, + "step": 1250, + "tokens_trained": 0.118807672 + }, + { + "epoch": 0.3546099290780142, + "eval_loss": 5.34489107131958, + "eval_runtime": 21.0838, + "step": 1250, + "tokens_trained": 0.118807672 + }, + { + "epoch": 0.35517730496453903, + "grad_norm": 1.8439521789550781, + "loss": 5.3796, + "lr": 0.000894965034965035, + "step": 1252, + "tokens_trained": 0.118996672 + }, + { + "epoch": 0.3557446808510638, + "grad_norm": 1.7830157279968262, + "loss": 5.3435, + "lr": 0.0008946853146853147, + "step": 1254, + "tokens_trained": 0.119189544 + }, + { + "epoch": 0.35631205673758864, + "grad_norm": 1.6022379398345947, + "loss": 5.3772, + "lr": 0.0008944055944055944, + "step": 1256, + "tokens_trained": 0.119379312 + }, + { + "epoch": 0.3568794326241135, + "grad_norm": 1.6100343465805054, + "loss": 5.3411, + "lr": 0.0008941258741258741, + "step": 1258, + "tokens_trained": 0.119572072 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 1.7826210260391235, + "loss": 5.317, + "lr": 0.0008938461538461538, + "step": 1260, + "tokens_trained": 0.119761304 + }, + { + "epoch": 0.3580141843971631, + "grad_norm": 1.510432243347168, + "loss": 5.4018, + "lr": 0.0008935664335664337, + "step": 1262, + "tokens_trained": 0.11994984 + }, + { + "epoch": 0.35858156028368793, + "grad_norm": 1.7209227085113525, + "loss": 5.3651, + "lr": 0.0008932867132867133, + "step": 1264, + "tokens_trained": 0.120139368 + }, + { + "epoch": 0.35914893617021276, + "grad_norm": 1.7528654336929321, + "loss": 5.3329, + "lr": 0.000893006993006993, + "step": 1266, + "tokens_trained": 0.1203308 + }, + { + "epoch": 0.3597163120567376, + "grad_norm": 1.8427083492279053, + "loss": 5.3897, + "lr": 0.0008927272727272727, + "step": 1268, + "tokens_trained": 0.12052112 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 1.530527114868164, + "loss": 5.3407, + "lr": 0.0008924475524475525, + "step": 1270, + "tokens_trained": 0.120709456 + }, + { + "epoch": 0.3608510638297872, + "grad_norm": 1.5996145009994507, + "loss": 5.3697, + "lr": 0.0008921678321678322, + "step": 1272, + "tokens_trained": 0.12089976 + }, + { + "epoch": 0.36141843971631205, + "grad_norm": 1.5235425233840942, + "loss": 5.335, + "lr": 0.0008918881118881119, + "step": 1274, + "tokens_trained": 0.121089184 + }, + { + "epoch": 0.3619858156028369, + "grad_norm": 1.757206678390503, + "loss": 5.2983, + "lr": 0.0008916083916083916, + "step": 1276, + "tokens_trained": 0.1212798 + }, + { + "epoch": 0.3625531914893617, + "grad_norm": 1.5952467918395996, + "loss": 5.3593, + "lr": 0.0008913286713286713, + "step": 1278, + "tokens_trained": 0.121472816 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 1.6975666284561157, + "loss": 5.3867, + "lr": 0.0008910489510489512, + "step": 1280, + "tokens_trained": 0.121659944 + }, + { + "epoch": 0.36368794326241133, + "grad_norm": 1.8659151792526245, + "loss": 5.3032, + "lr": 0.0008907692307692308, + "step": 1282, + "tokens_trained": 0.121848552 + }, + { + "epoch": 0.36425531914893616, + "grad_norm": 1.8692409992218018, + "loss": 5.3643, + "lr": 0.0008904895104895105, + "step": 1284, + "tokens_trained": 0.12203916 + }, + { + "epoch": 0.364822695035461, + "grad_norm": 1.786490559577942, + "loss": 5.4001, + "lr": 0.0008902097902097902, + "step": 1286, + "tokens_trained": 0.122228464 + }, + { + "epoch": 0.36539007092198583, + "grad_norm": 1.6635786294937134, + "loss": 5.3158, + "lr": 0.00088993006993007, + "step": 1288, + "tokens_trained": 0.122419768 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 1.8413279056549072, + "loss": 5.315, + "lr": 0.0008896503496503497, + "step": 1290, + "tokens_trained": 0.122608512 + }, + { + "epoch": 0.36652482269503545, + "grad_norm": 1.802370548248291, + "loss": 5.3203, + "lr": 0.0008893706293706294, + "step": 1292, + "tokens_trained": 0.122795944 + }, + { + "epoch": 0.3670921985815603, + "grad_norm": 1.5968035459518433, + "loss": 5.3833, + "lr": 0.000889090909090909, + "step": 1294, + "tokens_trained": 0.1229842 + }, + { + "epoch": 0.3676595744680851, + "grad_norm": 1.8354761600494385, + "loss": 5.3365, + "lr": 0.0008888111888111888, + "step": 1296, + "tokens_trained": 0.123175336 + }, + { + "epoch": 0.36822695035460995, + "grad_norm": 1.925227403640747, + "loss": 5.3687, + "lr": 0.0008885314685314686, + "step": 1298, + "tokens_trained": 0.123366848 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 1.7477060556411743, + "loss": 5.4033, + "lr": 0.0008882517482517483, + "step": 1300, + "tokens_trained": 0.123556656 + }, + { + "epoch": 0.36936170212765956, + "grad_norm": 1.8925527334213257, + "loss": 5.2854, + "lr": 0.000887972027972028, + "step": 1302, + "tokens_trained": 0.12374612 + }, + { + "epoch": 0.3699290780141844, + "grad_norm": 1.8904681205749512, + "loss": 5.2903, + "lr": 0.0008876923076923077, + "step": 1304, + "tokens_trained": 0.123936192 + }, + { + "epoch": 0.37049645390070923, + "grad_norm": 1.9903556108474731, + "loss": 5.2994, + "lr": 0.0008874125874125875, + "step": 1306, + "tokens_trained": 0.124126112 + }, + { + "epoch": 0.37106382978723407, + "grad_norm": 2.014011859893799, + "loss": 5.353, + "lr": 0.0008871328671328671, + "step": 1308, + "tokens_trained": 0.124314592 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 1.9086287021636963, + "loss": 5.3924, + "lr": 0.0008868531468531469, + "step": 1310, + "tokens_trained": 0.124503496 + }, + { + "epoch": 0.3721985815602837, + "grad_norm": 1.8927134275436401, + "loss": 5.3098, + "lr": 0.0008865734265734265, + "step": 1312, + "tokens_trained": 0.124693296 + }, + { + "epoch": 0.3727659574468085, + "grad_norm": 1.850883960723877, + "loss": 5.356, + "lr": 0.0008862937062937063, + "step": 1314, + "tokens_trained": 0.124883528 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.813315510749817, + "loss": 5.3564, + "lr": 0.0008860139860139861, + "step": 1316, + "tokens_trained": 0.125072328 + }, + { + "epoch": 0.37390070921985813, + "grad_norm": 1.6776509284973145, + "loss": 5.3348, + "lr": 0.0008857342657342658, + "step": 1318, + "tokens_trained": 0.125263128 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 1.7775620222091675, + "loss": 5.298, + "lr": 0.0008854545454545455, + "step": 1320, + "tokens_trained": 0.125453944 + }, + { + "epoch": 0.3750354609929078, + "grad_norm": 1.6916086673736572, + "loss": 5.332, + "lr": 0.0008851748251748251, + "step": 1322, + "tokens_trained": 0.125644264 + }, + { + "epoch": 0.37560283687943263, + "grad_norm": 1.7182034254074097, + "loss": 5.3405, + "lr": 0.000884895104895105, + "step": 1324, + "tokens_trained": 0.125835256 + }, + { + "epoch": 0.37617021276595747, + "grad_norm": 1.690463662147522, + "loss": 5.355, + "lr": 0.0008846153846153846, + "step": 1326, + "tokens_trained": 0.126025952 + }, + { + "epoch": 0.37673758865248225, + "grad_norm": 1.7073352336883545, + "loss": 5.3304, + "lr": 0.0008843356643356644, + "step": 1328, + "tokens_trained": 0.126217456 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 1.6633049249649048, + "loss": 5.2724, + "lr": 0.000884055944055944, + "step": 1330, + "tokens_trained": 0.126407688 + }, + { + "epoch": 0.3778723404255319, + "grad_norm": 1.618843913078308, + "loss": 5.2952, + "lr": 0.0008837762237762238, + "step": 1332, + "tokens_trained": 0.126599504 + }, + { + "epoch": 0.37843971631205675, + "grad_norm": 1.7496757507324219, + "loss": 5.2846, + "lr": 0.0008834965034965036, + "step": 1334, + "tokens_trained": 0.126787648 + }, + { + "epoch": 0.3790070921985816, + "grad_norm": 1.7284750938415527, + "loss": 5.3229, + "lr": 0.0008832167832167832, + "step": 1336, + "tokens_trained": 0.126977568 + }, + { + "epoch": 0.37957446808510636, + "grad_norm": 1.55423903465271, + "loss": 5.3112, + "lr": 0.000882937062937063, + "step": 1338, + "tokens_trained": 0.12716944 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 1.5783073902130127, + "loss": 5.3002, + "lr": 0.0008826573426573426, + "step": 1340, + "tokens_trained": 0.127357296 + }, + { + "epoch": 0.38070921985815603, + "grad_norm": 1.6970964670181274, + "loss": 5.3003, + "lr": 0.0008823776223776225, + "step": 1342, + "tokens_trained": 0.127547112 + }, + { + "epoch": 0.38127659574468087, + "grad_norm": 1.8086830377578735, + "loss": 5.3018, + "lr": 0.0008820979020979021, + "step": 1344, + "tokens_trained": 0.12773616 + }, + { + "epoch": 0.38184397163120565, + "grad_norm": 1.6589199304580688, + "loss": 5.2903, + "lr": 0.0008818181818181819, + "step": 1346, + "tokens_trained": 0.127924704 + }, + { + "epoch": 0.3824113475177305, + "grad_norm": 1.6546344757080078, + "loss": 5.2639, + "lr": 0.0008815384615384615, + "step": 1348, + "tokens_trained": 0.128114848 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 1.6867282390594482, + "loss": 5.2713, + "lr": 0.0008812587412587412, + "step": 1350, + "tokens_trained": 0.12830572 + }, + { + "epoch": 0.38354609929078015, + "grad_norm": 1.6336158514022827, + "loss": 5.2688, + "lr": 0.0008809790209790211, + "step": 1352, + "tokens_trained": 0.128497336 + }, + { + "epoch": 0.384113475177305, + "grad_norm": 1.591659665107727, + "loss": 5.3073, + "lr": 0.0008806993006993007, + "step": 1354, + "tokens_trained": 0.128689088 + }, + { + "epoch": 0.38468085106382977, + "grad_norm": 1.6427522897720337, + "loss": 5.2649, + "lr": 0.0008804195804195805, + "step": 1356, + "tokens_trained": 0.128879208 + }, + { + "epoch": 0.3852482269503546, + "grad_norm": 1.693124771118164, + "loss": 5.334, + "lr": 0.0008801398601398601, + "step": 1358, + "tokens_trained": 0.129069376 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 1.6677430868148804, + "loss": 5.3091, + "lr": 0.00087986013986014, + "step": 1360, + "tokens_trained": 0.12925972 + }, + { + "epoch": 0.38638297872340427, + "grad_norm": 1.6829359531402588, + "loss": 5.3529, + "lr": 0.0008795804195804196, + "step": 1362, + "tokens_trained": 0.129449816 + }, + { + "epoch": 0.3869503546099291, + "grad_norm": 1.6984829902648926, + "loss": 5.2832, + "lr": 0.0008793006993006993, + "step": 1364, + "tokens_trained": 0.129638736 + }, + { + "epoch": 0.3875177304964539, + "grad_norm": 1.6351298093795776, + "loss": 5.3654, + "lr": 0.000879020979020979, + "step": 1366, + "tokens_trained": 0.129831512 + }, + { + "epoch": 0.3880851063829787, + "grad_norm": 1.588394045829773, + "loss": 5.3203, + "lr": 0.0008787412587412587, + "step": 1368, + "tokens_trained": 0.130021424 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 1.7608240842819214, + "loss": 5.3387, + "lr": 0.0008784615384615386, + "step": 1370, + "tokens_trained": 0.130211848 + }, + { + "epoch": 0.3892198581560284, + "grad_norm": 1.7742120027542114, + "loss": 5.3054, + "lr": 0.0008781818181818182, + "step": 1372, + "tokens_trained": 0.130400256 + }, + { + "epoch": 0.38978723404255317, + "grad_norm": 1.8751057386398315, + "loss": 5.3569, + "lr": 0.000877902097902098, + "step": 1374, + "tokens_trained": 0.130591616 + }, + { + "epoch": 0.3900709219858156, + "eval_loss": 5.315512180328369, + "eval_runtime": 20.9232, + "step": 1375, + "tokens_trained": 0.130685128 + }, + { + "epoch": 0.390354609929078, + "grad_norm": 1.8666746616363525, + "loss": 5.3088, + "lr": 0.0008776223776223776, + "step": 1376, + "tokens_trained": 0.130781056 + }, + { + "epoch": 0.39092198581560283, + "grad_norm": 1.8694190979003906, + "loss": 5.2391, + "lr": 0.0008773426573426574, + "step": 1378, + "tokens_trained": 0.130971152 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.0663323402404785, + "loss": 5.3497, + "lr": 0.0008770629370629371, + "step": 1380, + "tokens_trained": 0.131163224 + }, + { + "epoch": 0.3920567375886525, + "grad_norm": 1.956207275390625, + "loss": 5.3227, + "lr": 0.0008767832167832168, + "step": 1382, + "tokens_trained": 0.131353832 + }, + { + "epoch": 0.3926241134751773, + "grad_norm": 1.6816498041152954, + "loss": 5.2626, + "lr": 0.0008765034965034965, + "step": 1384, + "tokens_trained": 0.13154472 + }, + { + "epoch": 0.3931914893617021, + "grad_norm": 1.655116319656372, + "loss": 5.3334, + "lr": 0.0008762237762237762, + "step": 1386, + "tokens_trained": 0.131732128 + }, + { + "epoch": 0.39375886524822695, + "grad_norm": 1.6439241170883179, + "loss": 5.3038, + "lr": 0.0008759440559440561, + "step": 1388, + "tokens_trained": 0.131920728 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 1.5000464916229248, + "loss": 5.2798, + "lr": 0.0008756643356643357, + "step": 1390, + "tokens_trained": 0.1321094 + }, + { + "epoch": 0.3948936170212766, + "grad_norm": 1.7129333019256592, + "loss": 5.2918, + "lr": 0.0008753846153846154, + "step": 1392, + "tokens_trained": 0.132299832 + }, + { + "epoch": 0.3954609929078014, + "grad_norm": 1.7489241361618042, + "loss": 5.3551, + "lr": 0.0008751048951048951, + "step": 1394, + "tokens_trained": 0.13249016 + }, + { + "epoch": 0.39602836879432624, + "grad_norm": 1.6597840785980225, + "loss": 5.3718, + "lr": 0.0008748251748251749, + "step": 1396, + "tokens_trained": 0.132680568 + }, + { + "epoch": 0.39659574468085107, + "grad_norm": 1.8800175189971924, + "loss": 5.3578, + "lr": 0.0008745454545454546, + "step": 1398, + "tokens_trained": 0.132871296 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 1.8190884590148926, + "loss": 5.2714, + "lr": 0.0008742657342657343, + "step": 1400, + "tokens_trained": 0.133062288 + }, + { + "epoch": 0.3977304964539007, + "grad_norm": 1.602634310722351, + "loss": 5.2914, + "lr": 0.0008739860139860139, + "step": 1402, + "tokens_trained": 0.133252584 + }, + { + "epoch": 0.3982978723404255, + "grad_norm": 1.7363992929458618, + "loss": 5.3154, + "lr": 0.0008737062937062937, + "step": 1404, + "tokens_trained": 0.133444784 + }, + { + "epoch": 0.39886524822695035, + "grad_norm": 1.7578014135360718, + "loss": 5.3735, + "lr": 0.0008734265734265734, + "step": 1406, + "tokens_trained": 0.133636288 + }, + { + "epoch": 0.3994326241134752, + "grad_norm": 1.8847187757492065, + "loss": 5.3118, + "lr": 0.0008731468531468532, + "step": 1408, + "tokens_trained": 0.133825824 + }, + { + "epoch": 0.4, + "grad_norm": 1.750780701637268, + "loss": 5.3101, + "lr": 0.0008728671328671329, + "step": 1410, + "tokens_trained": 0.134016688 + }, + { + "epoch": 0.4005673758865248, + "grad_norm": 1.6785613298416138, + "loss": 5.2823, + "lr": 0.0008725874125874126, + "step": 1412, + "tokens_trained": 0.134208992 + }, + { + "epoch": 0.40113475177304964, + "grad_norm": 1.7419382333755493, + "loss": 5.2388, + "lr": 0.0008723076923076924, + "step": 1414, + "tokens_trained": 0.134398376 + }, + { + "epoch": 0.40170212765957447, + "grad_norm": 1.6936920881271362, + "loss": 5.2824, + "lr": 0.000872027972027972, + "step": 1416, + "tokens_trained": 0.134589264 + }, + { + "epoch": 0.4022695035460993, + "grad_norm": 1.7408183813095093, + "loss": 5.2692, + "lr": 0.0008717482517482518, + "step": 1418, + "tokens_trained": 0.134776568 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 1.7089916467666626, + "loss": 5.2309, + "lr": 0.0008714685314685314, + "step": 1420, + "tokens_trained": 0.134967616 + }, + { + "epoch": 0.4034042553191489, + "grad_norm": 1.6850922107696533, + "loss": 5.3656, + "lr": 0.0008711888111888112, + "step": 1422, + "tokens_trained": 0.135158272 + }, + { + "epoch": 0.40397163120567375, + "grad_norm": 1.546431064605713, + "loss": 5.3455, + "lr": 0.0008709090909090909, + "step": 1424, + "tokens_trained": 0.135349512 + }, + { + "epoch": 0.4045390070921986, + "grad_norm": 1.3656421899795532, + "loss": 5.2842, + "lr": 0.0008706293706293707, + "step": 1426, + "tokens_trained": 0.135538512 + }, + { + "epoch": 0.4051063829787234, + "grad_norm": 1.5918062925338745, + "loss": 5.3243, + "lr": 0.0008703496503496504, + "step": 1428, + "tokens_trained": 0.13572968 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 1.563009262084961, + "loss": 5.2539, + "lr": 0.00087006993006993, + "step": 1430, + "tokens_trained": 0.135919568 + }, + { + "epoch": 0.40624113475177304, + "grad_norm": 1.6144121885299683, + "loss": 5.2844, + "lr": 0.0008697902097902099, + "step": 1432, + "tokens_trained": 0.136109304 + }, + { + "epoch": 0.40680851063829787, + "grad_norm": 1.5911130905151367, + "loss": 5.3205, + "lr": 0.0008695104895104895, + "step": 1434, + "tokens_trained": 0.136296696 + }, + { + "epoch": 0.4073758865248227, + "grad_norm": 1.60932457447052, + "loss": 5.3783, + "lr": 0.0008692307692307693, + "step": 1436, + "tokens_trained": 0.136484912 + }, + { + "epoch": 0.40794326241134754, + "grad_norm": 1.559644341468811, + "loss": 5.2785, + "lr": 0.0008689510489510489, + "step": 1438, + "tokens_trained": 0.136675736 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 1.5167043209075928, + "loss": 5.3224, + "lr": 0.0008686713286713287, + "step": 1440, + "tokens_trained": 0.136864928 + }, + { + "epoch": 0.40907801418439715, + "grad_norm": 1.5843397378921509, + "loss": 5.3075, + "lr": 0.0008683916083916084, + "step": 1442, + "tokens_trained": 0.137056688 + }, + { + "epoch": 0.409645390070922, + "grad_norm": 1.581120491027832, + "loss": 5.2863, + "lr": 0.0008681118881118881, + "step": 1444, + "tokens_trained": 0.137244664 + }, + { + "epoch": 0.4102127659574468, + "grad_norm": 1.6355490684509277, + "loss": 5.348, + "lr": 0.0008678321678321679, + "step": 1446, + "tokens_trained": 0.13743372 + }, + { + "epoch": 0.41078014184397166, + "grad_norm": 1.5543185472488403, + "loss": 5.3268, + "lr": 0.0008675524475524475, + "step": 1448, + "tokens_trained": 0.13762696 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 1.5313750505447388, + "loss": 5.2784, + "lr": 0.0008672727272727273, + "step": 1450, + "tokens_trained": 0.137817376 + }, + { + "epoch": 0.41191489361702127, + "grad_norm": 1.7918111085891724, + "loss": 5.3063, + "lr": 0.000866993006993007, + "step": 1452, + "tokens_trained": 0.138007944 + }, + { + "epoch": 0.4124822695035461, + "grad_norm": 1.5105966329574585, + "loss": 5.2432, + "lr": 0.0008667132867132868, + "step": 1454, + "tokens_trained": 0.138199776 + }, + { + "epoch": 0.41304964539007094, + "grad_norm": 1.4441865682601929, + "loss": 5.269, + "lr": 0.0008664335664335664, + "step": 1456, + "tokens_trained": 0.13839124 + }, + { + "epoch": 0.4136170212765957, + "grad_norm": 1.473544955253601, + "loss": 5.2377, + "lr": 0.0008661538461538461, + "step": 1458, + "tokens_trained": 0.138580704 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 1.6085572242736816, + "loss": 5.245, + "lr": 0.0008658741258741259, + "step": 1460, + "tokens_trained": 0.138770176 + }, + { + "epoch": 0.4147517730496454, + "grad_norm": 1.609894871711731, + "loss": 5.3124, + "lr": 0.0008655944055944056, + "step": 1462, + "tokens_trained": 0.138961656 + }, + { + "epoch": 0.4153191489361702, + "grad_norm": 1.6923688650131226, + "loss": 5.3099, + "lr": 0.0008653146853146854, + "step": 1464, + "tokens_trained": 0.139151128 + }, + { + "epoch": 0.41588652482269506, + "grad_norm": 1.7480796575546265, + "loss": 5.2608, + "lr": 0.000865034965034965, + "step": 1466, + "tokens_trained": 0.139341168 + }, + { + "epoch": 0.41645390070921984, + "grad_norm": 1.725832223892212, + "loss": 5.2863, + "lr": 0.0008647552447552448, + "step": 1468, + "tokens_trained": 0.139530448 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 1.7886406183242798, + "loss": 5.231, + "lr": 0.0008644755244755245, + "step": 1470, + "tokens_trained": 0.13972244 + }, + { + "epoch": 0.4175886524822695, + "grad_norm": 1.803231954574585, + "loss": 5.2428, + "lr": 0.0008641958041958042, + "step": 1472, + "tokens_trained": 0.139913136 + }, + { + "epoch": 0.41815602836879434, + "grad_norm": 1.5347254276275635, + "loss": 5.2215, + "lr": 0.0008639160839160839, + "step": 1474, + "tokens_trained": 0.140104072 + }, + { + "epoch": 0.4187234042553192, + "grad_norm": 1.4485915899276733, + "loss": 5.2364, + "lr": 0.0008636363636363636, + "step": 1476, + "tokens_trained": 0.140294312 + }, + { + "epoch": 0.41929078014184396, + "grad_norm": 1.6130446195602417, + "loss": 5.3088, + "lr": 0.0008633566433566434, + "step": 1478, + "tokens_trained": 0.140482968 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 1.5839030742645264, + "loss": 5.3215, + "lr": 0.0008630769230769231, + "step": 1480, + "tokens_trained": 0.140674208 + }, + { + "epoch": 0.4204255319148936, + "grad_norm": 1.7519373893737793, + "loss": 5.3331, + "lr": 0.0008627972027972029, + "step": 1482, + "tokens_trained": 0.140864408 + }, + { + "epoch": 0.42099290780141846, + "grad_norm": 1.6718385219573975, + "loss": 5.231, + "lr": 0.0008625174825174825, + "step": 1484, + "tokens_trained": 0.141054696 + }, + { + "epoch": 0.42156028368794324, + "grad_norm": 1.5733797550201416, + "loss": 5.2621, + "lr": 0.0008622377622377622, + "step": 1486, + "tokens_trained": 0.141245712 + }, + { + "epoch": 0.4221276595744681, + "grad_norm": 1.549985647201538, + "loss": 5.2574, + "lr": 0.000861958041958042, + "step": 1488, + "tokens_trained": 0.141434232 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 1.651908278465271, + "loss": 5.2953, + "lr": 0.0008616783216783217, + "step": 1490, + "tokens_trained": 0.141623936 + }, + { + "epoch": 0.42326241134751774, + "grad_norm": 1.5680350065231323, + "loss": 5.288, + "lr": 0.0008613986013986014, + "step": 1492, + "tokens_trained": 0.141813904 + }, + { + "epoch": 0.4238297872340426, + "grad_norm": 1.5155646800994873, + "loss": 5.2529, + "lr": 0.0008611188811188811, + "step": 1494, + "tokens_trained": 0.14200372 + }, + { + "epoch": 0.42439716312056736, + "grad_norm": 1.5949562788009644, + "loss": 5.3064, + "lr": 0.0008608391608391609, + "step": 1496, + "tokens_trained": 0.142194496 + }, + { + "epoch": 0.4249645390070922, + "grad_norm": 1.6359357833862305, + "loss": 5.3452, + "lr": 0.0008605594405594406, + "step": 1498, + "tokens_trained": 0.142384592 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 1.648120403289795, + "loss": 5.3427, + "lr": 0.0008602797202797203, + "step": 1500, + "tokens_trained": 0.142573368 + }, + { + "epoch": 0.425531914893617, + "eval_loss": 5.282389163970947, + "eval_runtime": 20.5657, + "step": 1500, + "tokens_trained": 0.142573368 + }, + { + "epoch": 0.42609929078014186, + "grad_norm": 1.6313989162445068, + "loss": 5.2442, + "lr": 0.00086, + "step": 1502, + "tokens_trained": 0.142764584 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 1.447824239730835, + "loss": 5.2979, + "lr": 0.0008597202797202797, + "step": 1504, + "tokens_trained": 0.142953912 + }, + { + "epoch": 0.4272340425531915, + "grad_norm": 1.4285600185394287, + "loss": 5.317, + "lr": 0.0008594405594405595, + "step": 1506, + "tokens_trained": 0.143145944 + }, + { + "epoch": 0.4278014184397163, + "grad_norm": 1.4464077949523926, + "loss": 5.2746, + "lr": 0.0008591608391608392, + "step": 1508, + "tokens_trained": 0.1433374 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 1.3554625511169434, + "loss": 5.276, + "lr": 0.0008588811188811188, + "step": 1510, + "tokens_trained": 0.143529088 + }, + { + "epoch": 0.428936170212766, + "grad_norm": 1.4690148830413818, + "loss": 5.2976, + "lr": 0.0008586013986013986, + "step": 1512, + "tokens_trained": 0.1437192 + }, + { + "epoch": 0.42950354609929076, + "grad_norm": 1.4911222457885742, + "loss": 5.2727, + "lr": 0.0008583216783216783, + "step": 1514, + "tokens_trained": 0.143907728 + }, + { + "epoch": 0.4300709219858156, + "grad_norm": 1.5823880434036255, + "loss": 5.2481, + "lr": 0.0008580419580419581, + "step": 1516, + "tokens_trained": 0.144097048 + }, + { + "epoch": 0.4306382978723404, + "grad_norm": 1.486588716506958, + "loss": 5.2561, + "lr": 0.0008577622377622378, + "step": 1518, + "tokens_trained": 0.14428652 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 1.5762882232666016, + "loss": 5.267, + "lr": 0.0008574825174825175, + "step": 1520, + "tokens_trained": 0.144476848 + }, + { + "epoch": 0.4317730496453901, + "grad_norm": 1.6832828521728516, + "loss": 5.3329, + "lr": 0.0008572027972027972, + "step": 1522, + "tokens_trained": 0.144667568 + }, + { + "epoch": 0.4323404255319149, + "grad_norm": 1.7036137580871582, + "loss": 5.2326, + "lr": 0.000856923076923077, + "step": 1524, + "tokens_trained": 0.144860328 + }, + { + "epoch": 0.4329078014184397, + "grad_norm": 1.8102291822433472, + "loss": 5.251, + "lr": 0.0008566433566433567, + "step": 1526, + "tokens_trained": 0.1450528 + }, + { + "epoch": 0.43347517730496454, + "grad_norm": 1.667229413986206, + "loss": 5.2841, + "lr": 0.0008563636363636363, + "step": 1528, + "tokens_trained": 0.145240952 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 1.6709800958633423, + "loss": 5.2387, + "lr": 0.0008560839160839161, + "step": 1530, + "tokens_trained": 0.145431376 + }, + { + "epoch": 0.4346099290780142, + "grad_norm": 1.600885272026062, + "loss": 5.2179, + "lr": 0.0008558041958041958, + "step": 1532, + "tokens_trained": 0.145620184 + }, + { + "epoch": 0.435177304964539, + "grad_norm": 1.5783873796463013, + "loss": 5.2432, + "lr": 0.0008555244755244756, + "step": 1534, + "tokens_trained": 0.145810616 + }, + { + "epoch": 0.4357446808510638, + "grad_norm": 1.5059685707092285, + "loss": 5.2604, + "lr": 0.0008552447552447553, + "step": 1536, + "tokens_trained": 0.14600232 + }, + { + "epoch": 0.43631205673758866, + "grad_norm": 1.5880341529846191, + "loss": 5.249, + "lr": 0.000854965034965035, + "step": 1538, + "tokens_trained": 0.146192504 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 1.430004596710205, + "loss": 5.2668, + "lr": 0.0008546853146853147, + "step": 1540, + "tokens_trained": 0.146382264 + }, + { + "epoch": 0.4374468085106383, + "grad_norm": 1.4099256992340088, + "loss": 5.2839, + "lr": 0.0008544055944055944, + "step": 1542, + "tokens_trained": 0.146570432 + }, + { + "epoch": 0.4380141843971631, + "grad_norm": 1.3938827514648438, + "loss": 5.2534, + "lr": 0.0008541258741258742, + "step": 1544, + "tokens_trained": 0.146763736 + }, + { + "epoch": 0.43858156028368794, + "grad_norm": 1.4359923601150513, + "loss": 5.2202, + "lr": 0.0008538461538461538, + "step": 1546, + "tokens_trained": 0.146953944 + }, + { + "epoch": 0.4391489361702128, + "grad_norm": 1.5405043363571167, + "loss": 5.2613, + "lr": 0.0008535664335664336, + "step": 1548, + "tokens_trained": 0.147144664 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 1.6448051929473877, + "loss": 5.299, + "lr": 0.0008532867132867133, + "step": 1550, + "tokens_trained": 0.147335064 + }, + { + "epoch": 0.4402836879432624, + "grad_norm": 1.6528949737548828, + "loss": 5.3004, + "lr": 0.000853006993006993, + "step": 1552, + "tokens_trained": 0.147524088 + }, + { + "epoch": 0.4408510638297872, + "grad_norm": 1.637702226638794, + "loss": 5.2298, + "lr": 0.0008527272727272728, + "step": 1554, + "tokens_trained": 0.147716296 + }, + { + "epoch": 0.44141843971631206, + "grad_norm": 1.7230212688446045, + "loss": 5.2806, + "lr": 0.0008524475524475524, + "step": 1556, + "tokens_trained": 0.147905216 + }, + { + "epoch": 0.4419858156028369, + "grad_norm": 1.6216089725494385, + "loss": 5.3062, + "lr": 0.0008521678321678322, + "step": 1558, + "tokens_trained": 0.148092312 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 1.5734955072402954, + "loss": 5.2607, + "lr": 0.0008518881118881119, + "step": 1560, + "tokens_trained": 0.148282712 + }, + { + "epoch": 0.4431205673758865, + "grad_norm": 1.6687103509902954, + "loss": 5.2737, + "lr": 0.0008516083916083917, + "step": 1562, + "tokens_trained": 0.148474672 + }, + { + "epoch": 0.44368794326241134, + "grad_norm": 1.547277569770813, + "loss": 5.3183, + "lr": 0.0008513286713286713, + "step": 1564, + "tokens_trained": 0.148667824 + }, + { + "epoch": 0.4442553191489362, + "grad_norm": 1.3782074451446533, + "loss": 5.266, + "lr": 0.000851048951048951, + "step": 1566, + "tokens_trained": 0.14885704 + }, + { + "epoch": 0.444822695035461, + "grad_norm": 1.5648273229599, + "loss": 5.2954, + "lr": 0.0008507692307692308, + "step": 1568, + "tokens_trained": 0.14904804 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 1.5675908327102661, + "loss": 5.2897, + "lr": 0.0008504895104895105, + "step": 1570, + "tokens_trained": 0.149237048 + }, + { + "epoch": 0.4459574468085106, + "grad_norm": 1.5399287939071655, + "loss": 5.2993, + "lr": 0.0008502097902097903, + "step": 1572, + "tokens_trained": 0.149427328 + }, + { + "epoch": 0.44652482269503546, + "grad_norm": 1.7170253992080688, + "loss": 5.2756, + "lr": 0.0008499300699300699, + "step": 1574, + "tokens_trained": 0.149618448 + }, + { + "epoch": 0.4470921985815603, + "grad_norm": 1.5694142580032349, + "loss": 5.2294, + "lr": 0.0008496503496503497, + "step": 1576, + "tokens_trained": 0.149809416 + }, + { + "epoch": 0.44765957446808513, + "grad_norm": 1.5410487651824951, + "loss": 5.2392, + "lr": 0.0008493706293706294, + "step": 1578, + "tokens_trained": 0.149999608 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 1.5991896390914917, + "loss": 5.2569, + "lr": 0.0008490909090909091, + "step": 1580, + "tokens_trained": 0.150190224 + }, + { + "epoch": 0.44879432624113474, + "grad_norm": 1.5861775875091553, + "loss": 5.3151, + "lr": 0.0008488111888111888, + "step": 1582, + "tokens_trained": 0.150380592 + }, + { + "epoch": 0.4493617021276596, + "grad_norm": 1.530462622642517, + "loss": 5.3242, + "lr": 0.0008485314685314685, + "step": 1584, + "tokens_trained": 0.15056992 + }, + { + "epoch": 0.4499290780141844, + "grad_norm": 1.5658655166625977, + "loss": 5.2933, + "lr": 0.0008482517482517483, + "step": 1586, + "tokens_trained": 0.150760336 + }, + { + "epoch": 0.4504964539007092, + "grad_norm": 1.4187430143356323, + "loss": 5.2235, + "lr": 0.000847972027972028, + "step": 1588, + "tokens_trained": 0.150949088 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 1.6921541690826416, + "loss": 5.2496, + "lr": 0.0008476923076923078, + "step": 1590, + "tokens_trained": 0.151140016 + }, + { + "epoch": 0.45163120567375886, + "grad_norm": 1.6049220561981201, + "loss": 5.2767, + "lr": 0.0008474125874125874, + "step": 1592, + "tokens_trained": 0.151330944 + }, + { + "epoch": 0.4521985815602837, + "grad_norm": 1.513168454170227, + "loss": 5.2904, + "lr": 0.0008471328671328671, + "step": 1594, + "tokens_trained": 0.151520152 + }, + { + "epoch": 0.45276595744680853, + "grad_norm": 1.5247087478637695, + "loss": 5.2391, + "lr": 0.0008468531468531469, + "step": 1596, + "tokens_trained": 0.151711592 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 1.5005898475646973, + "loss": 5.3025, + "lr": 0.0008465734265734266, + "step": 1598, + "tokens_trained": 0.151902736 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 1.3196156024932861, + "loss": 5.3025, + "lr": 0.0008462937062937063, + "step": 1600, + "tokens_trained": 0.152094032 + }, + { + "epoch": 0.454468085106383, + "grad_norm": 1.5037102699279785, + "loss": 5.2348, + "lr": 0.000846013986013986, + "step": 1602, + "tokens_trained": 0.15228336 + }, + { + "epoch": 0.4550354609929078, + "grad_norm": 1.404539942741394, + "loss": 5.2551, + "lr": 0.0008457342657342658, + "step": 1604, + "tokens_trained": 0.152474776 + }, + { + "epoch": 0.45560283687943265, + "grad_norm": 1.4784883260726929, + "loss": 5.2927, + "lr": 0.0008454545454545455, + "step": 1606, + "tokens_trained": 0.152663392 + }, + { + "epoch": 0.45617021276595743, + "grad_norm": 1.3743332624435425, + "loss": 5.2542, + "lr": 0.0008451748251748252, + "step": 1608, + "tokens_trained": 0.152852512 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 1.4161995649337769, + "loss": 5.2518, + "lr": 0.0008448951048951049, + "step": 1610, + "tokens_trained": 0.15304428 + }, + { + "epoch": 0.4573049645390071, + "grad_norm": 1.5045989751815796, + "loss": 5.2735, + "lr": 0.0008446153846153846, + "step": 1612, + "tokens_trained": 0.153234632 + }, + { + "epoch": 0.45787234042553193, + "grad_norm": 1.3695783615112305, + "loss": 5.2294, + "lr": 0.0008443356643356644, + "step": 1614, + "tokens_trained": 0.1534248 + }, + { + "epoch": 0.4584397163120567, + "grad_norm": 1.4551646709442139, + "loss": 5.2639, + "lr": 0.0008440559440559441, + "step": 1616, + "tokens_trained": 0.153614944 + }, + { + "epoch": 0.45900709219858155, + "grad_norm": 1.5018376111984253, + "loss": 5.2989, + "lr": 0.0008437762237762238, + "step": 1618, + "tokens_trained": 0.153803784 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 1.5295960903167725, + "loss": 5.33, + "lr": 0.0008434965034965035, + "step": 1620, + "tokens_trained": 0.153993752 + }, + { + "epoch": 0.4601418439716312, + "grad_norm": 1.417626142501831, + "loss": 5.2134, + "lr": 0.0008432167832167832, + "step": 1622, + "tokens_trained": 0.154184448 + }, + { + "epoch": 0.46070921985815605, + "grad_norm": 1.5715348720550537, + "loss": 5.2782, + "lr": 0.000842937062937063, + "step": 1624, + "tokens_trained": 0.154373632 + }, + { + "epoch": 0.46099290780141844, + "eval_loss": 5.266384601593018, + "eval_runtime": 21.0916, + "step": 1625, + "tokens_trained": 0.154468808 + }, + { + "epoch": 0.46127659574468083, + "grad_norm": 1.5504534244537354, + "loss": 5.2307, + "lr": 0.0008426573426573427, + "step": 1626, + "tokens_trained": 0.154564864 + }, + { + "epoch": 0.46184397163120566, + "grad_norm": 1.483108401298523, + "loss": 5.2578, + "lr": 0.0008423776223776224, + "step": 1628, + "tokens_trained": 0.154755312 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 1.5631264448165894, + "loss": 5.3291, + "lr": 0.0008420979020979021, + "step": 1630, + "tokens_trained": 0.154943736 + }, + { + "epoch": 0.46297872340425533, + "grad_norm": 1.4680705070495605, + "loss": 5.2256, + "lr": 0.0008418181818181819, + "step": 1632, + "tokens_trained": 0.15513452 + }, + { + "epoch": 0.46354609929078017, + "grad_norm": 1.468338966369629, + "loss": 5.2712, + "lr": 0.0008415384615384616, + "step": 1634, + "tokens_trained": 0.155325288 + }, + { + "epoch": 0.46411347517730495, + "grad_norm": 1.4557780027389526, + "loss": 5.2808, + "lr": 0.0008412587412587412, + "step": 1636, + "tokens_trained": 0.155515328 + }, + { + "epoch": 0.4646808510638298, + "grad_norm": 1.4534999132156372, + "loss": 5.2707, + "lr": 0.000840979020979021, + "step": 1638, + "tokens_trained": 0.155706752 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 1.4011393785476685, + "loss": 5.3028, + "lr": 0.0008406993006993006, + "step": 1640, + "tokens_trained": 0.155895336 + }, + { + "epoch": 0.46581560283687945, + "grad_norm": 1.307922601699829, + "loss": 5.2188, + "lr": 0.0008404195804195805, + "step": 1642, + "tokens_trained": 0.156085936 + }, + { + "epoch": 0.46638297872340423, + "grad_norm": 1.359922170639038, + "loss": 5.2863, + "lr": 0.0008401398601398602, + "step": 1644, + "tokens_trained": 0.15627636 + }, + { + "epoch": 0.46695035460992906, + "grad_norm": 1.6204577684402466, + "loss": 5.2877, + "lr": 0.0008398601398601399, + "step": 1646, + "tokens_trained": 0.156465192 + }, + { + "epoch": 0.4675177304964539, + "grad_norm": 1.7367322444915771, + "loss": 5.2501, + "lr": 0.0008395804195804196, + "step": 1648, + "tokens_trained": 0.15665336 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 1.7013088464736938, + "loss": 5.2522, + "lr": 0.0008393006993006993, + "step": 1650, + "tokens_trained": 0.156843128 + }, + { + "epoch": 0.46865248226950357, + "grad_norm": 1.6429578065872192, + "loss": 5.2978, + "lr": 0.0008390209790209791, + "step": 1652, + "tokens_trained": 0.157034328 + }, + { + "epoch": 0.46921985815602835, + "grad_norm": 1.527243733406067, + "loss": 5.2384, + "lr": 0.0008387412587412587, + "step": 1654, + "tokens_trained": 0.157222784 + }, + { + "epoch": 0.4697872340425532, + "grad_norm": 1.4792861938476562, + "loss": 5.2149, + "lr": 0.0008384615384615385, + "step": 1656, + "tokens_trained": 0.15741308 + }, + { + "epoch": 0.470354609929078, + "grad_norm": 1.4050098657608032, + "loss": 5.229, + "lr": 0.0008381818181818181, + "step": 1658, + "tokens_trained": 0.157603872 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 1.4799182415008545, + "loss": 5.2235, + "lr": 0.000837902097902098, + "step": 1660, + "tokens_trained": 0.157793352 + }, + { + "epoch": 0.4714893617021277, + "grad_norm": 1.4031378030776978, + "loss": 5.23, + "lr": 0.0008376223776223776, + "step": 1662, + "tokens_trained": 0.157984416 + }, + { + "epoch": 0.47205673758865246, + "grad_norm": 1.5775604248046875, + "loss": 5.2811, + "lr": 0.0008373426573426573, + "step": 1664, + "tokens_trained": 0.158176048 + }, + { + "epoch": 0.4726241134751773, + "grad_norm": 1.4855432510375977, + "loss": 5.2363, + "lr": 0.0008370629370629371, + "step": 1666, + "tokens_trained": 0.158368152 + }, + { + "epoch": 0.47319148936170213, + "grad_norm": 1.5609453916549683, + "loss": 5.2984, + "lr": 0.0008367832167832168, + "step": 1668, + "tokens_trained": 0.15855684 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 1.5052629709243774, + "loss": 5.213, + "lr": 0.0008365034965034966, + "step": 1670, + "tokens_trained": 0.15874712 + }, + { + "epoch": 0.47432624113475175, + "grad_norm": 1.5655242204666138, + "loss": 5.2551, + "lr": 0.0008362237762237762, + "step": 1672, + "tokens_trained": 0.158937104 + }, + { + "epoch": 0.4748936170212766, + "grad_norm": 1.301142930984497, + "loss": 5.1564, + "lr": 0.000835944055944056, + "step": 1674, + "tokens_trained": 0.159128096 + }, + { + "epoch": 0.4754609929078014, + "grad_norm": 1.5447527170181274, + "loss": 5.2547, + "lr": 0.0008356643356643356, + "step": 1676, + "tokens_trained": 0.159318968 + }, + { + "epoch": 0.47602836879432625, + "grad_norm": 1.638100266456604, + "loss": 5.2301, + "lr": 0.0008353846153846154, + "step": 1678, + "tokens_trained": 0.159508648 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 1.6203068494796753, + "loss": 5.2644, + "lr": 0.0008351048951048951, + "step": 1680, + "tokens_trained": 0.159698648 + }, + { + "epoch": 0.47716312056737586, + "grad_norm": 1.4097110033035278, + "loss": 5.2047, + "lr": 0.0008348251748251748, + "step": 1682, + "tokens_trained": 0.159887392 + }, + { + "epoch": 0.4777304964539007, + "grad_norm": 1.3377385139465332, + "loss": 5.2685, + "lr": 0.0008345454545454546, + "step": 1684, + "tokens_trained": 0.160076904 + }, + { + "epoch": 0.47829787234042553, + "grad_norm": 1.4079371690750122, + "loss": 5.2842, + "lr": 0.0008342657342657343, + "step": 1686, + "tokens_trained": 0.160266712 + }, + { + "epoch": 0.47886524822695037, + "grad_norm": 1.6039987802505493, + "loss": 5.2248, + "lr": 0.0008339860139860141, + "step": 1688, + "tokens_trained": 0.160455464 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 1.639218807220459, + "loss": 5.2007, + "lr": 0.0008337062937062937, + "step": 1690, + "tokens_trained": 0.16064472 + }, + { + "epoch": 0.48, + "grad_norm": 1.8226710557937622, + "loss": 5.2427, + "lr": 0.0008334265734265734, + "step": 1692, + "tokens_trained": 0.160835192 + }, + { + "epoch": 0.4805673758865248, + "grad_norm": 1.6480419635772705, + "loss": 5.1944, + "lr": 0.0008331468531468531, + "step": 1694, + "tokens_trained": 0.161025272 + }, + { + "epoch": 0.48113475177304965, + "grad_norm": 1.666717290878296, + "loss": 5.2879, + "lr": 0.0008328671328671329, + "step": 1696, + "tokens_trained": 0.161214016 + }, + { + "epoch": 0.4817021276595745, + "grad_norm": 1.5092660188674927, + "loss": 5.2612, + "lr": 0.0008325874125874126, + "step": 1698, + "tokens_trained": 0.161405448 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 1.4042121171951294, + "loss": 5.2373, + "lr": 0.0008323076923076923, + "step": 1700, + "tokens_trained": 0.161595896 + }, + { + "epoch": 0.4828368794326241, + "grad_norm": 1.4937382936477661, + "loss": 5.2172, + "lr": 0.000832027972027972, + "step": 1702, + "tokens_trained": 0.161783904 + }, + { + "epoch": 0.48340425531914893, + "grad_norm": 1.4652959108352661, + "loss": 5.2704, + "lr": 0.0008317482517482518, + "step": 1704, + "tokens_trained": 0.161975888 + }, + { + "epoch": 0.48397163120567377, + "grad_norm": 1.3021745681762695, + "loss": 5.2672, + "lr": 0.0008314685314685315, + "step": 1706, + "tokens_trained": 0.162165808 + }, + { + "epoch": 0.4845390070921986, + "grad_norm": 1.3580701351165771, + "loss": 5.2467, + "lr": 0.0008311888111888112, + "step": 1708, + "tokens_trained": 0.162355152 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 1.480072259902954, + "loss": 5.2797, + "lr": 0.0008309090909090909, + "step": 1710, + "tokens_trained": 0.162544744 + }, + { + "epoch": 0.4856737588652482, + "grad_norm": 1.3532829284667969, + "loss": 5.2556, + "lr": 0.0008306293706293706, + "step": 1712, + "tokens_trained": 0.162734976 + }, + { + "epoch": 0.48624113475177305, + "grad_norm": 1.240332007408142, + "loss": 5.2153, + "lr": 0.0008303496503496504, + "step": 1714, + "tokens_trained": 0.162924992 + }, + { + "epoch": 0.4868085106382979, + "grad_norm": 1.4141086339950562, + "loss": 5.2056, + "lr": 0.00083006993006993, + "step": 1716, + "tokens_trained": 0.163114008 + }, + { + "epoch": 0.4873758865248227, + "grad_norm": 1.321721076965332, + "loss": 5.2223, + "lr": 0.0008297902097902098, + "step": 1718, + "tokens_trained": 0.163304416 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 1.5437248945236206, + "loss": 5.2727, + "lr": 0.0008295104895104895, + "step": 1720, + "tokens_trained": 0.163493816 + }, + { + "epoch": 0.48851063829787233, + "grad_norm": 1.7218859195709229, + "loss": 5.2323, + "lr": 0.0008292307692307693, + "step": 1722, + "tokens_trained": 0.163683984 + }, + { + "epoch": 0.48907801418439717, + "grad_norm": 1.5534045696258545, + "loss": 5.1983, + "lr": 0.000828951048951049, + "step": 1724, + "tokens_trained": 0.163874968 + }, + { + "epoch": 0.489645390070922, + "grad_norm": 1.3675404787063599, + "loss": 5.2086, + "lr": 0.0008286713286713287, + "step": 1726, + "tokens_trained": 0.164065152 + }, + { + "epoch": 0.4902127659574468, + "grad_norm": 1.5178970098495483, + "loss": 5.2529, + "lr": 0.0008283916083916084, + "step": 1728, + "tokens_trained": 0.164255952 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 1.4910545349121094, + "loss": 5.2931, + "lr": 0.000828111888111888, + "step": 1730, + "tokens_trained": 0.164447112 + }, + { + "epoch": 0.49134751773049645, + "grad_norm": 1.5647637844085693, + "loss": 5.2603, + "lr": 0.0008278321678321679, + "step": 1732, + "tokens_trained": 0.16463704 + }, + { + "epoch": 0.4919148936170213, + "grad_norm": 1.4607906341552734, + "loss": 5.2702, + "lr": 0.0008275524475524475, + "step": 1734, + "tokens_trained": 0.164827312 + }, + { + "epoch": 0.4924822695035461, + "grad_norm": 1.5806026458740234, + "loss": 5.2356, + "lr": 0.0008272727272727273, + "step": 1736, + "tokens_trained": 0.165015224 + }, + { + "epoch": 0.4930496453900709, + "grad_norm": 1.5417263507843018, + "loss": 5.262, + "lr": 0.000826993006993007, + "step": 1738, + "tokens_trained": 0.16520484 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 1.511680245399475, + "loss": 5.2634, + "lr": 0.0008267132867132868, + "step": 1740, + "tokens_trained": 0.165393064 + }, + { + "epoch": 0.49418439716312057, + "grad_norm": 1.4468717575073242, + "loss": 5.2452, + "lr": 0.0008264335664335665, + "step": 1742, + "tokens_trained": 0.165584472 + }, + { + "epoch": 0.4947517730496454, + "grad_norm": 1.423187017440796, + "loss": 5.2533, + "lr": 0.0008261538461538461, + "step": 1744, + "tokens_trained": 0.165773768 + }, + { + "epoch": 0.49531914893617024, + "grad_norm": 1.512462854385376, + "loss": 5.2152, + "lr": 0.0008258741258741259, + "step": 1746, + "tokens_trained": 0.165963456 + }, + { + "epoch": 0.495886524822695, + "grad_norm": 1.4620780944824219, + "loss": 5.2511, + "lr": 0.0008255944055944055, + "step": 1748, + "tokens_trained": 0.166152136 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 1.4943009614944458, + "loss": 5.2829, + "lr": 0.0008253146853146854, + "step": 1750, + "tokens_trained": 0.16634248 + }, + { + "epoch": 0.49645390070921985, + "eval_loss": 5.23966646194458, + "eval_runtime": 20.5954, + "step": 1750, + "tokens_trained": 0.16634248 + }, + { + "epoch": 0.4970212765957447, + "grad_norm": 1.6739267110824585, + "loss": 5.2306, + "lr": 0.000825034965034965, + "step": 1752, + "tokens_trained": 0.166532864 + }, + { + "epoch": 0.4975886524822695, + "grad_norm": 1.6125763654708862, + "loss": 5.2845, + "lr": 0.0008247552447552448, + "step": 1754, + "tokens_trained": 0.166722944 + }, + { + "epoch": 0.4981560283687943, + "grad_norm": 1.5872310400009155, + "loss": 5.2075, + "lr": 0.0008244755244755245, + "step": 1756, + "tokens_trained": 0.16691184 + }, + { + "epoch": 0.49872340425531914, + "grad_norm": 1.4396610260009766, + "loss": 5.2532, + "lr": 0.0008241958041958042, + "step": 1758, + "tokens_trained": 0.167101896 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 1.363879680633545, + "loss": 5.2252, + "lr": 0.000823916083916084, + "step": 1760, + "tokens_trained": 0.167289384 + }, + { + "epoch": 0.4998581560283688, + "grad_norm": 1.395561695098877, + "loss": 5.2097, + "lr": 0.0008236363636363636, + "step": 1762, + "tokens_trained": 0.167479424 + }, + { + "epoch": 0.5004255319148936, + "grad_norm": 1.413736343383789, + "loss": 5.2283, + "lr": 0.0008233566433566434, + "step": 1764, + "tokens_trained": 0.167668256 + }, + { + "epoch": 0.5009929078014185, + "grad_norm": 1.4240859746932983, + "loss": 5.2574, + "lr": 0.000823076923076923, + "step": 1766, + "tokens_trained": 0.167858616 + }, + { + "epoch": 0.5015602836879433, + "grad_norm": 1.437165379524231, + "loss": 5.2511, + "lr": 0.0008227972027972029, + "step": 1768, + "tokens_trained": 0.168048272 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 1.458575963973999, + "loss": 5.2183, + "lr": 0.0008225174825174825, + "step": 1770, + "tokens_trained": 0.168240184 + }, + { + "epoch": 0.5026950354609929, + "grad_norm": 1.5224673748016357, + "loss": 5.259, + "lr": 0.0008222377622377622, + "step": 1772, + "tokens_trained": 0.168429536 + }, + { + "epoch": 0.5032624113475177, + "grad_norm": 1.578438401222229, + "loss": 5.2108, + "lr": 0.000821958041958042, + "step": 1774, + "tokens_trained": 0.168619312 + }, + { + "epoch": 0.5038297872340426, + "grad_norm": 1.4880632162094116, + "loss": 5.229, + "lr": 0.0008216783216783217, + "step": 1776, + "tokens_trained": 0.168808344 + }, + { + "epoch": 0.5043971631205674, + "grad_norm": 1.3741049766540527, + "loss": 5.2873, + "lr": 0.0008213986013986015, + "step": 1778, + "tokens_trained": 0.168999112 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 1.4396610260009766, + "loss": 5.3237, + "lr": 0.0008211188811188811, + "step": 1780, + "tokens_trained": 0.169189288 + }, + { + "epoch": 0.505531914893617, + "grad_norm": 1.4296880960464478, + "loss": 5.2228, + "lr": 0.0008208391608391609, + "step": 1782, + "tokens_trained": 0.16937864 + }, + { + "epoch": 0.5060992907801418, + "grad_norm": 1.5704258680343628, + "loss": 5.2569, + "lr": 0.0008205594405594405, + "step": 1784, + "tokens_trained": 0.169569024 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 1.458261489868164, + "loss": 5.1818, + "lr": 0.0008202797202797203, + "step": 1786, + "tokens_trained": 0.16975932 + }, + { + "epoch": 0.5072340425531915, + "grad_norm": 1.5307244062423706, + "loss": 5.2684, + "lr": 0.00082, + "step": 1788, + "tokens_trained": 0.169949064 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 1.3966363668441772, + "loss": 5.2125, + "lr": 0.0008197202797202797, + "step": 1790, + "tokens_trained": 0.170139352 + }, + { + "epoch": 0.5083687943262412, + "grad_norm": 1.4094839096069336, + "loss": 5.2518, + "lr": 0.0008194405594405595, + "step": 1792, + "tokens_trained": 0.170330336 + }, + { + "epoch": 0.5089361702127659, + "grad_norm": 1.266122817993164, + "loss": 5.2409, + "lr": 0.0008191608391608392, + "step": 1794, + "tokens_trained": 0.170521848 + }, + { + "epoch": 0.5095035460992908, + "grad_norm": 1.3079488277435303, + "loss": 5.182, + "lr": 0.000818881118881119, + "step": 1796, + "tokens_trained": 0.170710664 + }, + { + "epoch": 0.5100709219858156, + "grad_norm": 1.2961090803146362, + "loss": 5.2456, + "lr": 0.0008186013986013986, + "step": 1798, + "tokens_trained": 0.170900016 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 1.3402773141860962, + "loss": 5.1888, + "lr": 0.0008183216783216783, + "step": 1800, + "tokens_trained": 0.171089824 + }, + { + "epoch": 0.5112056737588653, + "grad_norm": 1.386769413948059, + "loss": 5.1715, + "lr": 0.000818041958041958, + "step": 1802, + "tokens_trained": 0.171279448 + }, + { + "epoch": 0.51177304964539, + "grad_norm": 1.4280421733856201, + "loss": 5.2131, + "lr": 0.0008177622377622378, + "step": 1804, + "tokens_trained": 0.17147048 + }, + { + "epoch": 0.512340425531915, + "grad_norm": 1.4805412292480469, + "loss": 5.2379, + "lr": 0.0008174825174825175, + "step": 1806, + "tokens_trained": 0.171662264 + }, + { + "epoch": 0.5129078014184397, + "grad_norm": 1.4608936309814453, + "loss": 5.2412, + "lr": 0.0008172027972027972, + "step": 1808, + "tokens_trained": 0.171853176 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 1.550136923789978, + "loss": 5.1828, + "lr": 0.000816923076923077, + "step": 1810, + "tokens_trained": 0.172043344 + }, + { + "epoch": 0.5140425531914894, + "grad_norm": 1.4756869077682495, + "loss": 5.199, + "lr": 0.0008166433566433567, + "step": 1812, + "tokens_trained": 0.172231952 + }, + { + "epoch": 0.5146099290780142, + "grad_norm": 1.4199044704437256, + "loss": 5.2074, + "lr": 0.0008163636363636364, + "step": 1814, + "tokens_trained": 0.172420376 + }, + { + "epoch": 0.5151773049645391, + "grad_norm": 1.3477959632873535, + "loss": 5.1672, + "lr": 0.0008160839160839161, + "step": 1816, + "tokens_trained": 0.172610248 + }, + { + "epoch": 0.5157446808510638, + "grad_norm": 1.3331218957901, + "loss": 5.2267, + "lr": 0.0008158041958041958, + "step": 1818, + "tokens_trained": 0.172799168 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 1.2391384840011597, + "loss": 5.2088, + "lr": 0.0008155244755244755, + "step": 1820, + "tokens_trained": 0.172989328 + }, + { + "epoch": 0.5168794326241135, + "grad_norm": 1.3377013206481934, + "loss": 5.2279, + "lr": 0.0008152447552447553, + "step": 1822, + "tokens_trained": 0.173179376 + }, + { + "epoch": 0.5174468085106383, + "grad_norm": 1.285628318786621, + "loss": 5.3006, + "lr": 0.000814965034965035, + "step": 1824, + "tokens_trained": 0.173370408 + }, + { + "epoch": 0.5180141843971631, + "grad_norm": 1.2010120153427124, + "loss": 5.2264, + "lr": 0.0008146853146853147, + "step": 1826, + "tokens_trained": 0.173561144 + }, + { + "epoch": 0.518581560283688, + "grad_norm": 1.2953096628189087, + "loss": 5.1879, + "lr": 0.0008144055944055944, + "step": 1828, + "tokens_trained": 0.173753592 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 1.256910800933838, + "loss": 5.2402, + "lr": 0.0008141258741258742, + "step": 1830, + "tokens_trained": 0.173943752 + }, + { + "epoch": 0.5197163120567376, + "grad_norm": 1.338755488395691, + "loss": 5.2556, + "lr": 0.0008138461538461539, + "step": 1832, + "tokens_trained": 0.174130504 + }, + { + "epoch": 0.5202836879432624, + "grad_norm": 1.380715012550354, + "loss": 5.2047, + "lr": 0.0008135664335664336, + "step": 1834, + "tokens_trained": 0.174322088 + }, + { + "epoch": 0.5208510638297872, + "grad_norm": 1.4989492893218994, + "loss": 5.1873, + "lr": 0.0008132867132867133, + "step": 1836, + "tokens_trained": 0.17451164 + }, + { + "epoch": 0.5214184397163121, + "grad_norm": 1.3239110708236694, + "loss": 5.202, + "lr": 0.000813006993006993, + "step": 1838, + "tokens_trained": 0.174701896 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 1.397745132446289, + "loss": 5.2259, + "lr": 0.0008127272727272728, + "step": 1840, + "tokens_trained": 0.174892336 + }, + { + "epoch": 0.5225531914893617, + "grad_norm": 1.3992305994033813, + "loss": 5.1771, + "lr": 0.0008124475524475524, + "step": 1842, + "tokens_trained": 0.17508276 + }, + { + "epoch": 0.5231205673758865, + "grad_norm": 1.38923180103302, + "loss": 5.1981, + "lr": 0.0008121678321678322, + "step": 1844, + "tokens_trained": 0.175273272 + }, + { + "epoch": 0.5236879432624113, + "grad_norm": 1.478642225265503, + "loss": 5.2533, + "lr": 0.0008118881118881119, + "step": 1846, + "tokens_trained": 0.175462352 + }, + { + "epoch": 0.5242553191489362, + "grad_norm": 1.332709789276123, + "loss": 5.2205, + "lr": 0.0008116083916083917, + "step": 1848, + "tokens_trained": 0.175648128 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 1.4612590074539185, + "loss": 5.2207, + "lr": 0.0008113286713286714, + "step": 1850, + "tokens_trained": 0.175837712 + }, + { + "epoch": 0.5253900709219859, + "grad_norm": 1.4682700634002686, + "loss": 5.2576, + "lr": 0.000811048951048951, + "step": 1852, + "tokens_trained": 0.176029512 + }, + { + "epoch": 0.5259574468085106, + "grad_norm": 1.3380264043807983, + "loss": 5.2435, + "lr": 0.0008107692307692308, + "step": 1854, + "tokens_trained": 0.176220432 + }, + { + "epoch": 0.5265248226950354, + "grad_norm": 1.2452281713485718, + "loss": 5.2973, + "lr": 0.0008104895104895104, + "step": 1856, + "tokens_trained": 0.176412144 + }, + { + "epoch": 0.5270921985815603, + "grad_norm": 1.392592191696167, + "loss": 5.2028, + "lr": 0.0008102097902097903, + "step": 1858, + "tokens_trained": 0.17660144 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 1.4258657693862915, + "loss": 5.2342, + "lr": 0.0008099300699300699, + "step": 1860, + "tokens_trained": 0.176790424 + }, + { + "epoch": 0.52822695035461, + "grad_norm": 1.4627033472061157, + "loss": 5.1732, + "lr": 0.0008096503496503497, + "step": 1862, + "tokens_trained": 0.176983296 + }, + { + "epoch": 0.5287943262411348, + "grad_norm": 1.4448645114898682, + "loss": 5.2001, + "lr": 0.0008093706293706294, + "step": 1864, + "tokens_trained": 0.177174544 + }, + { + "epoch": 0.5293617021276595, + "grad_norm": 1.3879749774932861, + "loss": 5.1642, + "lr": 0.0008090909090909092, + "step": 1866, + "tokens_trained": 0.17736428 + }, + { + "epoch": 0.5299290780141844, + "grad_norm": 1.2791417837142944, + "loss": 5.1975, + "lr": 0.0008088111888111889, + "step": 1868, + "tokens_trained": 0.177553752 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 1.3620632886886597, + "loss": 5.1742, + "lr": 0.0008085314685314685, + "step": 1870, + "tokens_trained": 0.177746448 + }, + { + "epoch": 0.531063829787234, + "grad_norm": 1.2759565114974976, + "loss": 5.2076, + "lr": 0.0008082517482517483, + "step": 1872, + "tokens_trained": 0.177937888 + }, + { + "epoch": 0.5316312056737589, + "grad_norm": 1.3390915393829346, + "loss": 5.2387, + "lr": 0.0008079720279720279, + "step": 1874, + "tokens_trained": 0.178127776 + }, + { + "epoch": 0.5319148936170213, + "eval_loss": 5.228371620178223, + "eval_runtime": 20.9372, + "step": 1875, + "tokens_trained": 0.17822376 + }, + { + "epoch": 0.5321985815602837, + "grad_norm": 1.3872885704040527, + "loss": 5.2053, + "lr": 0.0008076923076923078, + "step": 1876, + "tokens_trained": 0.178318616 + }, + { + "epoch": 0.5327659574468085, + "grad_norm": 1.4238568544387817, + "loss": 5.2091, + "lr": 0.0008074125874125874, + "step": 1878, + "tokens_trained": 0.178509272 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.3352588415145874, + "loss": 5.2471, + "lr": 0.0008071328671328671, + "step": 1880, + "tokens_trained": 0.178698016 + }, + { + "epoch": 0.5339007092198581, + "grad_norm": 1.2931993007659912, + "loss": 5.2315, + "lr": 0.0008068531468531469, + "step": 1882, + "tokens_trained": 0.17888628 + }, + { + "epoch": 0.534468085106383, + "grad_norm": 1.3475919961929321, + "loss": 5.2337, + "lr": 0.0008065734265734265, + "step": 1884, + "tokens_trained": 0.179076944 + }, + { + "epoch": 0.5350354609929078, + "grad_norm": 1.3263812065124512, + "loss": 5.2017, + "lr": 0.0008062937062937064, + "step": 1886, + "tokens_trained": 0.179266128 + }, + { + "epoch": 0.5356028368794327, + "grad_norm": 1.3956594467163086, + "loss": 5.1907, + "lr": 0.000806013986013986, + "step": 1888, + "tokens_trained": 0.179454848 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 1.4399393796920776, + "loss": 5.216, + "lr": 0.0008057342657342658, + "step": 1890, + "tokens_trained": 0.179643992 + }, + { + "epoch": 0.5367375886524822, + "grad_norm": 1.278714656829834, + "loss": 5.1689, + "lr": 0.0008054545454545454, + "step": 1892, + "tokens_trained": 0.179831416 + }, + { + "epoch": 0.5373049645390071, + "grad_norm": 1.3517796993255615, + "loss": 5.1319, + "lr": 0.0008051748251748253, + "step": 1894, + "tokens_trained": 0.180022528 + }, + { + "epoch": 0.5378723404255319, + "grad_norm": 1.2710460424423218, + "loss": 5.1619, + "lr": 0.0008048951048951049, + "step": 1896, + "tokens_trained": 0.180212936 + }, + { + "epoch": 0.5384397163120568, + "grad_norm": 1.3603075742721558, + "loss": 5.1615, + "lr": 0.0008046153846153846, + "step": 1898, + "tokens_trained": 0.180404648 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 1.422122836112976, + "loss": 5.1801, + "lr": 0.0008043356643356644, + "step": 1900, + "tokens_trained": 0.18059388 + }, + { + "epoch": 0.5395744680851063, + "grad_norm": 1.4242218732833862, + "loss": 5.2367, + "lr": 0.000804055944055944, + "step": 1902, + "tokens_trained": 0.180783248 + }, + { + "epoch": 0.5401418439716312, + "grad_norm": 1.4476134777069092, + "loss": 5.252, + "lr": 0.0008037762237762239, + "step": 1904, + "tokens_trained": 0.180971152 + }, + { + "epoch": 0.540709219858156, + "grad_norm": 1.4724863767623901, + "loss": 5.2042, + "lr": 0.0008034965034965035, + "step": 1906, + "tokens_trained": 0.181159992 + }, + { + "epoch": 0.5412765957446809, + "grad_norm": 1.4014806747436523, + "loss": 5.2514, + "lr": 0.0008032167832167832, + "step": 1908, + "tokens_trained": 0.18135032 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 1.3511682748794556, + "loss": 5.2036, + "lr": 0.0008029370629370629, + "step": 1910, + "tokens_trained": 0.181540312 + }, + { + "epoch": 0.5424113475177305, + "grad_norm": 1.3011739253997803, + "loss": 5.24, + "lr": 0.0008026573426573427, + "step": 1912, + "tokens_trained": 0.181731104 + }, + { + "epoch": 0.5429787234042553, + "grad_norm": 1.2753015756607056, + "loss": 5.25, + "lr": 0.0008023776223776224, + "step": 1914, + "tokens_trained": 0.18192008 + }, + { + "epoch": 0.5435460992907801, + "grad_norm": 1.4685192108154297, + "loss": 5.1619, + "lr": 0.0008020979020979021, + "step": 1916, + "tokens_trained": 0.182110072 + }, + { + "epoch": 0.544113475177305, + "grad_norm": 1.4695900678634644, + "loss": 5.2626, + "lr": 0.0008018181818181818, + "step": 1918, + "tokens_trained": 0.182300224 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 1.4895613193511963, + "loss": 5.1766, + "lr": 0.0008015384615384615, + "step": 1920, + "tokens_trained": 0.182490712 + }, + { + "epoch": 0.5452482269503546, + "grad_norm": 1.3073184490203857, + "loss": 5.2281, + "lr": 0.0008012587412587414, + "step": 1922, + "tokens_trained": 0.182681168 + }, + { + "epoch": 0.5458156028368795, + "grad_norm": 1.2414125204086304, + "loss": 5.2099, + "lr": 0.000800979020979021, + "step": 1924, + "tokens_trained": 0.182870504 + }, + { + "epoch": 0.5463829787234042, + "grad_norm": 1.2407176494598389, + "loss": 5.1116, + "lr": 0.0008006993006993007, + "step": 1926, + "tokens_trained": 0.1830618 + }, + { + "epoch": 0.546950354609929, + "grad_norm": 1.4507744312286377, + "loss": 5.1658, + "lr": 0.0008004195804195804, + "step": 1928, + "tokens_trained": 0.183250072 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 1.348907232284546, + "loss": 5.231, + "lr": 0.0008001398601398602, + "step": 1930, + "tokens_trained": 0.18344004 + }, + { + "epoch": 0.5480851063829787, + "grad_norm": 1.4393324851989746, + "loss": 5.2393, + "lr": 0.0007998601398601399, + "step": 1932, + "tokens_trained": 0.183630032 + }, + { + "epoch": 0.5486524822695036, + "grad_norm": 1.3569602966308594, + "loss": 5.2068, + "lr": 0.0007995804195804196, + "step": 1934, + "tokens_trained": 0.183820816 + }, + { + "epoch": 0.5492198581560284, + "grad_norm": 1.362021803855896, + "loss": 5.1641, + "lr": 0.0007993006993006992, + "step": 1936, + "tokens_trained": 0.184009824 + }, + { + "epoch": 0.5497872340425531, + "grad_norm": 1.2926445007324219, + "loss": 5.1983, + "lr": 0.000799020979020979, + "step": 1938, + "tokens_trained": 0.184199544 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 1.3065440654754639, + "loss": 5.3009, + "lr": 0.0007987412587412588, + "step": 1940, + "tokens_trained": 0.1843906 + }, + { + "epoch": 0.5509219858156028, + "grad_norm": 1.3288060426712036, + "loss": 5.2347, + "lr": 0.0007984615384615385, + "step": 1942, + "tokens_trained": 0.184580304 + }, + { + "epoch": 0.5514893617021277, + "grad_norm": 1.4742496013641357, + "loss": 5.1497, + "lr": 0.0007981818181818182, + "step": 1944, + "tokens_trained": 0.184771832 + }, + { + "epoch": 0.5520567375886525, + "grad_norm": 1.3907397985458374, + "loss": 5.2001, + "lr": 0.0007979020979020979, + "step": 1946, + "tokens_trained": 0.184963744 + }, + { + "epoch": 0.5526241134751773, + "grad_norm": 1.3324332237243652, + "loss": 5.2056, + "lr": 0.0007976223776223777, + "step": 1948, + "tokens_trained": 0.185152248 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.258155345916748, + "loss": 5.1999, + "lr": 0.0007973426573426573, + "step": 1950, + "tokens_trained": 0.18534196 + }, + { + "epoch": 0.5537588652482269, + "grad_norm": 1.3515956401824951, + "loss": 5.1988, + "lr": 0.0007970629370629371, + "step": 1952, + "tokens_trained": 0.18553156 + }, + { + "epoch": 0.5543262411347518, + "grad_norm": 1.535507321357727, + "loss": 5.2198, + "lr": 0.0007967832167832167, + "step": 1954, + "tokens_trained": 0.185719792 + }, + { + "epoch": 0.5548936170212766, + "grad_norm": 1.3124226331710815, + "loss": 5.1468, + "lr": 0.0007965034965034965, + "step": 1956, + "tokens_trained": 0.18591288 + }, + { + "epoch": 0.5554609929078014, + "grad_norm": 1.2720654010772705, + "loss": 5.1939, + "lr": 0.0007962237762237763, + "step": 1958, + "tokens_trained": 0.186102344 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 1.2731753587722778, + "loss": 5.2063, + "lr": 0.000795944055944056, + "step": 1960, + "tokens_trained": 0.186291976 + }, + { + "epoch": 0.556595744680851, + "grad_norm": 1.3020576238632202, + "loss": 5.266, + "lr": 0.0007956643356643357, + "step": 1962, + "tokens_trained": 0.186483504 + }, + { + "epoch": 0.5571631205673759, + "grad_norm": 1.300626277923584, + "loss": 5.2159, + "lr": 0.0007953846153846153, + "step": 1964, + "tokens_trained": 0.18667372 + }, + { + "epoch": 0.5577304964539007, + "grad_norm": 1.3075426816940308, + "loss": 5.2136, + "lr": 0.0007951048951048952, + "step": 1966, + "tokens_trained": 0.186864808 + }, + { + "epoch": 0.5582978723404255, + "grad_norm": 1.4623394012451172, + "loss": 5.2081, + "lr": 0.0007948251748251748, + "step": 1968, + "tokens_trained": 0.187056272 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 1.4950625896453857, + "loss": 5.1885, + "lr": 0.0007945454545454546, + "step": 1970, + "tokens_trained": 0.187244464 + }, + { + "epoch": 0.5594326241134752, + "grad_norm": 1.517152190208435, + "loss": 5.2558, + "lr": 0.0007942657342657342, + "step": 1972, + "tokens_trained": 0.187433216 + }, + { + "epoch": 0.56, + "grad_norm": 1.4226372241973877, + "loss": 5.236, + "lr": 0.000793986013986014, + "step": 1974, + "tokens_trained": 0.187622632 + }, + { + "epoch": 0.5605673758865248, + "grad_norm": 1.3692735433578491, + "loss": 5.2089, + "lr": 0.0007937062937062938, + "step": 1976, + "tokens_trained": 0.18781324 + }, + { + "epoch": 0.5611347517730496, + "grad_norm": 1.3344841003417969, + "loss": 5.2052, + "lr": 0.0007934265734265734, + "step": 1978, + "tokens_trained": 0.188002488 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 1.3929632902145386, + "loss": 5.2353, + "lr": 0.0007931468531468532, + "step": 1980, + "tokens_trained": 0.188194712 + }, + { + "epoch": 0.5622695035460993, + "grad_norm": 1.3147000074386597, + "loss": 5.2071, + "lr": 0.0007928671328671328, + "step": 1982, + "tokens_trained": 0.188387056 + }, + { + "epoch": 0.5628368794326241, + "grad_norm": 1.351483702659607, + "loss": 5.2196, + "lr": 0.0007925874125874127, + "step": 1984, + "tokens_trained": 0.188579048 + }, + { + "epoch": 0.563404255319149, + "grad_norm": 1.3840581178665161, + "loss": 5.1889, + "lr": 0.0007923076923076923, + "step": 1986, + "tokens_trained": 0.18876896 + }, + { + "epoch": 0.5639716312056737, + "grad_norm": 1.3427214622497559, + "loss": 5.192, + "lr": 0.000792027972027972, + "step": 1988, + "tokens_trained": 0.18895832 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 1.2931344509124756, + "loss": 5.1942, + "lr": 0.0007917482517482517, + "step": 1990, + "tokens_trained": 0.18915036 + }, + { + "epoch": 0.5651063829787234, + "grad_norm": 1.2408664226531982, + "loss": 5.2014, + "lr": 0.0007914685314685314, + "step": 1992, + "tokens_trained": 0.189339784 + }, + { + "epoch": 0.5656737588652482, + "grad_norm": 1.342760682106018, + "loss": 5.2056, + "lr": 0.0007911888111888113, + "step": 1994, + "tokens_trained": 0.189530776 + }, + { + "epoch": 0.5662411347517731, + "grad_norm": 1.2647815942764282, + "loss": 5.2338, + "lr": 0.0007909090909090909, + "step": 1996, + "tokens_trained": 0.189720312 + }, + { + "epoch": 0.5668085106382978, + "grad_norm": 1.1956689357757568, + "loss": 5.1464, + "lr": 0.0007906293706293707, + "step": 1998, + "tokens_trained": 0.189909592 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 1.287185549736023, + "loss": 5.1919, + "lr": 0.0007903496503496503, + "step": 2000, + "tokens_trained": 0.190100544 + }, + { + "epoch": 0.5673758865248227, + "eval_loss": 5.208409309387207, + "eval_runtime": 21.1643, + "step": 2000, + "tokens_trained": 0.190100544 + }, + { + "epoch": 0.5679432624113475, + "grad_norm": 1.3409695625305176, + "loss": 5.1723, + "lr": 0.0007900699300699302, + "step": 2002, + "tokens_trained": 0.190291792 + }, + { + "epoch": 0.5685106382978723, + "grad_norm": 1.3951654434204102, + "loss": 5.243, + "lr": 0.0007897902097902098, + "step": 2004, + "tokens_trained": 0.190481864 + }, + { + "epoch": 0.5690780141843972, + "grad_norm": 1.2949507236480713, + "loss": 5.2248, + "lr": 0.0007895104895104895, + "step": 2006, + "tokens_trained": 0.19067228 + }, + { + "epoch": 0.569645390070922, + "grad_norm": 1.3585959672927856, + "loss": 5.1889, + "lr": 0.0007892307692307692, + "step": 2008, + "tokens_trained": 0.190860368 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 1.2834774255752563, + "loss": 5.2067, + "lr": 0.0007889510489510489, + "step": 2010, + "tokens_trained": 0.191051904 + }, + { + "epoch": 0.5707801418439716, + "grad_norm": 1.3544108867645264, + "loss": 5.2041, + "lr": 0.0007886713286713288, + "step": 2012, + "tokens_trained": 0.191242688 + }, + { + "epoch": 0.5713475177304964, + "grad_norm": 1.3536330461502075, + "loss": 5.2131, + "lr": 0.0007883916083916084, + "step": 2014, + "tokens_trained": 0.191431104 + }, + { + "epoch": 0.5719148936170213, + "grad_norm": 1.337441325187683, + "loss": 5.2036, + "lr": 0.0007881118881118882, + "step": 2016, + "tokens_trained": 0.19162204 + }, + { + "epoch": 0.5724822695035461, + "grad_norm": 1.4701579809188843, + "loss": 5.2049, + "lr": 0.0007878321678321678, + "step": 2018, + "tokens_trained": 0.191813352 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 1.4354153871536255, + "loss": 5.2583, + "lr": 0.0007875524475524476, + "step": 2020, + "tokens_trained": 0.192004064 + }, + { + "epoch": 0.5736170212765958, + "grad_norm": 1.358913540840149, + "loss": 5.1961, + "lr": 0.0007872727272727273, + "step": 2022, + "tokens_trained": 0.192193232 + }, + { + "epoch": 0.5741843971631205, + "grad_norm": 1.3889496326446533, + "loss": 5.1755, + "lr": 0.000786993006993007, + "step": 2024, + "tokens_trained": 0.192385416 + }, + { + "epoch": 0.5747517730496454, + "grad_norm": 1.4138504266738892, + "loss": 5.2423, + "lr": 0.0007867132867132867, + "step": 2026, + "tokens_trained": 0.192575904 + }, + { + "epoch": 0.5753191489361702, + "grad_norm": 1.2651748657226562, + "loss": 5.1574, + "lr": 0.0007864335664335664, + "step": 2028, + "tokens_trained": 0.192765568 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 1.304296612739563, + "loss": 5.1978, + "lr": 0.0007861538461538463, + "step": 2030, + "tokens_trained": 0.192956176 + }, + { + "epoch": 0.5764539007092199, + "grad_norm": 1.2884007692337036, + "loss": 5.1945, + "lr": 0.0007858741258741259, + "step": 2032, + "tokens_trained": 0.193146208 + }, + { + "epoch": 0.5770212765957446, + "grad_norm": 1.4838171005249023, + "loss": 5.1348, + "lr": 0.0007855944055944056, + "step": 2034, + "tokens_trained": 0.193335664 + }, + { + "epoch": 0.5775886524822695, + "grad_norm": 1.456529974937439, + "loss": 5.2284, + "lr": 0.0007853146853146853, + "step": 2036, + "tokens_trained": 0.193525216 + }, + { + "epoch": 0.5781560283687943, + "grad_norm": 1.3471657037734985, + "loss": 5.2101, + "lr": 0.0007850349650349651, + "step": 2038, + "tokens_trained": 0.19371268 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 1.3996837139129639, + "loss": 5.1828, + "lr": 0.0007847552447552448, + "step": 2040, + "tokens_trained": 0.193903536 + }, + { + "epoch": 0.579290780141844, + "grad_norm": 1.4071470499038696, + "loss": 5.1724, + "lr": 0.0007844755244755245, + "step": 2042, + "tokens_trained": 0.194092384 + }, + { + "epoch": 0.5798581560283688, + "grad_norm": 1.4125159978866577, + "loss": 5.1602, + "lr": 0.0007841958041958041, + "step": 2044, + "tokens_trained": 0.19428356 + }, + { + "epoch": 0.5804255319148937, + "grad_norm": 1.3602298498153687, + "loss": 5.1904, + "lr": 0.0007839160839160839, + "step": 2046, + "tokens_trained": 0.194473352 + }, + { + "epoch": 0.5809929078014184, + "grad_norm": 1.2836074829101562, + "loss": 5.1648, + "lr": 0.0007836363636363637, + "step": 2048, + "tokens_trained": 0.194663624 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 1.306192398071289, + "loss": 5.2037, + "lr": 0.0007833566433566434, + "step": 2050, + "tokens_trained": 0.194854 + }, + { + "epoch": 0.5821276595744681, + "grad_norm": 1.3130674362182617, + "loss": 5.223, + "lr": 0.0007830769230769231, + "step": 2052, + "tokens_trained": 0.195044368 + }, + { + "epoch": 0.5826950354609929, + "grad_norm": 1.2337714433670044, + "loss": 5.1609, + "lr": 0.0007827972027972028, + "step": 2054, + "tokens_trained": 0.195237064 + }, + { + "epoch": 0.5832624113475178, + "grad_norm": 1.2249869108200073, + "loss": 5.1352, + "lr": 0.0007825174825174826, + "step": 2056, + "tokens_trained": 0.195425016 + }, + { + "epoch": 0.5838297872340426, + "grad_norm": 1.2610726356506348, + "loss": 5.2304, + "lr": 0.0007822377622377622, + "step": 2058, + "tokens_trained": 0.195614488 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 1.1917920112609863, + "loss": 5.1964, + "lr": 0.000781958041958042, + "step": 2060, + "tokens_trained": 0.19580392 + }, + { + "epoch": 0.5849645390070922, + "grad_norm": 1.2248187065124512, + "loss": 5.0901, + "lr": 0.0007816783216783216, + "step": 2062, + "tokens_trained": 0.195993096 + }, + { + "epoch": 0.585531914893617, + "grad_norm": 1.4138745069503784, + "loss": 5.1806, + "lr": 0.0007813986013986014, + "step": 2064, + "tokens_trained": 0.196183824 + }, + { + "epoch": 0.5860992907801419, + "grad_norm": 1.389195442199707, + "loss": 5.1813, + "lr": 0.0007811188811188812, + "step": 2066, + "tokens_trained": 0.196373912 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.2737247943878174, + "loss": 5.1935, + "lr": 0.0007808391608391609, + "step": 2068, + "tokens_trained": 0.196564696 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 1.443703293800354, + "loss": 5.2376, + "lr": 0.0007805594405594406, + "step": 2070, + "tokens_trained": 0.196754472 + }, + { + "epoch": 0.5878014184397163, + "grad_norm": 1.367251992225647, + "loss": 5.2505, + "lr": 0.0007802797202797202, + "step": 2072, + "tokens_trained": 0.196945288 + }, + { + "epoch": 0.5883687943262411, + "grad_norm": 1.4049919843673706, + "loss": 5.2155, + "lr": 0.0007800000000000001, + "step": 2074, + "tokens_trained": 0.197135328 + }, + { + "epoch": 0.588936170212766, + "grad_norm": 1.5119894742965698, + "loss": 5.189, + "lr": 0.0007797202797202797, + "step": 2076, + "tokens_trained": 0.197325152 + }, + { + "epoch": 0.5895035460992908, + "grad_norm": 1.349288821220398, + "loss": 5.1626, + "lr": 0.0007794405594405595, + "step": 2078, + "tokens_trained": 0.197514576 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 1.2594739198684692, + "loss": 5.2222, + "lr": 0.0007791608391608391, + "step": 2080, + "tokens_trained": 0.197705064 + }, + { + "epoch": 0.5906382978723405, + "grad_norm": 1.0747008323669434, + "loss": 5.1669, + "lr": 0.0007788811188811189, + "step": 2082, + "tokens_trained": 0.197895032 + }, + { + "epoch": 0.5912056737588652, + "grad_norm": 1.1089273691177368, + "loss": 5.1071, + "lr": 0.0007786013986013987, + "step": 2084, + "tokens_trained": 0.198085832 + }, + { + "epoch": 0.5917730496453901, + "grad_norm": 1.153296709060669, + "loss": 5.1483, + "lr": 0.0007783216783216783, + "step": 2086, + "tokens_trained": 0.198272104 + }, + { + "epoch": 0.5923404255319149, + "grad_norm": 1.1960811614990234, + "loss": 5.1703, + "lr": 0.0007780419580419581, + "step": 2088, + "tokens_trained": 0.198459976 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 1.073548674583435, + "loss": 5.2449, + "lr": 0.0007777622377622377, + "step": 2090, + "tokens_trained": 0.198648376 + }, + { + "epoch": 0.5934751773049646, + "grad_norm": 1.233362078666687, + "loss": 5.1987, + "lr": 0.0007774825174825176, + "step": 2092, + "tokens_trained": 0.198839144 + }, + { + "epoch": 0.5940425531914894, + "grad_norm": 1.3649506568908691, + "loss": 5.183, + "lr": 0.0007772027972027972, + "step": 2094, + "tokens_trained": 0.199029064 + }, + { + "epoch": 0.5946099290780141, + "grad_norm": 1.2620112895965576, + "loss": 5.1343, + "lr": 0.000776923076923077, + "step": 2096, + "tokens_trained": 0.199218376 + }, + { + "epoch": 0.595177304964539, + "grad_norm": 1.3836737871170044, + "loss": 5.248, + "lr": 0.0007766433566433566, + "step": 2098, + "tokens_trained": 0.199407736 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.3027995824813843, + "loss": 5.1813, + "lr": 0.0007763636363636363, + "step": 2100, + "tokens_trained": 0.199597888 + }, + { + "epoch": 0.5963120567375887, + "grad_norm": 1.2857698202133179, + "loss": 5.2111, + "lr": 0.0007760839160839162, + "step": 2102, + "tokens_trained": 0.19978852 + }, + { + "epoch": 0.5968794326241135, + "grad_norm": 1.3470538854599, + "loss": 5.1505, + "lr": 0.0007758041958041958, + "step": 2104, + "tokens_trained": 0.199978536 + }, + { + "epoch": 0.5974468085106382, + "grad_norm": 1.230573058128357, + "loss": 5.1222, + "lr": 0.0007755244755244756, + "step": 2106, + "tokens_trained": 0.200170024 + }, + { + "epoch": 0.5980141843971631, + "grad_norm": 1.2551500797271729, + "loss": 5.1297, + "lr": 0.0007752447552447552, + "step": 2108, + "tokens_trained": 0.20035992 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 1.2162272930145264, + "loss": 5.233, + "lr": 0.0007749650349650351, + "step": 2110, + "tokens_trained": 0.200548976 + }, + { + "epoch": 0.5991489361702128, + "grad_norm": 1.2617305517196655, + "loss": 5.2118, + "lr": 0.0007746853146853147, + "step": 2112, + "tokens_trained": 0.200740656 + }, + { + "epoch": 0.5997163120567376, + "grad_norm": 1.4057862758636475, + "loss": 5.2215, + "lr": 0.0007744055944055944, + "step": 2114, + "tokens_trained": 0.200930944 + }, + { + "epoch": 0.6002836879432624, + "grad_norm": 1.3729593753814697, + "loss": 5.1773, + "lr": 0.0007741258741258741, + "step": 2116, + "tokens_trained": 0.201122528 + }, + { + "epoch": 0.6008510638297873, + "grad_norm": 1.3300920724868774, + "loss": 5.1573, + "lr": 0.0007738461538461538, + "step": 2118, + "tokens_trained": 0.201310224 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 1.33209228515625, + "loss": 5.1523, + "lr": 0.0007735664335664337, + "step": 2120, + "tokens_trained": 0.201499048 + }, + { + "epoch": 0.6019858156028369, + "grad_norm": 1.1407768726348877, + "loss": 5.1453, + "lr": 0.0007732867132867133, + "step": 2122, + "tokens_trained": 0.201688872 + }, + { + "epoch": 0.6025531914893617, + "grad_norm": 1.1250742673873901, + "loss": 5.173, + "lr": 0.0007730069930069931, + "step": 2124, + "tokens_trained": 0.201880504 + }, + { + "epoch": 0.6028368794326241, + "eval_loss": 5.190411567687988, + "eval_runtime": 20.812, + "step": 2125, + "tokens_trained": 0.201976984 + }, + { + "epoch": 0.6031205673758865, + "grad_norm": 1.2974287271499634, + "loss": 5.1878, + "lr": 0.0007727272727272727, + "step": 2126, + "tokens_trained": 0.20207104 + }, + { + "epoch": 0.6036879432624114, + "grad_norm": 1.251120924949646, + "loss": 5.203, + "lr": 0.0007724475524475525, + "step": 2128, + "tokens_trained": 0.202261848 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 1.3494654893875122, + "loss": 5.1981, + "lr": 0.0007721678321678322, + "step": 2130, + "tokens_trained": 0.202452936 + }, + { + "epoch": 0.604822695035461, + "grad_norm": 1.2586653232574463, + "loss": 5.1786, + "lr": 0.0007718881118881119, + "step": 2132, + "tokens_trained": 0.202642168 + }, + { + "epoch": 0.6053900709219858, + "grad_norm": 1.228868842124939, + "loss": 5.1651, + "lr": 0.0007716083916083916, + "step": 2134, + "tokens_trained": 0.202830528 + }, + { + "epoch": 0.6059574468085106, + "grad_norm": 1.25627863407135, + "loss": 5.2033, + "lr": 0.0007713286713286713, + "step": 2136, + "tokens_trained": 0.203022216 + }, + { + "epoch": 0.6065248226950355, + "grad_norm": 1.1568467617034912, + "loss": 5.1659, + "lr": 0.0007710489510489512, + "step": 2138, + "tokens_trained": 0.203211696 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 1.1502138376235962, + "loss": 5.1935, + "lr": 0.0007707692307692308, + "step": 2140, + "tokens_trained": 0.203403224 + }, + { + "epoch": 0.6076595744680852, + "grad_norm": 1.2491158246994019, + "loss": 5.1367, + "lr": 0.0007704895104895105, + "step": 2142, + "tokens_trained": 0.203594912 + }, + { + "epoch": 0.6082269503546099, + "grad_norm": 1.3012075424194336, + "loss": 5.1954, + "lr": 0.0007702097902097902, + "step": 2144, + "tokens_trained": 0.203787032 + }, + { + "epoch": 0.6087943262411347, + "grad_norm": 1.2956688404083252, + "loss": 5.2255, + "lr": 0.0007699300699300699, + "step": 2146, + "tokens_trained": 0.203979064 + }, + { + "epoch": 0.6093617021276596, + "grad_norm": 1.3562579154968262, + "loss": 5.2371, + "lr": 0.0007696503496503497, + "step": 2148, + "tokens_trained": 0.20416828 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 1.2726640701293945, + "loss": 5.154, + "lr": 0.0007693706293706294, + "step": 2150, + "tokens_trained": 0.20435532 + }, + { + "epoch": 0.6104964539007092, + "grad_norm": 1.1975597143173218, + "loss": 5.1559, + "lr": 0.000769090909090909, + "step": 2152, + "tokens_trained": 0.204545416 + }, + { + "epoch": 0.6110638297872341, + "grad_norm": 1.2840410470962524, + "loss": 5.2558, + "lr": 0.0007688111888111888, + "step": 2154, + "tokens_trained": 0.204734752 + }, + { + "epoch": 0.6116312056737588, + "grad_norm": 1.4807062149047852, + "loss": 5.229, + "lr": 0.0007685314685314686, + "step": 2156, + "tokens_trained": 0.204925432 + }, + { + "epoch": 0.6121985815602837, + "grad_norm": 1.3909307718276978, + "loss": 5.2128, + "lr": 0.0007682517482517483, + "step": 2158, + "tokens_trained": 0.205117624 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 1.3998613357543945, + "loss": 5.1344, + "lr": 0.000767972027972028, + "step": 2160, + "tokens_trained": 0.205309032 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 1.3821474313735962, + "loss": 5.2223, + "lr": 0.0007676923076923077, + "step": 2162, + "tokens_trained": 0.205498112 + }, + { + "epoch": 0.6139007092198582, + "grad_norm": 1.280150294303894, + "loss": 5.1357, + "lr": 0.0007674125874125874, + "step": 2164, + "tokens_trained": 0.205686112 + }, + { + "epoch": 0.614468085106383, + "grad_norm": 1.2361094951629639, + "loss": 5.1285, + "lr": 0.0007671328671328672, + "step": 2166, + "tokens_trained": 0.20587828 + }, + { + "epoch": 0.6150354609929078, + "grad_norm": 1.1495496034622192, + "loss": 5.1597, + "lr": 0.0007668531468531469, + "step": 2168, + "tokens_trained": 0.206068272 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 1.2377156019210815, + "loss": 5.1208, + "lr": 0.0007665734265734265, + "step": 2170, + "tokens_trained": 0.206257272 + }, + { + "epoch": 0.6161702127659574, + "grad_norm": 1.226664423942566, + "loss": 5.2143, + "lr": 0.0007662937062937063, + "step": 2172, + "tokens_trained": 0.206449824 + }, + { + "epoch": 0.6167375886524823, + "grad_norm": 1.1939537525177002, + "loss": 5.0847, + "lr": 0.000766013986013986, + "step": 2174, + "tokens_trained": 0.206636992 + }, + { + "epoch": 0.6173049645390071, + "grad_norm": 1.233585238456726, + "loss": 5.1647, + "lr": 0.0007657342657342658, + "step": 2176, + "tokens_trained": 0.206828288 + }, + { + "epoch": 0.617872340425532, + "grad_norm": 1.3282006978988647, + "loss": 5.1748, + "lr": 0.0007654545454545455, + "step": 2178, + "tokens_trained": 0.207019064 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 1.2299532890319824, + "loss": 5.248, + "lr": 0.0007651748251748251, + "step": 2180, + "tokens_trained": 0.20720844 + }, + { + "epoch": 0.6190070921985815, + "grad_norm": 1.279590129852295, + "loss": 5.1467, + "lr": 0.0007648951048951049, + "step": 2182, + "tokens_trained": 0.207398952 + }, + { + "epoch": 0.6195744680851064, + "grad_norm": 1.30775785446167, + "loss": 5.1981, + "lr": 0.0007646153846153846, + "step": 2184, + "tokens_trained": 0.207589224 + }, + { + "epoch": 0.6201418439716312, + "grad_norm": 1.2829056978225708, + "loss": 5.1976, + "lr": 0.0007643356643356644, + "step": 2186, + "tokens_trained": 0.20778024 + }, + { + "epoch": 0.6207092198581561, + "grad_norm": 1.2149474620819092, + "loss": 5.2186, + "lr": 0.000764055944055944, + "step": 2188, + "tokens_trained": 0.207969176 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 1.239912748336792, + "loss": 5.15, + "lr": 0.0007637762237762238, + "step": 2190, + "tokens_trained": 0.208159016 + }, + { + "epoch": 0.6218439716312056, + "grad_norm": 1.322252869606018, + "loss": 5.2447, + "lr": 0.0007634965034965035, + "step": 2192, + "tokens_trained": 0.2083502 + }, + { + "epoch": 0.6224113475177305, + "grad_norm": 1.1804618835449219, + "loss": 5.1924, + "lr": 0.0007632167832167833, + "step": 2194, + "tokens_trained": 0.208539616 + }, + { + "epoch": 0.6229787234042553, + "grad_norm": 1.2914003133773804, + "loss": 5.1559, + "lr": 0.000762937062937063, + "step": 2196, + "tokens_trained": 0.208731032 + }, + { + "epoch": 0.6235460992907801, + "grad_norm": 1.2175878286361694, + "loss": 5.1335, + "lr": 0.0007626573426573426, + "step": 2198, + "tokens_trained": 0.208923952 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 1.2267946004867554, + "loss": 5.1697, + "lr": 0.0007623776223776224, + "step": 2200, + "tokens_trained": 0.20911168 + }, + { + "epoch": 0.6246808510638298, + "grad_norm": 1.2482635974884033, + "loss": 5.1986, + "lr": 0.0007620979020979021, + "step": 2202, + "tokens_trained": 0.209299504 + }, + { + "epoch": 0.6252482269503546, + "grad_norm": 1.3256076574325562, + "loss": 5.1955, + "lr": 0.0007618181818181819, + "step": 2204, + "tokens_trained": 0.20948936 + }, + { + "epoch": 0.6258156028368794, + "grad_norm": 1.205692172050476, + "loss": 5.1175, + "lr": 0.0007615384615384615, + "step": 2206, + "tokens_trained": 0.209678072 + }, + { + "epoch": 0.6263829787234042, + "grad_norm": 1.2371326684951782, + "loss": 5.1798, + "lr": 0.0007612587412587412, + "step": 2208, + "tokens_trained": 0.209868904 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 1.1657975912094116, + "loss": 5.159, + "lr": 0.000760979020979021, + "step": 2210, + "tokens_trained": 0.210060992 + }, + { + "epoch": 0.6275177304964539, + "grad_norm": 1.18202543258667, + "loss": 5.2157, + "lr": 0.0007606993006993007, + "step": 2212, + "tokens_trained": 0.210252096 + }, + { + "epoch": 0.6280851063829788, + "grad_norm": 1.220446228981018, + "loss": 5.1677, + "lr": 0.0007604195804195805, + "step": 2214, + "tokens_trained": 0.210444176 + }, + { + "epoch": 0.6286524822695035, + "grad_norm": 1.1070069074630737, + "loss": 5.1702, + "lr": 0.0007601398601398601, + "step": 2216, + "tokens_trained": 0.210633376 + }, + { + "epoch": 0.6292198581560283, + "grad_norm": 1.3031543493270874, + "loss": 5.2253, + "lr": 0.0007598601398601399, + "step": 2218, + "tokens_trained": 0.21082368 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 1.0999404191970825, + "loss": 5.1942, + "lr": 0.0007595804195804196, + "step": 2220, + "tokens_trained": 0.211013448 + }, + { + "epoch": 0.630354609929078, + "grad_norm": 1.2241060733795166, + "loss": 5.1408, + "lr": 0.0007593006993006993, + "step": 2222, + "tokens_trained": 0.211205176 + }, + { + "epoch": 0.6309219858156029, + "grad_norm": 1.3057242631912231, + "loss": 5.2234, + "lr": 0.000759020979020979, + "step": 2224, + "tokens_trained": 0.211396464 + }, + { + "epoch": 0.6314893617021277, + "grad_norm": 1.2667888402938843, + "loss": 5.1675, + "lr": 0.0007587412587412587, + "step": 2226, + "tokens_trained": 0.211587608 + }, + { + "epoch": 0.6320567375886524, + "grad_norm": 1.1653670072555542, + "loss": 5.2081, + "lr": 0.0007584615384615385, + "step": 2228, + "tokens_trained": 0.211779832 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 1.1786928176879883, + "loss": 5.1772, + "lr": 0.0007581818181818182, + "step": 2230, + "tokens_trained": 0.211971584 + }, + { + "epoch": 0.6331914893617021, + "grad_norm": 1.242872714996338, + "loss": 5.1378, + "lr": 0.000757902097902098, + "step": 2232, + "tokens_trained": 0.212161024 + }, + { + "epoch": 0.633758865248227, + "grad_norm": 1.2831401824951172, + "loss": 5.1488, + "lr": 0.0007576223776223776, + "step": 2234, + "tokens_trained": 0.21235084 + }, + { + "epoch": 0.6343262411347518, + "grad_norm": 1.269600510597229, + "loss": 5.1454, + "lr": 0.0007573426573426573, + "step": 2236, + "tokens_trained": 0.212539504 + }, + { + "epoch": 0.6348936170212766, + "grad_norm": 1.2224805355072021, + "loss": 5.1123, + "lr": 0.0007570629370629371, + "step": 2238, + "tokens_trained": 0.21272884 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 1.2404342889785767, + "loss": 5.2023, + "lr": 0.0007567832167832168, + "step": 2240, + "tokens_trained": 0.212920128 + }, + { + "epoch": 0.6360283687943262, + "grad_norm": 1.1551696062088013, + "loss": 5.1529, + "lr": 0.0007565034965034965, + "step": 2242, + "tokens_trained": 0.213110744 + }, + { + "epoch": 0.6365957446808511, + "grad_norm": 1.2342238426208496, + "loss": 5.182, + "lr": 0.0007562237762237762, + "step": 2244, + "tokens_trained": 0.213298584 + }, + { + "epoch": 0.6371631205673759, + "grad_norm": 1.2631146907806396, + "loss": 5.1442, + "lr": 0.000755944055944056, + "step": 2246, + "tokens_trained": 0.213488512 + }, + { + "epoch": 0.6377304964539007, + "grad_norm": 1.2031443119049072, + "loss": 5.1041, + "lr": 0.0007556643356643357, + "step": 2248, + "tokens_trained": 0.21367964 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.127889633178711, + "loss": 5.1889, + "lr": 0.0007553846153846154, + "step": 2250, + "tokens_trained": 0.213871584 + }, + { + "epoch": 0.6382978723404256, + "eval_loss": 5.1714253425598145, + "eval_runtime": 20.5005, + "step": 2250, + "tokens_trained": 0.213871584 + }, + { + "epoch": 0.6388652482269503, + "grad_norm": 1.1281750202178955, + "loss": 5.1039, + "lr": 0.0007551048951048951, + "step": 2252, + "tokens_trained": 0.214061624 + }, + { + "epoch": 0.6394326241134751, + "grad_norm": 1.1058608293533325, + "loss": 5.1562, + "lr": 0.0007548251748251748, + "step": 2254, + "tokens_trained": 0.214252024 + }, + { + "epoch": 0.64, + "grad_norm": 1.0579496622085571, + "loss": 5.1476, + "lr": 0.0007545454545454546, + "step": 2256, + "tokens_trained": 0.214442624 + }, + { + "epoch": 0.6405673758865248, + "grad_norm": 1.1370742321014404, + "loss": 5.1948, + "lr": 0.0007542657342657343, + "step": 2258, + "tokens_trained": 0.214634016 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 1.1118457317352295, + "loss": 5.169, + "lr": 0.000753986013986014, + "step": 2260, + "tokens_trained": 0.214823368 + }, + { + "epoch": 0.6417021276595745, + "grad_norm": 1.039004921913147, + "loss": 5.1454, + "lr": 0.0007537062937062937, + "step": 2262, + "tokens_trained": 0.21501196 + }, + { + "epoch": 0.6422695035460992, + "grad_norm": 1.2534265518188477, + "loss": 5.1455, + "lr": 0.0007534265734265734, + "step": 2264, + "tokens_trained": 0.215200808 + }, + { + "epoch": 0.6428368794326241, + "grad_norm": 1.2437689304351807, + "loss": 5.1966, + "lr": 0.0007531468531468532, + "step": 2266, + "tokens_trained": 0.21539036 + }, + { + "epoch": 0.6434042553191489, + "grad_norm": 1.1795995235443115, + "loss": 5.1716, + "lr": 0.0007528671328671329, + "step": 2268, + "tokens_trained": 0.215582088 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 1.3241360187530518, + "loss": 5.1638, + "lr": 0.0007525874125874126, + "step": 2270, + "tokens_trained": 0.215771936 + }, + { + "epoch": 0.6445390070921986, + "grad_norm": 1.2526317834854126, + "loss": 5.1067, + "lr": 0.0007523076923076923, + "step": 2272, + "tokens_trained": 0.215960792 + }, + { + "epoch": 0.6451063829787234, + "grad_norm": 1.249042272567749, + "loss": 5.1466, + "lr": 0.0007520279720279721, + "step": 2274, + "tokens_trained": 0.216151448 + }, + { + "epoch": 0.6456737588652482, + "grad_norm": 1.1926413774490356, + "loss": 5.1886, + "lr": 0.0007517482517482518, + "step": 2276, + "tokens_trained": 0.216340368 + }, + { + "epoch": 0.646241134751773, + "grad_norm": 1.1615192890167236, + "loss": 5.1538, + "lr": 0.0007514685314685314, + "step": 2278, + "tokens_trained": 0.216531264 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 1.1265521049499512, + "loss": 5.1518, + "lr": 0.0007511888111888112, + "step": 2280, + "tokens_trained": 0.216722024 + }, + { + "epoch": 0.6473758865248227, + "grad_norm": 1.0598393678665161, + "loss": 5.1776, + "lr": 0.0007509090909090909, + "step": 2282, + "tokens_trained": 0.216913232 + }, + { + "epoch": 0.6479432624113475, + "grad_norm": 1.1727370023727417, + "loss": 5.2083, + "lr": 0.0007506293706293707, + "step": 2284, + "tokens_trained": 0.217103136 + }, + { + "epoch": 0.6485106382978724, + "grad_norm": 1.1411634683609009, + "loss": 5.182, + "lr": 0.0007503496503496504, + "step": 2286, + "tokens_trained": 0.21729368 + }, + { + "epoch": 0.6490780141843971, + "grad_norm": 1.2293574810028076, + "loss": 5.1725, + "lr": 0.00075006993006993, + "step": 2288, + "tokens_trained": 0.217485624 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 1.3079198598861694, + "loss": 5.1531, + "lr": 0.0007497902097902098, + "step": 2290, + "tokens_trained": 0.217675192 + }, + { + "epoch": 0.6502127659574468, + "grad_norm": 1.1579710245132446, + "loss": 5.1162, + "lr": 0.0007495104895104895, + "step": 2292, + "tokens_trained": 0.2178658 + }, + { + "epoch": 0.6507801418439716, + "grad_norm": 1.1968539953231812, + "loss": 5.1652, + "lr": 0.0007492307692307693, + "step": 2294, + "tokens_trained": 0.218057984 + }, + { + "epoch": 0.6513475177304965, + "grad_norm": 1.3666965961456299, + "loss": 5.2035, + "lr": 0.0007489510489510489, + "step": 2296, + "tokens_trained": 0.218249704 + }, + { + "epoch": 0.6519148936170213, + "grad_norm": 1.3615487813949585, + "loss": 5.1704, + "lr": 0.0007486713286713287, + "step": 2298, + "tokens_trained": 0.218441792 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 1.2289810180664062, + "loss": 5.1683, + "lr": 0.0007483916083916084, + "step": 2300, + "tokens_trained": 0.218630624 + }, + { + "epoch": 0.6530496453900709, + "grad_norm": 1.1299561262130737, + "loss": 5.1672, + "lr": 0.0007481118881118882, + "step": 2302, + "tokens_trained": 0.218819928 + }, + { + "epoch": 0.6536170212765957, + "grad_norm": 1.186132550239563, + "loss": 5.1456, + "lr": 0.0007478321678321679, + "step": 2304, + "tokens_trained": 0.219008792 + }, + { + "epoch": 0.6541843971631206, + "grad_norm": 1.2106919288635254, + "loss": 5.1998, + "lr": 0.0007475524475524475, + "step": 2306, + "tokens_trained": 0.219198584 + }, + { + "epoch": 0.6547517730496454, + "grad_norm": 1.2485368251800537, + "loss": 5.1473, + "lr": 0.0007472727272727273, + "step": 2308, + "tokens_trained": 0.219386768 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 1.1855547428131104, + "loss": 5.1721, + "lr": 0.000746993006993007, + "step": 2310, + "tokens_trained": 0.219575904 + }, + { + "epoch": 0.655886524822695, + "grad_norm": 1.3077043294906616, + "loss": 5.1444, + "lr": 0.0007467132867132868, + "step": 2312, + "tokens_trained": 0.219767712 + }, + { + "epoch": 0.6564539007092198, + "grad_norm": 1.3514399528503418, + "loss": 5.198, + "lr": 0.0007464335664335664, + "step": 2314, + "tokens_trained": 0.219959384 + }, + { + "epoch": 0.6570212765957447, + "grad_norm": 1.0906041860580444, + "loss": 5.115, + "lr": 0.0007461538461538462, + "step": 2316, + "tokens_trained": 0.2201464 + }, + { + "epoch": 0.6575886524822695, + "grad_norm": 1.154425859451294, + "loss": 5.1186, + "lr": 0.0007458741258741259, + "step": 2318, + "tokens_trained": 0.220336992 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 1.1141375303268433, + "loss": 5.1709, + "lr": 0.0007455944055944056, + "step": 2320, + "tokens_trained": 0.220525928 + }, + { + "epoch": 0.6587234042553192, + "grad_norm": 1.0958452224731445, + "loss": 5.1641, + "lr": 0.0007453146853146854, + "step": 2322, + "tokens_trained": 0.220715056 + }, + { + "epoch": 0.659290780141844, + "grad_norm": 1.168017029762268, + "loss": 5.1666, + "lr": 0.000745034965034965, + "step": 2324, + "tokens_trained": 0.220905264 + }, + { + "epoch": 0.6598581560283688, + "grad_norm": 1.044488549232483, + "loss": 5.2079, + "lr": 0.0007447552447552448, + "step": 2326, + "tokens_trained": 0.221096736 + }, + { + "epoch": 0.6604255319148936, + "grad_norm": 1.2333874702453613, + "loss": 5.1166, + "lr": 0.0007444755244755245, + "step": 2328, + "tokens_trained": 0.221287184 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 1.1800497770309448, + "loss": 5.1561, + "lr": 0.0007441958041958043, + "step": 2330, + "tokens_trained": 0.221477312 + }, + { + "epoch": 0.6615602836879433, + "grad_norm": 1.118755578994751, + "loss": 5.1513, + "lr": 0.0007439160839160839, + "step": 2332, + "tokens_trained": 0.221665208 + }, + { + "epoch": 0.6621276595744681, + "grad_norm": 1.2018475532531738, + "loss": 5.1007, + "lr": 0.0007436363636363636, + "step": 2334, + "tokens_trained": 0.221855608 + }, + { + "epoch": 0.662695035460993, + "grad_norm": 1.1832036972045898, + "loss": 5.0944, + "lr": 0.0007433566433566433, + "step": 2336, + "tokens_trained": 0.222043856 + }, + { + "epoch": 0.6632624113475177, + "grad_norm": 1.3179196119308472, + "loss": 5.1645, + "lr": 0.0007430769230769231, + "step": 2338, + "tokens_trained": 0.222235728 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 1.1313154697418213, + "loss": 5.1733, + "lr": 0.0007427972027972029, + "step": 2340, + "tokens_trained": 0.222424688 + }, + { + "epoch": 0.6643971631205674, + "grad_norm": 1.2135043144226074, + "loss": 5.1291, + "lr": 0.0007425174825174825, + "step": 2342, + "tokens_trained": 0.222611952 + }, + { + "epoch": 0.6649645390070922, + "grad_norm": 1.2418344020843506, + "loss": 5.178, + "lr": 0.0007422377622377622, + "step": 2344, + "tokens_trained": 0.222803264 + }, + { + "epoch": 0.6655319148936171, + "grad_norm": 1.2896099090576172, + "loss": 5.1772, + "lr": 0.000741958041958042, + "step": 2346, + "tokens_trained": 0.22299108 + }, + { + "epoch": 0.6660992907801419, + "grad_norm": 1.150012731552124, + "loss": 5.1334, + "lr": 0.0007416783216783217, + "step": 2348, + "tokens_trained": 0.223182336 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.307721495628357, + "loss": 5.0898, + "lr": 0.0007413986013986014, + "step": 2350, + "tokens_trained": 0.223371664 + }, + { + "epoch": 0.6672340425531915, + "grad_norm": 1.2633092403411865, + "loss": 5.1344, + "lr": 0.0007411188811188811, + "step": 2352, + "tokens_trained": 0.223561984 + }, + { + "epoch": 0.6678014184397163, + "grad_norm": 1.1801539659500122, + "loss": 5.1242, + "lr": 0.0007408391608391608, + "step": 2354, + "tokens_trained": 0.223750344 + }, + { + "epoch": 0.6683687943262412, + "grad_norm": 1.1279330253601074, + "loss": 5.1348, + "lr": 0.0007405594405594406, + "step": 2356, + "tokens_trained": 0.223941528 + }, + { + "epoch": 0.668936170212766, + "grad_norm": 1.193912148475647, + "loss": 5.1823, + "lr": 0.0007402797202797204, + "step": 2358, + "tokens_trained": 0.224132064 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 1.1424062252044678, + "loss": 5.1452, + "lr": 0.00074, + "step": 2360, + "tokens_trained": 0.2243216 + }, + { + "epoch": 0.6700709219858156, + "grad_norm": 1.1543093919754028, + "loss": 5.1199, + "lr": 0.0007397202797202797, + "step": 2362, + "tokens_trained": 0.224509992 + }, + { + "epoch": 0.6706382978723404, + "grad_norm": 1.2291040420532227, + "loss": 5.0824, + "lr": 0.0007394405594405595, + "step": 2364, + "tokens_trained": 0.22470124 + }, + { + "epoch": 0.6712056737588652, + "grad_norm": 1.1839559078216553, + "loss": 5.1486, + "lr": 0.0007391608391608392, + "step": 2366, + "tokens_trained": 0.224893488 + }, + { + "epoch": 0.6717730496453901, + "grad_norm": 1.1374263763427734, + "loss": 5.1482, + "lr": 0.0007388811188811189, + "step": 2368, + "tokens_trained": 0.225083304 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 1.2041044235229492, + "loss": 5.1055, + "lr": 0.0007386013986013986, + "step": 2370, + "tokens_trained": 0.225273256 + }, + { + "epoch": 0.6729078014184398, + "grad_norm": 1.1405609846115112, + "loss": 5.1647, + "lr": 0.0007383216783216782, + "step": 2372, + "tokens_trained": 0.225461976 + }, + { + "epoch": 0.6734751773049645, + "grad_norm": 1.112979531288147, + "loss": 5.1232, + "lr": 0.0007380419580419581, + "step": 2374, + "tokens_trained": 0.225651248 + }, + { + "epoch": 0.6737588652482269, + "eval_loss": 5.160866737365723, + "eval_runtime": 20.3049, + "step": 2375, + "tokens_trained": 0.22574612 + }, + { + "epoch": 0.6740425531914893, + "grad_norm": 1.2868081331253052, + "loss": 5.1802, + "lr": 0.0007377622377622378, + "step": 2376, + "tokens_trained": 0.225840616 + }, + { + "epoch": 0.6746099290780142, + "grad_norm": 1.0904244184494019, + "loss": 5.1093, + "lr": 0.0007374825174825175, + "step": 2378, + "tokens_trained": 0.22602952 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 1.182820200920105, + "loss": 5.1425, + "lr": 0.0007372027972027972, + "step": 2380, + "tokens_trained": 0.226219912 + }, + { + "epoch": 0.6757446808510639, + "grad_norm": 1.29615318775177, + "loss": 5.2044, + "lr": 0.000736923076923077, + "step": 2382, + "tokens_trained": 0.226409832 + }, + { + "epoch": 0.6763120567375887, + "grad_norm": 1.2440109252929688, + "loss": 5.1722, + "lr": 0.0007366433566433567, + "step": 2384, + "tokens_trained": 0.226600912 + }, + { + "epoch": 0.6768794326241134, + "grad_norm": 1.2176823616027832, + "loss": 5.1237, + "lr": 0.0007363636363636363, + "step": 2386, + "tokens_trained": 0.226788136 + }, + { + "epoch": 0.6774468085106383, + "grad_norm": 1.1725387573242188, + "loss": 5.1334, + "lr": 0.0007360839160839161, + "step": 2388, + "tokens_trained": 0.22697924 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 1.0678813457489014, + "loss": 5.1306, + "lr": 0.0007358041958041957, + "step": 2390, + "tokens_trained": 0.227169576 + }, + { + "epoch": 0.678581560283688, + "grad_norm": 1.1266731023788452, + "loss": 5.1956, + "lr": 0.0007355244755244756, + "step": 2392, + "tokens_trained": 0.227361776 + }, + { + "epoch": 0.6791489361702128, + "grad_norm": 1.2048848867416382, + "loss": 5.1599, + "lr": 0.0007352447552447553, + "step": 2394, + "tokens_trained": 0.227551768 + }, + { + "epoch": 0.6797163120567375, + "grad_norm": 1.2414182424545288, + "loss": 5.1836, + "lr": 0.000734965034965035, + "step": 2396, + "tokens_trained": 0.227743072 + }, + { + "epoch": 0.6802836879432624, + "grad_norm": 1.1587010622024536, + "loss": 5.1589, + "lr": 0.0007346853146853147, + "step": 2398, + "tokens_trained": 0.227933848 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.1487596035003662, + "loss": 5.1494, + "lr": 0.0007344055944055944, + "step": 2400, + "tokens_trained": 0.228122304 + }, + { + "epoch": 0.6814184397163121, + "grad_norm": 1.1008368730545044, + "loss": 5.1614, + "lr": 0.0007341258741258742, + "step": 2402, + "tokens_trained": 0.228311624 + }, + { + "epoch": 0.6819858156028369, + "grad_norm": 1.0571539402008057, + "loss": 5.1373, + "lr": 0.0007338461538461538, + "step": 2404, + "tokens_trained": 0.228501208 + }, + { + "epoch": 0.6825531914893617, + "grad_norm": 1.1685987710952759, + "loss": 5.1439, + "lr": 0.0007335664335664336, + "step": 2406, + "tokens_trained": 0.228691272 + }, + { + "epoch": 0.6831205673758866, + "grad_norm": 1.2319012880325317, + "loss": 5.1949, + "lr": 0.0007332867132867132, + "step": 2408, + "tokens_trained": 0.228881608 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 1.1806107759475708, + "loss": 5.1467, + "lr": 0.0007330069930069931, + "step": 2410, + "tokens_trained": 0.229073152 + }, + { + "epoch": 0.6842553191489362, + "grad_norm": 1.1616697311401367, + "loss": 5.1553, + "lr": 0.0007327272727272728, + "step": 2412, + "tokens_trained": 0.229263656 + }, + { + "epoch": 0.684822695035461, + "grad_norm": 1.143112063407898, + "loss": 5.091, + "lr": 0.0007324475524475524, + "step": 2414, + "tokens_trained": 0.229454224 + }, + { + "epoch": 0.6853900709219858, + "grad_norm": 1.2467398643493652, + "loss": 5.1778, + "lr": 0.0007321678321678322, + "step": 2416, + "tokens_trained": 0.22964568 + }, + { + "epoch": 0.6859574468085107, + "grad_norm": 1.1989973783493042, + "loss": 5.146, + "lr": 0.0007318881118881119, + "step": 2418, + "tokens_trained": 0.229836448 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 1.3296927213668823, + "loss": 5.1446, + "lr": 0.0007316083916083917, + "step": 2420, + "tokens_trained": 0.230027424 + }, + { + "epoch": 0.6870921985815602, + "grad_norm": 1.256990671157837, + "loss": 5.1396, + "lr": 0.0007313286713286713, + "step": 2422, + "tokens_trained": 0.23022012 + }, + { + "epoch": 0.6876595744680851, + "grad_norm": 1.1474595069885254, + "loss": 5.1263, + "lr": 0.0007310489510489511, + "step": 2424, + "tokens_trained": 0.230410232 + }, + { + "epoch": 0.6882269503546099, + "grad_norm": 1.2070049047470093, + "loss": 5.1169, + "lr": 0.0007307692307692307, + "step": 2426, + "tokens_trained": 0.230601056 + }, + { + "epoch": 0.6887943262411348, + "grad_norm": 1.2047003507614136, + "loss": 5.1146, + "lr": 0.0007304895104895105, + "step": 2428, + "tokens_trained": 0.230791056 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 1.3246855735778809, + "loss": 5.1864, + "lr": 0.0007302097902097902, + "step": 2430, + "tokens_trained": 0.230981904 + }, + { + "epoch": 0.6899290780141843, + "grad_norm": 1.2012712955474854, + "loss": 5.168, + "lr": 0.0007299300699300699, + "step": 2432, + "tokens_trained": 0.231170976 + }, + { + "epoch": 0.6904964539007092, + "grad_norm": 1.2258418798446655, + "loss": 5.14, + "lr": 0.0007296503496503497, + "step": 2434, + "tokens_trained": 0.231362024 + }, + { + "epoch": 0.691063829787234, + "grad_norm": 1.2767595052719116, + "loss": 5.1775, + "lr": 0.0007293706293706294, + "step": 2436, + "tokens_trained": 0.23155 + }, + { + "epoch": 0.6916312056737589, + "grad_norm": 1.204324722290039, + "loss": 5.1357, + "lr": 0.0007290909090909092, + "step": 2438, + "tokens_trained": 0.231739944 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 1.1876553297042847, + "loss": 5.1185, + "lr": 0.0007288111888111888, + "step": 2440, + "tokens_trained": 0.231930448 + }, + { + "epoch": 0.6927659574468085, + "grad_norm": 1.2512568235397339, + "loss": 5.1212, + "lr": 0.0007285314685314685, + "step": 2442, + "tokens_trained": 0.23212152 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 1.2961020469665527, + "loss": 5.0622, + "lr": 0.0007282517482517482, + "step": 2444, + "tokens_trained": 0.232310856 + }, + { + "epoch": 0.6939007092198581, + "grad_norm": 1.1042410135269165, + "loss": 5.1317, + "lr": 0.000727972027972028, + "step": 2446, + "tokens_trained": 0.232499144 + }, + { + "epoch": 0.694468085106383, + "grad_norm": 1.0408610105514526, + "loss": 5.1562, + "lr": 0.0007276923076923077, + "step": 2448, + "tokens_trained": 0.232689864 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 1.1109600067138672, + "loss": 5.1463, + "lr": 0.0007274125874125874, + "step": 2450, + "tokens_trained": 0.232878912 + }, + { + "epoch": 0.6956028368794326, + "grad_norm": 1.0867618322372437, + "loss": 5.105, + "lr": 0.0007271328671328672, + "step": 2452, + "tokens_trained": 0.233069416 + }, + { + "epoch": 0.6961702127659575, + "grad_norm": 1.0342003107070923, + "loss": 5.1431, + "lr": 0.0007268531468531469, + "step": 2454, + "tokens_trained": 0.233258552 + }, + { + "epoch": 0.6967375886524823, + "grad_norm": 1.2264306545257568, + "loss": 5.1646, + "lr": 0.0007265734265734266, + "step": 2456, + "tokens_trained": 0.233448464 + }, + { + "epoch": 0.6973049645390071, + "grad_norm": 1.1715648174285889, + "loss": 5.1194, + "lr": 0.0007262937062937063, + "step": 2458, + "tokens_trained": 0.23364024 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 1.05716872215271, + "loss": 5.09, + "lr": 0.000726013986013986, + "step": 2460, + "tokens_trained": 0.233829848 + }, + { + "epoch": 0.6984397163120567, + "grad_norm": 1.1329678297042847, + "loss": 5.1303, + "lr": 0.0007257342657342657, + "step": 2462, + "tokens_trained": 0.234021368 + }, + { + "epoch": 0.6990070921985816, + "grad_norm": 1.2084178924560547, + "loss": 5.1393, + "lr": 0.0007254545454545455, + "step": 2464, + "tokens_trained": 0.234210264 + }, + { + "epoch": 0.6995744680851064, + "grad_norm": 1.0744361877441406, + "loss": 5.1067, + "lr": 0.0007251748251748252, + "step": 2466, + "tokens_trained": 0.234399616 + }, + { + "epoch": 0.7001418439716312, + "grad_norm": 1.1711128950119019, + "loss": 5.1226, + "lr": 0.0007248951048951049, + "step": 2468, + "tokens_trained": 0.234589936 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 1.2188383340835571, + "loss": 5.1139, + "lr": 0.0007246153846153846, + "step": 2470, + "tokens_trained": 0.234781376 + }, + { + "epoch": 0.7012765957446808, + "grad_norm": 1.1662676334381104, + "loss": 5.137, + "lr": 0.0007243356643356644, + "step": 2472, + "tokens_trained": 0.234972192 + }, + { + "epoch": 0.7018439716312057, + "grad_norm": 1.18717622756958, + "loss": 5.1665, + "lr": 0.0007240559440559441, + "step": 2474, + "tokens_trained": 0.235162472 + }, + { + "epoch": 0.7024113475177305, + "grad_norm": 1.1546517610549927, + "loss": 5.1503, + "lr": 0.0007237762237762238, + "step": 2476, + "tokens_trained": 0.23535256 + }, + { + "epoch": 0.7029787234042553, + "grad_norm": 1.0647573471069336, + "loss": 5.155, + "lr": 0.0007234965034965035, + "step": 2478, + "tokens_trained": 0.235543424 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 1.1157219409942627, + "loss": 5.1561, + "lr": 0.0007232167832167831, + "step": 2480, + "tokens_trained": 0.23573568 + }, + { + "epoch": 0.7041134751773049, + "grad_norm": 1.1972934007644653, + "loss": 5.1271, + "lr": 0.000722937062937063, + "step": 2482, + "tokens_trained": 0.235927072 + }, + { + "epoch": 0.7046808510638298, + "grad_norm": 1.0370620489120483, + "loss": 5.1016, + "lr": 0.0007226573426573426, + "step": 2484, + "tokens_trained": 0.236116528 + }, + { + "epoch": 0.7052482269503546, + "grad_norm": 1.1389620304107666, + "loss": 5.1422, + "lr": 0.0007223776223776224, + "step": 2486, + "tokens_trained": 0.236305864 + }, + { + "epoch": 0.7058156028368794, + "grad_norm": 1.1045559644699097, + "loss": 5.1434, + "lr": 0.0007220979020979021, + "step": 2488, + "tokens_trained": 0.236494224 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 1.1014395952224731, + "loss": 5.1462, + "lr": 0.0007218181818181819, + "step": 2490, + "tokens_trained": 0.236684376 + }, + { + "epoch": 0.706950354609929, + "grad_norm": 1.0460759401321411, + "loss": 5.126, + "lr": 0.0007215384615384616, + "step": 2492, + "tokens_trained": 0.236875272 + }, + { + "epoch": 0.707517730496454, + "grad_norm": 1.0848767757415771, + "loss": 5.1387, + "lr": 0.0007212587412587412, + "step": 2494, + "tokens_trained": 0.237065552 + }, + { + "epoch": 0.7080851063829787, + "grad_norm": 1.1626802682876587, + "loss": 5.1509, + "lr": 0.000720979020979021, + "step": 2496, + "tokens_trained": 0.237254944 + }, + { + "epoch": 0.7086524822695035, + "grad_norm": 1.1846860647201538, + "loss": 5.098, + "lr": 0.0007206993006993006, + "step": 2498, + "tokens_trained": 0.237444488 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 1.2549248933792114, + "loss": 5.1104, + "lr": 0.0007204195804195805, + "step": 2500, + "tokens_trained": 0.237633528 + }, + { + "epoch": 0.7092198581560284, + "eval_loss": 5.141824245452881, + "eval_runtime": 20.5081, + "step": 2500, + "tokens_trained": 0.237633528 + }, + { + "epoch": 0.7097872340425532, + "grad_norm": 1.19071626663208, + "loss": 5.2249, + "lr": 0.0007201398601398601, + "step": 2502, + "tokens_trained": 0.237823136 + }, + { + "epoch": 0.7103546099290781, + "grad_norm": 1.162804365158081, + "loss": 5.1099, + "lr": 0.0007198601398601399, + "step": 2504, + "tokens_trained": 0.238012752 + }, + { + "epoch": 0.7109219858156028, + "grad_norm": 1.0964027643203735, + "loss": 5.1015, + "lr": 0.0007195804195804196, + "step": 2506, + "tokens_trained": 0.238205472 + }, + { + "epoch": 0.7114893617021276, + "grad_norm": 1.0719815492630005, + "loss": 5.1425, + "lr": 0.0007193006993006994, + "step": 2508, + "tokens_trained": 0.238394848 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 1.1835323572158813, + "loss": 5.0744, + "lr": 0.0007190209790209791, + "step": 2510, + "tokens_trained": 0.238583408 + }, + { + "epoch": 0.7126241134751773, + "grad_norm": 1.0975273847579956, + "loss": 5.0346, + "lr": 0.0007187412587412587, + "step": 2512, + "tokens_trained": 0.238773544 + }, + { + "epoch": 0.7131914893617022, + "grad_norm": 1.1507470607757568, + "loss": 5.146, + "lr": 0.0007184615384615385, + "step": 2514, + "tokens_trained": 0.238962624 + }, + { + "epoch": 0.713758865248227, + "grad_norm": 1.1186292171478271, + "loss": 5.1934, + "lr": 0.0007181818181818181, + "step": 2516, + "tokens_trained": 0.239152848 + }, + { + "epoch": 0.7143262411347517, + "grad_norm": 1.0672920942306519, + "loss": 5.1488, + "lr": 0.000717902097902098, + "step": 2518, + "tokens_trained": 0.239344248 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 1.1226296424865723, + "loss": 5.0799, + "lr": 0.0007176223776223776, + "step": 2520, + "tokens_trained": 0.239535088 + }, + { + "epoch": 0.7154609929078014, + "grad_norm": 1.134265422821045, + "loss": 5.1677, + "lr": 0.0007173426573426573, + "step": 2522, + "tokens_trained": 0.23972356 + }, + { + "epoch": 0.7160283687943262, + "grad_norm": 1.1157846450805664, + "loss": 5.1576, + "lr": 0.0007170629370629371, + "step": 2524, + "tokens_trained": 0.239914104 + }, + { + "epoch": 0.7165957446808511, + "grad_norm": 1.096637487411499, + "loss": 5.1512, + "lr": 0.0007167832167832168, + "step": 2526, + "tokens_trained": 0.24010344 + }, + { + "epoch": 0.7171631205673759, + "grad_norm": 1.0092846155166626, + "loss": 5.0907, + "lr": 0.0007165034965034966, + "step": 2528, + "tokens_trained": 0.240294496 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 0.9926803112030029, + "loss": 5.112, + "lr": 0.0007162237762237762, + "step": 2530, + "tokens_trained": 0.240484752 + }, + { + "epoch": 0.7182978723404255, + "grad_norm": 1.031894326210022, + "loss": 5.13, + "lr": 0.000715944055944056, + "step": 2532, + "tokens_trained": 0.240674024 + }, + { + "epoch": 0.7188652482269503, + "grad_norm": 1.0606821775436401, + "loss": 5.1229, + "lr": 0.0007156643356643356, + "step": 2534, + "tokens_trained": 0.24086436 + }, + { + "epoch": 0.7194326241134752, + "grad_norm": 1.0486221313476562, + "loss": 5.1179, + "lr": 0.0007153846153846155, + "step": 2536, + "tokens_trained": 0.241052096 + }, + { + "epoch": 0.72, + "grad_norm": 1.073940396308899, + "loss": 5.1147, + "lr": 0.0007151048951048951, + "step": 2538, + "tokens_trained": 0.241242064 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 1.0888422727584839, + "loss": 5.1442, + "lr": 0.0007148251748251748, + "step": 2540, + "tokens_trained": 0.241429472 + }, + { + "epoch": 0.7211347517730496, + "grad_norm": 1.0362575054168701, + "loss": 5.1482, + "lr": 0.0007145454545454546, + "step": 2542, + "tokens_trained": 0.241619464 + }, + { + "epoch": 0.7217021276595744, + "grad_norm": 1.020987629890442, + "loss": 5.1809, + "lr": 0.0007142657342657343, + "step": 2544, + "tokens_trained": 0.241810584 + }, + { + "epoch": 0.7222695035460993, + "grad_norm": 1.1145941019058228, + "loss": 5.07, + "lr": 0.0007139860139860141, + "step": 2546, + "tokens_trained": 0.242001336 + }, + { + "epoch": 0.7228368794326241, + "grad_norm": 1.114311933517456, + "loss": 5.1288, + "lr": 0.0007137062937062937, + "step": 2548, + "tokens_trained": 0.242191648 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 1.2127752304077148, + "loss": 5.1414, + "lr": 0.0007134265734265734, + "step": 2550, + "tokens_trained": 0.2423814 + }, + { + "epoch": 0.7239716312056738, + "grad_norm": 1.2173429727554321, + "loss": 5.0843, + "lr": 0.0007131468531468531, + "step": 2552, + "tokens_trained": 0.242571344 + }, + { + "epoch": 0.7245390070921985, + "grad_norm": 1.269544005393982, + "loss": 5.0945, + "lr": 0.0007128671328671329, + "step": 2554, + "tokens_trained": 0.242760304 + }, + { + "epoch": 0.7251063829787234, + "grad_norm": 1.1891573667526245, + "loss": 5.1301, + "lr": 0.0007125874125874126, + "step": 2556, + "tokens_trained": 0.242950432 + }, + { + "epoch": 0.7256737588652482, + "grad_norm": 1.1826258897781372, + "loss": 5.1463, + "lr": 0.0007123076923076923, + "step": 2558, + "tokens_trained": 0.243140944 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 1.0478367805480957, + "loss": 5.1082, + "lr": 0.0007120279720279721, + "step": 2560, + "tokens_trained": 0.243331192 + }, + { + "epoch": 0.7268085106382979, + "grad_norm": 1.05866539478302, + "loss": 5.135, + "lr": 0.0007117482517482518, + "step": 2562, + "tokens_trained": 0.243519712 + }, + { + "epoch": 0.7273758865248227, + "grad_norm": 1.1300735473632812, + "loss": 5.0985, + "lr": 0.0007114685314685315, + "step": 2564, + "tokens_trained": 0.243710408 + }, + { + "epoch": 0.7279432624113475, + "grad_norm": 1.0662705898284912, + "loss": 5.1482, + "lr": 0.0007111888111888112, + "step": 2566, + "tokens_trained": 0.243899576 + }, + { + "epoch": 0.7285106382978723, + "grad_norm": 1.0905804634094238, + "loss": 5.103, + "lr": 0.0007109090909090909, + "step": 2568, + "tokens_trained": 0.244090984 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 1.2062023878097534, + "loss": 5.1318, + "lr": 0.0007106293706293706, + "step": 2570, + "tokens_trained": 0.244280584 + }, + { + "epoch": 0.729645390070922, + "grad_norm": 1.0444546937942505, + "loss": 5.144, + "lr": 0.0007103496503496504, + "step": 2572, + "tokens_trained": 0.244471384 + }, + { + "epoch": 0.7302127659574468, + "grad_norm": 1.0395665168762207, + "loss": 5.0944, + "lr": 0.0007100699300699301, + "step": 2574, + "tokens_trained": 0.24466056 + }, + { + "epoch": 0.7307801418439717, + "grad_norm": 1.0630977153778076, + "loss": 5.1038, + "lr": 0.0007097902097902098, + "step": 2576, + "tokens_trained": 0.2448524 + }, + { + "epoch": 0.7313475177304964, + "grad_norm": 1.1561299562454224, + "loss": 5.1544, + "lr": 0.0007095104895104895, + "step": 2578, + "tokens_trained": 0.245042104 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 1.1774277687072754, + "loss": 5.1366, + "lr": 0.0007092307692307692, + "step": 2580, + "tokens_trained": 0.245231832 + }, + { + "epoch": 0.7324822695035461, + "grad_norm": 1.2139825820922852, + "loss": 5.1195, + "lr": 0.000708951048951049, + "step": 2582, + "tokens_trained": 0.24542076 + }, + { + "epoch": 0.7330496453900709, + "grad_norm": 1.1340903043746948, + "loss": 5.1476, + "lr": 0.0007086713286713287, + "step": 2584, + "tokens_trained": 0.245613128 + }, + { + "epoch": 0.7336170212765958, + "grad_norm": 1.2109994888305664, + "loss": 5.1359, + "lr": 0.0007083916083916084, + "step": 2586, + "tokens_trained": 0.245803992 + }, + { + "epoch": 0.7341843971631206, + "grad_norm": 1.1087621450424194, + "loss": 5.1287, + "lr": 0.000708111888111888, + "step": 2588, + "tokens_trained": 0.245994816 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 1.206106424331665, + "loss": 5.1618, + "lr": 0.0007078321678321679, + "step": 2590, + "tokens_trained": 0.246183624 + }, + { + "epoch": 0.7353191489361702, + "grad_norm": 1.0370070934295654, + "loss": 5.1103, + "lr": 0.0007075524475524475, + "step": 2592, + "tokens_trained": 0.246375232 + }, + { + "epoch": 0.735886524822695, + "grad_norm": 0.9844968914985657, + "loss": 5.1266, + "lr": 0.0007072727272727273, + "step": 2594, + "tokens_trained": 0.246565048 + }, + { + "epoch": 0.7364539007092199, + "grad_norm": 1.0623670816421509, + "loss": 5.1341, + "lr": 0.000706993006993007, + "step": 2596, + "tokens_trained": 0.246754136 + }, + { + "epoch": 0.7370212765957447, + "grad_norm": 1.1878798007965088, + "loss": 5.1178, + "lr": 0.0007067132867132867, + "step": 2598, + "tokens_trained": 0.246944496 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 1.045849323272705, + "loss": 5.1151, + "lr": 0.0007064335664335665, + "step": 2600, + "tokens_trained": 0.247135616 + }, + { + "epoch": 0.7381560283687943, + "grad_norm": 1.1081782579421997, + "loss": 5.0699, + "lr": 0.0007061538461538462, + "step": 2602, + "tokens_trained": 0.247326864 + }, + { + "epoch": 0.7387234042553191, + "grad_norm": 1.0893741846084595, + "loss": 5.0967, + "lr": 0.0007058741258741259, + "step": 2604, + "tokens_trained": 0.247515736 + }, + { + "epoch": 0.739290780141844, + "grad_norm": 1.128481149673462, + "loss": 5.1136, + "lr": 0.0007055944055944055, + "step": 2606, + "tokens_trained": 0.24770688 + }, + { + "epoch": 0.7398581560283688, + "grad_norm": 1.0735145807266235, + "loss": 5.1127, + "lr": 0.0007053146853146854, + "step": 2608, + "tokens_trained": 0.247897584 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 1.0027481317520142, + "loss": 5.1157, + "lr": 0.000705034965034965, + "step": 2610, + "tokens_trained": 0.248088352 + }, + { + "epoch": 0.7409929078014185, + "grad_norm": 1.0782684087753296, + "loss": 5.1268, + "lr": 0.0007047552447552448, + "step": 2612, + "tokens_trained": 0.248277752 + }, + { + "epoch": 0.7415602836879432, + "grad_norm": 1.0961271524429321, + "loss": 5.1024, + "lr": 0.0007044755244755245, + "step": 2614, + "tokens_trained": 0.248466504 + }, + { + "epoch": 0.7421276595744681, + "grad_norm": 0.9727640151977539, + "loss": 5.067, + "lr": 0.0007041958041958041, + "step": 2616, + "tokens_trained": 0.248657896 + }, + { + "epoch": 0.7426950354609929, + "grad_norm": 0.9756829738616943, + "loss": 5.1326, + "lr": 0.000703916083916084, + "step": 2618, + "tokens_trained": 0.248849288 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 0.9990546703338623, + "loss": 5.2016, + "lr": 0.0007036363636363636, + "step": 2620, + "tokens_trained": 0.24903988 + }, + { + "epoch": 0.7438297872340426, + "grad_norm": 1.062199592590332, + "loss": 5.1517, + "lr": 0.0007033566433566434, + "step": 2622, + "tokens_trained": 0.24922872 + }, + { + "epoch": 0.7443971631205674, + "grad_norm": 1.138197422027588, + "loss": 5.1052, + "lr": 0.000703076923076923, + "step": 2624, + "tokens_trained": 0.249420464 + }, + { + "epoch": 0.7446808510638298, + "eval_loss": 5.127779960632324, + "eval_runtime": 20.9141, + "step": 2625, + "tokens_trained": 0.249516464 + }, + { + "epoch": 0.7449645390070923, + "grad_norm": 1.1704756021499634, + "loss": 5.1167, + "lr": 0.0007027972027972029, + "step": 2626, + "tokens_trained": 0.249612824 + }, + { + "epoch": 0.745531914893617, + "grad_norm": 1.067280888557434, + "loss": 5.0877, + "lr": 0.0007025174825174825, + "step": 2628, + "tokens_trained": 0.249801672 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 1.0734069347381592, + "loss": 5.091, + "lr": 0.0007022377622377623, + "step": 2630, + "tokens_trained": 0.249993136 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 1.0817586183547974, + "loss": 5.0894, + "lr": 0.000701958041958042, + "step": 2632, + "tokens_trained": 0.25018232 + }, + { + "epoch": 0.7472340425531915, + "grad_norm": 1.0738139152526855, + "loss": 5.1141, + "lr": 0.0007016783216783216, + "step": 2634, + "tokens_trained": 0.250373456 + }, + { + "epoch": 0.7478014184397163, + "grad_norm": 1.0292818546295166, + "loss": 5.0746, + "lr": 0.0007013986013986015, + "step": 2636, + "tokens_trained": 0.250563552 + }, + { + "epoch": 0.7483687943262411, + "grad_norm": 1.0308977365493774, + "loss": 5.1346, + "lr": 0.0007011188811188811, + "step": 2638, + "tokens_trained": 0.25075176 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 1.0287693738937378, + "loss": 5.1137, + "lr": 0.0007008391608391609, + "step": 2640, + "tokens_trained": 0.250939456 + }, + { + "epoch": 0.7495035460992908, + "grad_norm": 1.043565273284912, + "loss": 5.1251, + "lr": 0.0007005594405594405, + "step": 2642, + "tokens_trained": 0.251130456 + }, + { + "epoch": 0.7500709219858156, + "grad_norm": 1.0977740287780762, + "loss": 5.0959, + "lr": 0.0007002797202797204, + "step": 2644, + "tokens_trained": 0.251320016 + }, + { + "epoch": 0.7506382978723404, + "grad_norm": 1.0304359197616577, + "loss": 5.0893, + "lr": 0.0007, + "step": 2646, + "tokens_trained": 0.251509824 + }, + { + "epoch": 0.7512056737588653, + "grad_norm": 1.0331344604492188, + "loss": 5.1238, + "lr": 0.0006997202797202797, + "step": 2648, + "tokens_trained": 0.251700504 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 1.0405573844909668, + "loss": 5.1301, + "lr": 0.0006994405594405595, + "step": 2650, + "tokens_trained": 0.251890936 + }, + { + "epoch": 0.7523404255319149, + "grad_norm": 1.0685805082321167, + "loss": 5.1354, + "lr": 0.0006991608391608391, + "step": 2652, + "tokens_trained": 0.252081296 + }, + { + "epoch": 0.7529078014184397, + "grad_norm": 1.0597950220108032, + "loss": 5.1229, + "lr": 0.000698881118881119, + "step": 2654, + "tokens_trained": 0.252270456 + }, + { + "epoch": 0.7534751773049645, + "grad_norm": 1.0094919204711914, + "loss": 5.1077, + "lr": 0.0006986013986013986, + "step": 2656, + "tokens_trained": 0.252459416 + }, + { + "epoch": 0.7540425531914894, + "grad_norm": 1.0850694179534912, + "loss": 5.0876, + "lr": 0.0006983216783216784, + "step": 2658, + "tokens_trained": 0.252649656 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 1.0182054042816162, + "loss": 5.0842, + "lr": 0.000698041958041958, + "step": 2660, + "tokens_trained": 0.252840488 + }, + { + "epoch": 0.755177304964539, + "grad_norm": 1.074000597000122, + "loss": 5.1387, + "lr": 0.0006977622377622378, + "step": 2662, + "tokens_trained": 0.253030672 + }, + { + "epoch": 0.7557446808510638, + "grad_norm": 1.1259658336639404, + "loss": 5.1334, + "lr": 0.0006974825174825175, + "step": 2664, + "tokens_trained": 0.253221976 + }, + { + "epoch": 0.7563120567375886, + "grad_norm": 1.0146551132202148, + "loss": 5.0995, + "lr": 0.0006972027972027972, + "step": 2666, + "tokens_trained": 0.253414352 + }, + { + "epoch": 0.7568794326241135, + "grad_norm": 1.1268185377120972, + "loss": 5.1201, + "lr": 0.000696923076923077, + "step": 2668, + "tokens_trained": 0.25360448 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 1.025431752204895, + "loss": 5.0546, + "lr": 0.0006966433566433566, + "step": 2670, + "tokens_trained": 0.253791368 + }, + { + "epoch": 0.7580141843971632, + "grad_norm": 1.108112096786499, + "loss": 5.0917, + "lr": 0.0006963636363636365, + "step": 2672, + "tokens_trained": 0.253982112 + }, + { + "epoch": 0.758581560283688, + "grad_norm": 1.1009857654571533, + "loss": 5.1447, + "lr": 0.0006960839160839161, + "step": 2674, + "tokens_trained": 0.254173328 + }, + { + "epoch": 0.7591489361702127, + "grad_norm": 1.0718492269515991, + "loss": 5.1093, + "lr": 0.0006958041958041958, + "step": 2676, + "tokens_trained": 0.254363624 + }, + { + "epoch": 0.7597163120567376, + "grad_norm": 1.0715916156768799, + "loss": 5.1287, + "lr": 0.0006955244755244755, + "step": 2678, + "tokens_trained": 0.25455316 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 1.0953240394592285, + "loss": 5.1031, + "lr": 0.0006952447552447553, + "step": 2680, + "tokens_trained": 0.254742424 + }, + { + "epoch": 0.7608510638297873, + "grad_norm": 1.0574376583099365, + "loss": 5.1316, + "lr": 0.000694965034965035, + "step": 2682, + "tokens_trained": 0.254933624 + }, + { + "epoch": 0.7614184397163121, + "grad_norm": 1.1887143850326538, + "loss": 5.1261, + "lr": 0.0006946853146853147, + "step": 2684, + "tokens_trained": 0.255124424 + }, + { + "epoch": 0.7619858156028368, + "grad_norm": 1.0359193086624146, + "loss": 5.1584, + "lr": 0.0006944055944055943, + "step": 2686, + "tokens_trained": 0.255314344 + }, + { + "epoch": 0.7625531914893617, + "grad_norm": 1.1207493543624878, + "loss": 5.1496, + "lr": 0.0006941258741258741, + "step": 2688, + "tokens_trained": 0.255503192 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 1.1609482765197754, + "loss": 5.1403, + "lr": 0.0006938461538461539, + "step": 2690, + "tokens_trained": 0.25569088 + }, + { + "epoch": 0.7636879432624113, + "grad_norm": 1.0204665660858154, + "loss": 5.0891, + "lr": 0.0006935664335664336, + "step": 2692, + "tokens_trained": 0.255880216 + }, + { + "epoch": 0.7642553191489362, + "grad_norm": 1.064090371131897, + "loss": 5.0507, + "lr": 0.0006932867132867133, + "step": 2694, + "tokens_trained": 0.256070744 + }, + { + "epoch": 0.764822695035461, + "grad_norm": 1.1102992296218872, + "loss": 5.062, + "lr": 0.000693006993006993, + "step": 2696, + "tokens_trained": 0.256261136 + }, + { + "epoch": 0.7653900709219859, + "grad_norm": 1.0316580533981323, + "loss": 5.0933, + "lr": 0.0006927272727272728, + "step": 2698, + "tokens_trained": 0.256452032 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.0681291818618774, + "loss": 5.088, + "lr": 0.0006924475524475524, + "step": 2700, + "tokens_trained": 0.25664152 + }, + { + "epoch": 0.7665248226950354, + "grad_norm": 1.1148093938827515, + "loss": 5.0389, + "lr": 0.0006921678321678322, + "step": 2702, + "tokens_trained": 0.25683068 + }, + { + "epoch": 0.7670921985815603, + "grad_norm": 1.0831029415130615, + "loss": 5.1181, + "lr": 0.0006918881118881118, + "step": 2704, + "tokens_trained": 0.257020752 + }, + { + "epoch": 0.7676595744680851, + "grad_norm": 1.0877745151519775, + "loss": 5.1822, + "lr": 0.0006916083916083916, + "step": 2706, + "tokens_trained": 0.257209136 + }, + { + "epoch": 0.76822695035461, + "grad_norm": 1.0823218822479248, + "loss": 5.0855, + "lr": 0.0006913286713286714, + "step": 2708, + "tokens_trained": 0.257398504 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 1.0309520959854126, + "loss": 5.141, + "lr": 0.0006910489510489511, + "step": 2710, + "tokens_trained": 0.257589568 + }, + { + "epoch": 0.7693617021276595, + "grad_norm": 1.0433647632598877, + "loss": 5.057, + "lr": 0.0006907692307692308, + "step": 2712, + "tokens_trained": 0.257781368 + }, + { + "epoch": 0.7699290780141844, + "grad_norm": 1.05474054813385, + "loss": 5.0639, + "lr": 0.0006904895104895104, + "step": 2714, + "tokens_trained": 0.25797212 + }, + { + "epoch": 0.7704964539007092, + "grad_norm": 1.0005548000335693, + "loss": 5.1155, + "lr": 0.0006902097902097903, + "step": 2716, + "tokens_trained": 0.25815968 + }, + { + "epoch": 0.7710638297872341, + "grad_norm": 0.9644413590431213, + "loss": 5.1092, + "lr": 0.0006899300699300699, + "step": 2718, + "tokens_trained": 0.258350192 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 1.0715434551239014, + "loss": 5.0827, + "lr": 0.0006896503496503497, + "step": 2720, + "tokens_trained": 0.258539872 + }, + { + "epoch": 0.7721985815602836, + "grad_norm": 1.0799431800842285, + "loss": 5.1489, + "lr": 0.0006893706293706293, + "step": 2722, + "tokens_trained": 0.258728696 + }, + { + "epoch": 0.7727659574468085, + "grad_norm": 1.0224812030792236, + "loss": 5.0897, + "lr": 0.0006890909090909091, + "step": 2724, + "tokens_trained": 0.258918368 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 1.2171430587768555, + "loss": 5.1283, + "lr": 0.0006888111888111889, + "step": 2726, + "tokens_trained": 0.259107072 + }, + { + "epoch": 0.7739007092198582, + "grad_norm": 1.0420043468475342, + "loss": 5.1325, + "lr": 0.0006885314685314685, + "step": 2728, + "tokens_trained": 0.259297744 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 1.0326933860778809, + "loss": 5.1543, + "lr": 0.0006882517482517483, + "step": 2730, + "tokens_trained": 0.259486832 + }, + { + "epoch": 0.7750354609929078, + "grad_norm": 1.1191221475601196, + "loss": 5.1182, + "lr": 0.0006879720279720279, + "step": 2732, + "tokens_trained": 0.2596774 + }, + { + "epoch": 0.7756028368794327, + "grad_norm": 1.089678168296814, + "loss": 5.09, + "lr": 0.0006876923076923078, + "step": 2734, + "tokens_trained": 0.259868248 + }, + { + "epoch": 0.7761702127659574, + "grad_norm": 1.0944526195526123, + "loss": 5.1519, + "lr": 0.0006874125874125874, + "step": 2736, + "tokens_trained": 0.260056992 + }, + { + "epoch": 0.7767375886524823, + "grad_norm": 1.0774682760238647, + "loss": 5.0998, + "lr": 0.0006871328671328672, + "step": 2738, + "tokens_trained": 0.2602478 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 1.0795758962631226, + "loss": 5.1483, + "lr": 0.0006868531468531468, + "step": 2740, + "tokens_trained": 0.260435896 + }, + { + "epoch": 0.7778723404255319, + "grad_norm": 1.229885458946228, + "loss": 5.0991, + "lr": 0.0006865734265734265, + "step": 2742, + "tokens_trained": 0.260624176 + }, + { + "epoch": 0.7784397163120568, + "grad_norm": 1.2816888093948364, + "loss": 5.1131, + "lr": 0.0006862937062937064, + "step": 2744, + "tokens_trained": 0.26081632 + }, + { + "epoch": 0.7790070921985816, + "grad_norm": 1.127356767654419, + "loss": 5.0589, + "lr": 0.000686013986013986, + "step": 2746, + "tokens_trained": 0.261003088 + }, + { + "epoch": 0.7795744680851063, + "grad_norm": 1.073644995689392, + "loss": 5.1402, + "lr": 0.0006857342657342658, + "step": 2748, + "tokens_trained": 0.261192088 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 1.0892105102539062, + "loss": 5.1231, + "lr": 0.0006854545454545454, + "step": 2750, + "tokens_trained": 0.261381504 + }, + { + "epoch": 0.7801418439716312, + "eval_loss": 5.11714506149292, + "eval_runtime": 20.9289, + "step": 2750, + "tokens_trained": 0.261381504 + }, + { + "epoch": 0.780709219858156, + "grad_norm": 1.2366212606430054, + "loss": 5.079, + "lr": 0.0006851748251748253, + "step": 2752, + "tokens_trained": 0.261572936 + }, + { + "epoch": 0.7812765957446809, + "grad_norm": 1.2283895015716553, + "loss": 5.0414, + "lr": 0.0006848951048951049, + "step": 2754, + "tokens_trained": 0.26176184 + }, + { + "epoch": 0.7818439716312057, + "grad_norm": 1.2296546697616577, + "loss": 5.0758, + "lr": 0.0006846153846153846, + "step": 2756, + "tokens_trained": 0.261952224 + }, + { + "epoch": 0.7824113475177304, + "grad_norm": 1.1455234289169312, + "loss": 5.0903, + "lr": 0.0006843356643356643, + "step": 2758, + "tokens_trained": 0.262142128 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.9795711040496826, + "loss": 5.1101, + "lr": 0.000684055944055944, + "step": 2760, + "tokens_trained": 0.262331464 + }, + { + "epoch": 0.7835460992907801, + "grad_norm": 1.1363111734390259, + "loss": 5.0948, + "lr": 0.0006837762237762239, + "step": 2762, + "tokens_trained": 0.262523048 + }, + { + "epoch": 0.784113475177305, + "grad_norm": 1.0878827571868896, + "loss": 5.0942, + "lr": 0.0006834965034965035, + "step": 2764, + "tokens_trained": 0.26271264 + }, + { + "epoch": 0.7846808510638298, + "grad_norm": 1.1213501691818237, + "loss": 5.0863, + "lr": 0.0006832167832167833, + "step": 2766, + "tokens_trained": 0.262903952 + }, + { + "epoch": 0.7852482269503546, + "grad_norm": 1.1156904697418213, + "loss": 5.1835, + "lr": 0.0006829370629370629, + "step": 2768, + "tokens_trained": 0.2630932 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 1.2105063199996948, + "loss": 5.152, + "lr": 0.0006826573426573427, + "step": 2770, + "tokens_trained": 0.2632822 + }, + { + "epoch": 0.7863829787234042, + "grad_norm": 1.056512475013733, + "loss": 5.129, + "lr": 0.0006823776223776224, + "step": 2772, + "tokens_trained": 0.263471056 + }, + { + "epoch": 0.7869503546099291, + "grad_norm": 1.124480128288269, + "loss": 5.1122, + "lr": 0.0006820979020979021, + "step": 2774, + "tokens_trained": 0.26365952 + }, + { + "epoch": 0.7875177304964539, + "grad_norm": 1.1403707265853882, + "loss": 5.1283, + "lr": 0.0006818181818181818, + "step": 2776, + "tokens_trained": 0.263850128 + }, + { + "epoch": 0.7880851063829787, + "grad_norm": 1.0712953805923462, + "loss": 5.0901, + "lr": 0.0006815384615384615, + "step": 2778, + "tokens_trained": 0.264036944 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 1.1485860347747803, + "loss": 5.0673, + "lr": 0.0006812587412587414, + "step": 2780, + "tokens_trained": 0.2642284 + }, + { + "epoch": 0.7892198581560284, + "grad_norm": 1.144534945487976, + "loss": 5.0939, + "lr": 0.000680979020979021, + "step": 2782, + "tokens_trained": 0.264417248 + }, + { + "epoch": 0.7897872340425532, + "grad_norm": 1.0953861474990845, + "loss": 5.0998, + "lr": 0.0006806993006993007, + "step": 2784, + "tokens_trained": 0.264605776 + }, + { + "epoch": 0.790354609929078, + "grad_norm": 1.0519598722457886, + "loss": 5.1059, + "lr": 0.0006804195804195804, + "step": 2786, + "tokens_trained": 0.264795928 + }, + { + "epoch": 0.7909219858156028, + "grad_norm": 1.064609408378601, + "loss": 5.1017, + "lr": 0.0006801398601398602, + "step": 2788, + "tokens_trained": 0.264986 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 1.0485059022903442, + "loss": 5.0636, + "lr": 0.0006798601398601399, + "step": 2790, + "tokens_trained": 0.265176936 + }, + { + "epoch": 0.7920567375886525, + "grad_norm": 1.1277351379394531, + "loss": 5.0689, + "lr": 0.0006795804195804196, + "step": 2792, + "tokens_trained": 0.265366584 + }, + { + "epoch": 0.7926241134751772, + "grad_norm": 1.0692890882492065, + "loss": 5.0922, + "lr": 0.0006793006993006992, + "step": 2794, + "tokens_trained": 0.265557456 + }, + { + "epoch": 0.7931914893617021, + "grad_norm": 0.9836872220039368, + "loss": 5.0702, + "lr": 0.000679020979020979, + "step": 2796, + "tokens_trained": 0.265747056 + }, + { + "epoch": 0.7937588652482269, + "grad_norm": 1.0450890064239502, + "loss": 5.0778, + "lr": 0.0006787412587412588, + "step": 2798, + "tokens_trained": 0.265935536 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 1.017853856086731, + "loss": 5.1401, + "lr": 0.0006784615384615385, + "step": 2800, + "tokens_trained": 0.266124376 + }, + { + "epoch": 0.7948936170212766, + "grad_norm": 0.9698541760444641, + "loss": 5.0882, + "lr": 0.0006781818181818182, + "step": 2802, + "tokens_trained": 0.266312192 + }, + { + "epoch": 0.7954609929078014, + "grad_norm": 0.9696250557899475, + "loss": 5.1424, + "lr": 0.0006779020979020979, + "step": 2804, + "tokens_trained": 0.266503584 + }, + { + "epoch": 0.7960283687943263, + "grad_norm": 1.011576533317566, + "loss": 5.062, + "lr": 0.0006776223776223777, + "step": 2806, + "tokens_trained": 0.266693776 + }, + { + "epoch": 0.796595744680851, + "grad_norm": 0.9681981801986694, + "loss": 5.1343, + "lr": 0.0006773426573426574, + "step": 2808, + "tokens_trained": 0.26688324 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 0.9778586626052856, + "loss": 5.0619, + "lr": 0.0006770629370629371, + "step": 2810, + "tokens_trained": 0.267072 + }, + { + "epoch": 0.7977304964539007, + "grad_norm": 0.9624539613723755, + "loss": 5.0943, + "lr": 0.0006767832167832167, + "step": 2812, + "tokens_trained": 0.267260184 + }, + { + "epoch": 0.7982978723404255, + "grad_norm": 1.0591245889663696, + "loss": 5.101, + "lr": 0.0006765034965034965, + "step": 2814, + "tokens_trained": 0.267450632 + }, + { + "epoch": 0.7988652482269504, + "grad_norm": 1.0650452375411987, + "loss": 5.0754, + "lr": 0.0006762237762237763, + "step": 2816, + "tokens_trained": 0.267641848 + }, + { + "epoch": 0.7994326241134752, + "grad_norm": 1.0241055488586426, + "loss": 5.113, + "lr": 0.000675944055944056, + "step": 2818, + "tokens_trained": 0.267831232 + }, + { + "epoch": 0.8, + "grad_norm": 0.9588684439659119, + "loss": 5.1124, + "lr": 0.0006756643356643357, + "step": 2820, + "tokens_trained": 0.268022016 + }, + { + "epoch": 0.8005673758865248, + "grad_norm": 1.0146323442459106, + "loss": 5.0773, + "lr": 0.0006753846153846153, + "step": 2822, + "tokens_trained": 0.268211504 + }, + { + "epoch": 0.8011347517730496, + "grad_norm": 1.040366530418396, + "loss": 5.0735, + "lr": 0.0006751048951048951, + "step": 2824, + "tokens_trained": 0.268400704 + }, + { + "epoch": 0.8017021276595745, + "grad_norm": 1.0419392585754395, + "loss": 5.1243, + "lr": 0.0006748251748251748, + "step": 2826, + "tokens_trained": 0.268592936 + }, + { + "epoch": 0.8022695035460993, + "grad_norm": 1.0807193517684937, + "loss": 5.0938, + "lr": 0.0006745454545454546, + "step": 2828, + "tokens_trained": 0.26878236 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 1.0357084274291992, + "loss": 5.138, + "lr": 0.0006742657342657342, + "step": 2830, + "tokens_trained": 0.268973808 + }, + { + "epoch": 0.8034042553191489, + "grad_norm": 1.0543837547302246, + "loss": 5.1219, + "lr": 0.000673986013986014, + "step": 2832, + "tokens_trained": 0.269163576 + }, + { + "epoch": 0.8039716312056737, + "grad_norm": 0.9575244188308716, + "loss": 5.0388, + "lr": 0.0006737062937062938, + "step": 2834, + "tokens_trained": 0.26935304 + }, + { + "epoch": 0.8045390070921986, + "grad_norm": 1.0559078454971313, + "loss": 5.1569, + "lr": 0.0006734265734265734, + "step": 2836, + "tokens_trained": 0.269542488 + }, + { + "epoch": 0.8051063829787234, + "grad_norm": 1.1365549564361572, + "loss": 5.1392, + "lr": 0.0006731468531468532, + "step": 2838, + "tokens_trained": 0.269732336 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 1.0022294521331787, + "loss": 5.1017, + "lr": 0.0006728671328671328, + "step": 2840, + "tokens_trained": 0.269922384 + }, + { + "epoch": 0.8062411347517731, + "grad_norm": 0.9790627360343933, + "loss": 5.1443, + "lr": 0.0006725874125874126, + "step": 2842, + "tokens_trained": 0.270111096 + }, + { + "epoch": 0.8068085106382978, + "grad_norm": 1.0328103303909302, + "loss": 5.087, + "lr": 0.0006723076923076923, + "step": 2844, + "tokens_trained": 0.27030036 + }, + { + "epoch": 0.8073758865248227, + "grad_norm": 1.0813841819763184, + "loss": 5.0995, + "lr": 0.0006720279720279721, + "step": 2846, + "tokens_trained": 0.270490936 + }, + { + "epoch": 0.8079432624113475, + "grad_norm": 1.1210085153579712, + "loss": 5.0929, + "lr": 0.0006717482517482517, + "step": 2848, + "tokens_trained": 0.27067904 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 1.10624361038208, + "loss": 5.0861, + "lr": 0.0006714685314685314, + "step": 2850, + "tokens_trained": 0.270869664 + }, + { + "epoch": 0.8090780141843972, + "grad_norm": 0.9984250664710999, + "loss": 5.1126, + "lr": 0.0006711888111888113, + "step": 2852, + "tokens_trained": 0.271059912 + }, + { + "epoch": 0.809645390070922, + "grad_norm": 1.0100075006484985, + "loss": 5.0131, + "lr": 0.0006709090909090909, + "step": 2854, + "tokens_trained": 0.271248128 + }, + { + "epoch": 0.8102127659574468, + "grad_norm": 1.0718857049942017, + "loss": 5.0978, + "lr": 0.0006706293706293707, + "step": 2856, + "tokens_trained": 0.271437752 + }, + { + "epoch": 0.8107801418439716, + "grad_norm": 1.0277525186538696, + "loss": 5.022, + "lr": 0.0006703496503496503, + "step": 2858, + "tokens_trained": 0.271627272 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 1.1056699752807617, + "loss": 5.1678, + "lr": 0.0006700699300699301, + "step": 2860, + "tokens_trained": 0.271815032 + }, + { + "epoch": 0.8119148936170213, + "grad_norm": 0.9853792190551758, + "loss": 5.1511, + "lr": 0.0006697902097902098, + "step": 2862, + "tokens_trained": 0.272004128 + }, + { + "epoch": 0.8124822695035461, + "grad_norm": 1.0207619667053223, + "loss": 5.0681, + "lr": 0.0006695104895104895, + "step": 2864, + "tokens_trained": 0.272193024 + }, + { + "epoch": 0.813049645390071, + "grad_norm": 1.0080488920211792, + "loss": 5.0712, + "lr": 0.0006692307692307692, + "step": 2866, + "tokens_trained": 0.27238368 + }, + { + "epoch": 0.8136170212765957, + "grad_norm": 1.1197504997253418, + "loss": 5.0545, + "lr": 0.0006689510489510489, + "step": 2868, + "tokens_trained": 0.272573416 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 1.0667881965637207, + "loss": 5.0492, + "lr": 0.0006686713286713288, + "step": 2870, + "tokens_trained": 0.272762152 + }, + { + "epoch": 0.8147517730496454, + "grad_norm": 1.0861417055130005, + "loss": 5.1656, + "lr": 0.0006683916083916084, + "step": 2872, + "tokens_trained": 0.272951384 + }, + { + "epoch": 0.8153191489361702, + "grad_norm": 0.9590932130813599, + "loss": 5.0987, + "lr": 0.0006681118881118882, + "step": 2874, + "tokens_trained": 0.273142368 + }, + { + "epoch": 0.8156028368794326, + "eval_loss": 5.098834037780762, + "eval_runtime": 21.1414, + "step": 2875, + "tokens_trained": 0.273239384 + }, + { + "epoch": 0.8158865248226951, + "grad_norm": 0.9762487411499023, + "loss": 5.0346, + "lr": 0.0006678321678321678, + "step": 2876, + "tokens_trained": 0.273334 + }, + { + "epoch": 0.8164539007092199, + "grad_norm": 1.059070110321045, + "loss": 5.0466, + "lr": 0.0006675524475524475, + "step": 2878, + "tokens_trained": 0.2735232 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 1.0384489297866821, + "loss": 5.0647, + "lr": 0.0006672727272727273, + "step": 2880, + "tokens_trained": 0.27371452 + }, + { + "epoch": 0.8175886524822695, + "grad_norm": 1.0188980102539062, + "loss": 5.135, + "lr": 0.000666993006993007, + "step": 2882, + "tokens_trained": 0.273903312 + }, + { + "epoch": 0.8181560283687943, + "grad_norm": 1.0437567234039307, + "loss": 5.1251, + "lr": 0.0006667132867132867, + "step": 2884, + "tokens_trained": 0.27409364 + }, + { + "epoch": 0.8187234042553192, + "grad_norm": 1.040148138999939, + "loss": 5.0829, + "lr": 0.0006664335664335664, + "step": 2886, + "tokens_trained": 0.274283392 + }, + { + "epoch": 0.819290780141844, + "grad_norm": 0.9796963930130005, + "loss": 5.1062, + "lr": 0.0006661538461538463, + "step": 2888, + "tokens_trained": 0.27447272 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 1.0791646242141724, + "loss": 5.0677, + "lr": 0.0006658741258741259, + "step": 2890, + "tokens_trained": 0.274661656 + }, + { + "epoch": 0.8204255319148936, + "grad_norm": 1.075614094734192, + "loss": 5.0932, + "lr": 0.0006655944055944056, + "step": 2892, + "tokens_trained": 0.274851216 + }, + { + "epoch": 0.8209929078014184, + "grad_norm": 1.0696609020233154, + "loss": 5.1614, + "lr": 0.0006653146853146853, + "step": 2894, + "tokens_trained": 0.275040176 + }, + { + "epoch": 0.8215602836879433, + "grad_norm": 1.0564289093017578, + "loss": 5.1165, + "lr": 0.000665034965034965, + "step": 2896, + "tokens_trained": 0.27522948 + }, + { + "epoch": 0.8221276595744681, + "grad_norm": 1.0135756731033325, + "loss": 5.1222, + "lr": 0.0006647552447552448, + "step": 2898, + "tokens_trained": 0.275419392 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 1.0177373886108398, + "loss": 5.1085, + "lr": 0.0006644755244755245, + "step": 2900, + "tokens_trained": 0.27561 + }, + { + "epoch": 0.8232624113475178, + "grad_norm": 0.9718354344367981, + "loss": 5.069, + "lr": 0.0006641958041958042, + "step": 2902, + "tokens_trained": 0.275800288 + }, + { + "epoch": 0.8238297872340425, + "grad_norm": 1.011567234992981, + "loss": 5.1668, + "lr": 0.0006639160839160839, + "step": 2904, + "tokens_trained": 0.275988672 + }, + { + "epoch": 0.8243971631205673, + "grad_norm": 1.0020220279693604, + "loss": 5.0616, + "lr": 0.0006636363636363638, + "step": 2906, + "tokens_trained": 0.276180112 + }, + { + "epoch": 0.8249645390070922, + "grad_norm": 0.9929330945014954, + "loss": 5.0723, + "lr": 0.0006633566433566434, + "step": 2908, + "tokens_trained": 0.276368136 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 0.9768717885017395, + "loss": 5.0872, + "lr": 0.0006630769230769231, + "step": 2910, + "tokens_trained": 0.276557936 + }, + { + "epoch": 0.8260992907801419, + "grad_norm": 1.0068199634552002, + "loss": 5.1279, + "lr": 0.0006627972027972028, + "step": 2912, + "tokens_trained": 0.276748584 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.953273594379425, + "loss": 5.0422, + "lr": 0.0006625174825174825, + "step": 2914, + "tokens_trained": 0.276939168 + }, + { + "epoch": 0.8272340425531914, + "grad_norm": 0.9808285236358643, + "loss": 5.1278, + "lr": 0.0006622377622377623, + "step": 2916, + "tokens_trained": 0.277128728 + }, + { + "epoch": 0.8278014184397163, + "grad_norm": 0.9755997061729431, + "loss": 5.0661, + "lr": 0.000661958041958042, + "step": 2918, + "tokens_trained": 0.27731964 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 0.9573803544044495, + "loss": 5.0744, + "lr": 0.0006616783216783216, + "step": 2920, + "tokens_trained": 0.277508704 + }, + { + "epoch": 0.828936170212766, + "grad_norm": 1.1060761213302612, + "loss": 5.1124, + "lr": 0.0006613986013986014, + "step": 2922, + "tokens_trained": 0.277698576 + }, + { + "epoch": 0.8295035460992908, + "grad_norm": 1.1377017498016357, + "loss": 5.1375, + "lr": 0.0006611188811188812, + "step": 2924, + "tokens_trained": 0.277887456 + }, + { + "epoch": 0.8300709219858156, + "grad_norm": 1.0315862894058228, + "loss": 5.0839, + "lr": 0.0006608391608391609, + "step": 2926, + "tokens_trained": 0.278076232 + }, + { + "epoch": 0.8306382978723404, + "grad_norm": 0.9509685635566711, + "loss": 5.0986, + "lr": 0.0006605594405594406, + "step": 2928, + "tokens_trained": 0.278265168 + }, + { + "epoch": 0.8312056737588652, + "grad_norm": 0.9749555587768555, + "loss": 5.0292, + "lr": 0.0006602797202797203, + "step": 2930, + "tokens_trained": 0.27845612 + }, + { + "epoch": 0.8317730496453901, + "grad_norm": 0.9728718400001526, + "loss": 5.1113, + "lr": 0.00066, + "step": 2932, + "tokens_trained": 0.278647 + }, + { + "epoch": 0.8323404255319149, + "grad_norm": 0.8888244032859802, + "loss": 5.0698, + "lr": 0.0006597202797202797, + "step": 2934, + "tokens_trained": 0.278834704 + }, + { + "epoch": 0.8329078014184397, + "grad_norm": 0.9745096564292908, + "loss": 5.1356, + "lr": 0.0006594405594405595, + "step": 2936, + "tokens_trained": 0.27902504 + }, + { + "epoch": 0.8334751773049646, + "grad_norm": 1.023566484451294, + "loss": 5.0733, + "lr": 0.0006591608391608391, + "step": 2938, + "tokens_trained": 0.279214024 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.9370903968811035, + "loss": 5.108, + "lr": 0.0006588811188811189, + "step": 2940, + "tokens_trained": 0.279402336 + }, + { + "epoch": 0.8346099290780142, + "grad_norm": 1.037245750427246, + "loss": 5.1539, + "lr": 0.0006586013986013986, + "step": 2942, + "tokens_trained": 0.279594456 + }, + { + "epoch": 0.835177304964539, + "grad_norm": 1.1117267608642578, + "loss": 5.0984, + "lr": 0.0006583216783216784, + "step": 2944, + "tokens_trained": 0.279784736 + }, + { + "epoch": 0.8357446808510638, + "grad_norm": 1.0760383605957031, + "loss": 5.0798, + "lr": 0.0006580419580419581, + "step": 2946, + "tokens_trained": 0.279974272 + }, + { + "epoch": 0.8363120567375887, + "grad_norm": 1.0359710454940796, + "loss": 5.1052, + "lr": 0.0006577622377622377, + "step": 2948, + "tokens_trained": 0.280162576 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 1.0630141496658325, + "loss": 5.0561, + "lr": 0.0006574825174825175, + "step": 2950, + "tokens_trained": 0.280351752 + }, + { + "epoch": 0.8374468085106384, + "grad_norm": 1.0445481538772583, + "loss": 5.1009, + "lr": 0.0006572027972027972, + "step": 2952, + "tokens_trained": 0.280541392 + }, + { + "epoch": 0.8380141843971631, + "grad_norm": 1.0606142282485962, + "loss": 5.0109, + "lr": 0.000656923076923077, + "step": 2954, + "tokens_trained": 0.280732192 + }, + { + "epoch": 0.8385815602836879, + "grad_norm": 1.0462067127227783, + "loss": 5.1411, + "lr": 0.0006566433566433566, + "step": 2956, + "tokens_trained": 0.280922712 + }, + { + "epoch": 0.8391489361702128, + "grad_norm": 0.9841874241828918, + "loss": 5.0773, + "lr": 0.0006563636363636364, + "step": 2958, + "tokens_trained": 0.28111024 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 1.1026822328567505, + "loss": 5.1128, + "lr": 0.0006560839160839161, + "step": 2960, + "tokens_trained": 0.281302152 + }, + { + "epoch": 0.8402836879432624, + "grad_norm": 0.9562904834747314, + "loss": 5.0521, + "lr": 0.0006558041958041958, + "step": 2962, + "tokens_trained": 0.281490768 + }, + { + "epoch": 0.8408510638297872, + "grad_norm": 1.038006067276001, + "loss": 5.0931, + "lr": 0.0006555244755244756, + "step": 2964, + "tokens_trained": 0.281682552 + }, + { + "epoch": 0.841418439716312, + "grad_norm": 1.008678913116455, + "loss": 5.0728, + "lr": 0.0006552447552447552, + "step": 2966, + "tokens_trained": 0.281871816 + }, + { + "epoch": 0.8419858156028369, + "grad_norm": 0.9977920651435852, + "loss": 5.086, + "lr": 0.000654965034965035, + "step": 2968, + "tokens_trained": 0.2820618 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 0.9422287344932556, + "loss": 5.0844, + "lr": 0.0006546853146853147, + "step": 2970, + "tokens_trained": 0.282253032 + }, + { + "epoch": 0.8431205673758865, + "grad_norm": 1.0029969215393066, + "loss": 5.0928, + "lr": 0.0006544055944055945, + "step": 2972, + "tokens_trained": 0.282443296 + }, + { + "epoch": 0.8436879432624114, + "grad_norm": 1.0643123388290405, + "loss": 5.0988, + "lr": 0.0006541258741258741, + "step": 2974, + "tokens_trained": 0.282634024 + }, + { + "epoch": 0.8442553191489361, + "grad_norm": 1.0360649824142456, + "loss": 5.0634, + "lr": 0.0006538461538461538, + "step": 2976, + "tokens_trained": 0.282825768 + }, + { + "epoch": 0.844822695035461, + "grad_norm": 0.9609996676445007, + "loss": 5.1155, + "lr": 0.0006535664335664336, + "step": 2978, + "tokens_trained": 0.283016704 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 0.9547716379165649, + "loss": 5.0769, + "lr": 0.0006532867132867133, + "step": 2980, + "tokens_trained": 0.283205288 + }, + { + "epoch": 0.8459574468085106, + "grad_norm": 1.0286030769348145, + "loss": 5.0849, + "lr": 0.0006530069930069931, + "step": 2982, + "tokens_trained": 0.283395192 + }, + { + "epoch": 0.8465248226950355, + "grad_norm": 0.9071921706199646, + "loss": 5.0308, + "lr": 0.0006527272727272727, + "step": 2984, + "tokens_trained": 0.283587048 + }, + { + "epoch": 0.8470921985815603, + "grad_norm": 0.851090133190155, + "loss": 5.0601, + "lr": 0.0006524475524475524, + "step": 2986, + "tokens_trained": 0.28377872 + }, + { + "epoch": 0.8476595744680852, + "grad_norm": 0.946025550365448, + "loss": 5.0863, + "lr": 0.0006521678321678322, + "step": 2988, + "tokens_trained": 0.283968304 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 0.994915783405304, + "loss": 5.1034, + "lr": 0.0006518881118881119, + "step": 2990, + "tokens_trained": 0.284158704 + }, + { + "epoch": 0.8487943262411347, + "grad_norm": 0.9354639053344727, + "loss": 5.0749, + "lr": 0.0006516083916083916, + "step": 2992, + "tokens_trained": 0.284350032 + }, + { + "epoch": 0.8493617021276596, + "grad_norm": 0.9014646410942078, + "loss": 5.0753, + "lr": 0.0006513286713286713, + "step": 2994, + "tokens_trained": 0.284541136 + }, + { + "epoch": 0.8499290780141844, + "grad_norm": 0.9647039771080017, + "loss": 5.1391, + "lr": 0.0006510489510489511, + "step": 2996, + "tokens_trained": 0.28473112 + }, + { + "epoch": 0.8504964539007093, + "grad_norm": 0.9687992930412292, + "loss": 5.0058, + "lr": 0.0006507692307692308, + "step": 2998, + "tokens_trained": 0.284922608 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.9827167987823486, + "loss": 5.0597, + "lr": 0.0006504895104895106, + "step": 3000, + "tokens_trained": 0.285112344 + }, + { + "epoch": 0.851063829787234, + "eval_loss": 5.092260837554932, + "eval_runtime": 20.8128, + "step": 3000, + "tokens_trained": 0.285112344 + }, + { + "epoch": 0.8516312056737588, + "grad_norm": 1.1164077520370483, + "loss": 4.9872, + "lr": 0.0006502097902097902, + "step": 3002, + "tokens_trained": 0.285299144 + }, + { + "epoch": 0.8521985815602837, + "grad_norm": 1.0835845470428467, + "loss": 4.999, + "lr": 0.0006499300699300699, + "step": 3004, + "tokens_trained": 0.28548968 + }, + { + "epoch": 0.8527659574468085, + "grad_norm": 1.135926365852356, + "loss": 5.1038, + "lr": 0.0006496503496503497, + "step": 3006, + "tokens_trained": 0.28568256 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 1.0743507146835327, + "loss": 5.0964, + "lr": 0.0006493706293706294, + "step": 3008, + "tokens_trained": 0.28587348 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 0.9776538014411926, + "loss": 5.0208, + "lr": 0.0006490909090909091, + "step": 3010, + "tokens_trained": 0.286061968 + }, + { + "epoch": 0.854468085106383, + "grad_norm": 0.9797994494438171, + "loss": 5.0238, + "lr": 0.0006488111888111888, + "step": 3012, + "tokens_trained": 0.28625252 + }, + { + "epoch": 0.8550354609929078, + "grad_norm": 0.8697059154510498, + "loss": 5.0017, + "lr": 0.0006485314685314685, + "step": 3014, + "tokens_trained": 0.286443872 + }, + { + "epoch": 0.8556028368794326, + "grad_norm": 0.9378446340560913, + "loss": 5.0856, + "lr": 0.0006482517482517483, + "step": 3016, + "tokens_trained": 0.286633232 + }, + { + "epoch": 0.8561702127659574, + "grad_norm": 0.9418164491653442, + "loss": 5.0637, + "lr": 0.000647972027972028, + "step": 3018, + "tokens_trained": 0.286824032 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 0.9479710459709167, + "loss": 5.0941, + "lr": 0.0006476923076923077, + "step": 3020, + "tokens_trained": 0.2870158 + }, + { + "epoch": 0.8573049645390071, + "grad_norm": 0.9716140627861023, + "loss": 5.1218, + "lr": 0.0006474125874125874, + "step": 3022, + "tokens_trained": 0.287206184 + }, + { + "epoch": 0.857872340425532, + "grad_norm": 0.9651079177856445, + "loss": 5.0137, + "lr": 0.0006471328671328672, + "step": 3024, + "tokens_trained": 0.287395568 + }, + { + "epoch": 0.8584397163120567, + "grad_norm": 1.0485713481903076, + "loss": 5.0713, + "lr": 0.0006468531468531469, + "step": 3026, + "tokens_trained": 0.28758644 + }, + { + "epoch": 0.8590070921985815, + "grad_norm": 1.0849828720092773, + "loss": 5.0241, + "lr": 0.0006465734265734265, + "step": 3028, + "tokens_trained": 0.287773088 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 1.0668689012527466, + "loss": 5.0694, + "lr": 0.0006462937062937063, + "step": 3030, + "tokens_trained": 0.287963544 + }, + { + "epoch": 0.8601418439716312, + "grad_norm": 0.9943816065788269, + "loss": 5.0807, + "lr": 0.0006460139860139859, + "step": 3032, + "tokens_trained": 0.288152376 + }, + { + "epoch": 0.8607092198581561, + "grad_norm": 1.104642629623413, + "loss": 5.1047, + "lr": 0.0006457342657342658, + "step": 3034, + "tokens_trained": 0.288343064 + }, + { + "epoch": 0.8612765957446809, + "grad_norm": 1.0915707349777222, + "loss": 5.1332, + "lr": 0.0006454545454545455, + "step": 3036, + "tokens_trained": 0.28853308 + }, + { + "epoch": 0.8618439716312056, + "grad_norm": 0.9935365319252014, + "loss": 5.0799, + "lr": 0.0006451748251748252, + "step": 3038, + "tokens_trained": 0.288726184 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 0.9564308524131775, + "loss": 5.0549, + "lr": 0.0006448951048951049, + "step": 3040, + "tokens_trained": 0.288916368 + }, + { + "epoch": 0.8629787234042553, + "grad_norm": 1.0183926820755005, + "loss": 5.1508, + "lr": 0.0006446153846153846, + "step": 3042, + "tokens_trained": 0.28910616 + }, + { + "epoch": 0.8635460992907802, + "grad_norm": 0.8167940974235535, + "loss": 5.1238, + "lr": 0.0006443356643356644, + "step": 3044, + "tokens_trained": 0.289295008 + }, + { + "epoch": 0.864113475177305, + "grad_norm": 0.981560468673706, + "loss": 5.0692, + "lr": 0.000644055944055944, + "step": 3046, + "tokens_trained": 0.289483192 + }, + { + "epoch": 0.8646808510638297, + "grad_norm": 0.9596647024154663, + "loss": 5.0557, + "lr": 0.0006437762237762238, + "step": 3048, + "tokens_trained": 0.289672528 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 0.9322229027748108, + "loss": 5.0769, + "lr": 0.0006434965034965034, + "step": 3050, + "tokens_trained": 0.28986108 + }, + { + "epoch": 0.8658156028368794, + "grad_norm": 0.94253009557724, + "loss": 5.0556, + "lr": 0.0006432167832167833, + "step": 3052, + "tokens_trained": 0.29005032 + }, + { + "epoch": 0.8663829787234043, + "grad_norm": 0.9793356657028198, + "loss": 5.0821, + "lr": 0.000642937062937063, + "step": 3054, + "tokens_trained": 0.290238496 + }, + { + "epoch": 0.8669503546099291, + "grad_norm": 1.0270706415176392, + "loss": 5.1137, + "lr": 0.0006426573426573426, + "step": 3056, + "tokens_trained": 0.290428552 + }, + { + "epoch": 0.8675177304964539, + "grad_norm": 1.0015908479690552, + "loss": 5.0426, + "lr": 0.0006423776223776224, + "step": 3058, + "tokens_trained": 0.290617592 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 1.1663475036621094, + "loss": 5.0152, + "lr": 0.0006420979020979021, + "step": 3060, + "tokens_trained": 0.290806784 + }, + { + "epoch": 0.8686524822695035, + "grad_norm": 1.1144863367080688, + "loss": 5.1324, + "lr": 0.0006418181818181819, + "step": 3062, + "tokens_trained": 0.290997672 + }, + { + "epoch": 0.8692198581560284, + "grad_norm": 1.086949110031128, + "loss": 5.0896, + "lr": 0.0006415384615384615, + "step": 3064, + "tokens_trained": 0.291187224 + }, + { + "epoch": 0.8697872340425532, + "grad_norm": 1.0380237102508545, + "loss": 5.0964, + "lr": 0.0006412587412587413, + "step": 3066, + "tokens_trained": 0.291378608 + }, + { + "epoch": 0.870354609929078, + "grad_norm": 0.9731833338737488, + "loss": 5.0113, + "lr": 0.0006409790209790209, + "step": 3068, + "tokens_trained": 0.291568064 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 0.9414166212081909, + "loss": 5.0396, + "lr": 0.0006406993006993007, + "step": 3070, + "tokens_trained": 0.291757936 + }, + { + "epoch": 0.8714893617021277, + "grad_norm": 1.0375349521636963, + "loss": 5.1187, + "lr": 0.0006404195804195805, + "step": 3072, + "tokens_trained": 0.291948704 + }, + { + "epoch": 0.8720567375886524, + "grad_norm": 0.9412112236022949, + "loss": 5.0955, + "lr": 0.0006401398601398601, + "step": 3074, + "tokens_trained": 0.292141128 + }, + { + "epoch": 0.8726241134751773, + "grad_norm": 0.9645117521286011, + "loss": 5.1278, + "lr": 0.0006398601398601399, + "step": 3076, + "tokens_trained": 0.292331704 + }, + { + "epoch": 0.8731914893617021, + "grad_norm": 0.9918674230575562, + "loss": 5.0726, + "lr": 0.0006395804195804196, + "step": 3078, + "tokens_trained": 0.292519984 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 0.8824833035469055, + "loss": 5.1334, + "lr": 0.0006393006993006994, + "step": 3080, + "tokens_trained": 0.292712064 + }, + { + "epoch": 0.8743262411347518, + "grad_norm": 1.0651589632034302, + "loss": 5.0911, + "lr": 0.000639020979020979, + "step": 3082, + "tokens_trained": 0.292901816 + }, + { + "epoch": 0.8748936170212765, + "grad_norm": 1.0067808628082275, + "loss": 5.1345, + "lr": 0.0006387412587412587, + "step": 3084, + "tokens_trained": 0.293094064 + }, + { + "epoch": 0.8754609929078014, + "grad_norm": 0.8916751146316528, + "loss": 5.1117, + "lr": 0.0006384615384615384, + "step": 3086, + "tokens_trained": 0.293284272 + }, + { + "epoch": 0.8760283687943262, + "grad_norm": 1.0009779930114746, + "loss": 5.1143, + "lr": 0.0006381818181818182, + "step": 3088, + "tokens_trained": 0.293474352 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 1.0289413928985596, + "loss": 5.0551, + "lr": 0.000637902097902098, + "step": 3090, + "tokens_trained": 0.293661976 + }, + { + "epoch": 0.8771631205673759, + "grad_norm": 0.9375638961791992, + "loss": 5.0666, + "lr": 0.0006376223776223776, + "step": 3092, + "tokens_trained": 0.293851968 + }, + { + "epoch": 0.8777304964539007, + "grad_norm": 0.9490086436271667, + "loss": 5.0901, + "lr": 0.0006373426573426574, + "step": 3094, + "tokens_trained": 0.294041608 + }, + { + "epoch": 0.8782978723404256, + "grad_norm": 0.932090163230896, + "loss": 5.0783, + "lr": 0.0006370629370629371, + "step": 3096, + "tokens_trained": 0.29423028 + }, + { + "epoch": 0.8788652482269503, + "grad_norm": 0.9120060205459595, + "loss": 5.1065, + "lr": 0.0006367832167832168, + "step": 3098, + "tokens_trained": 0.294421528 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 0.8693923950195312, + "loss": 5.0689, + "lr": 0.0006365034965034965, + "step": 3100, + "tokens_trained": 0.294609832 + }, + { + "epoch": 0.88, + "grad_norm": 0.834987223148346, + "loss": 5.0542, + "lr": 0.0006362237762237762, + "step": 3102, + "tokens_trained": 0.294799424 + }, + { + "epoch": 0.8805673758865248, + "grad_norm": 0.9196602702140808, + "loss": 5.0212, + "lr": 0.0006359440559440559, + "step": 3104, + "tokens_trained": 0.294990504 + }, + { + "epoch": 0.8811347517730497, + "grad_norm": 1.0392085313796997, + "loss": 5.0734, + "lr": 0.0006356643356643357, + "step": 3106, + "tokens_trained": 0.295181112 + }, + { + "epoch": 0.8817021276595745, + "grad_norm": 1.0879757404327393, + "loss": 5.0834, + "lr": 0.0006353846153846155, + "step": 3108, + "tokens_trained": 0.295371224 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 1.0321052074432373, + "loss": 5.1132, + "lr": 0.0006351048951048951, + "step": 3110, + "tokens_trained": 0.295563288 + }, + { + "epoch": 0.8828368794326241, + "grad_norm": 0.9930777549743652, + "loss": 5.0855, + "lr": 0.0006348251748251748, + "step": 3112, + "tokens_trained": 0.295753864 + }, + { + "epoch": 0.8834042553191489, + "grad_norm": 1.007925033569336, + "loss": 5.0728, + "lr": 0.0006345454545454546, + "step": 3114, + "tokens_trained": 0.29594536 + }, + { + "epoch": 0.8839716312056738, + "grad_norm": 1.0430697202682495, + "loss": 5.161, + "lr": 0.0006342657342657343, + "step": 3116, + "tokens_trained": 0.296135144 + }, + { + "epoch": 0.8845390070921986, + "grad_norm": 0.9607092142105103, + "loss": 5.0514, + "lr": 0.000633986013986014, + "step": 3118, + "tokens_trained": 0.296325736 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 1.0054502487182617, + "loss": 5.03, + "lr": 0.0006337062937062937, + "step": 3120, + "tokens_trained": 0.296514408 + }, + { + "epoch": 0.8856737588652482, + "grad_norm": 1.0535473823547363, + "loss": 5.1082, + "lr": 0.0006334265734265733, + "step": 3122, + "tokens_trained": 0.296702248 + }, + { + "epoch": 0.886241134751773, + "grad_norm": 0.9889388680458069, + "loss": 5.0583, + "lr": 0.0006331468531468532, + "step": 3124, + "tokens_trained": 0.296891656 + }, + { + "epoch": 0.8865248226950354, + "eval_loss": 5.07567024230957, + "eval_runtime": 20.7649, + "step": 3125, + "tokens_trained": 0.296985944 + }, + { + "epoch": 0.8868085106382979, + "grad_norm": 1.008825421333313, + "loss": 5.0698, + "lr": 0.0006328671328671329, + "step": 3126, + "tokens_trained": 0.297081752 + }, + { + "epoch": 0.8873758865248227, + "grad_norm": 0.9656203985214233, + "loss": 5.0915, + "lr": 0.0006325874125874126, + "step": 3128, + "tokens_trained": 0.297269568 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 0.9101914167404175, + "loss": 5.0821, + "lr": 0.0006323076923076923, + "step": 3130, + "tokens_trained": 0.297457936 + }, + { + "epoch": 0.8885106382978724, + "grad_norm": 1.03163480758667, + "loss": 5.07, + "lr": 0.0006320279720279721, + "step": 3132, + "tokens_trained": 0.297646944 + }, + { + "epoch": 0.8890780141843971, + "grad_norm": 1.0470985174179077, + "loss": 5.0165, + "lr": 0.0006317482517482518, + "step": 3134, + "tokens_trained": 0.29783736 + }, + { + "epoch": 0.889645390070922, + "grad_norm": 1.0149681568145752, + "loss": 5.0809, + "lr": 0.0006314685314685314, + "step": 3136, + "tokens_trained": 0.298027048 + }, + { + "epoch": 0.8902127659574468, + "grad_norm": 1.017217993736267, + "loss": 5.0323, + "lr": 0.0006311888111888112, + "step": 3138, + "tokens_trained": 0.298218432 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 1.0002187490463257, + "loss": 5.0818, + "lr": 0.0006309090909090908, + "step": 3140, + "tokens_trained": 0.29840768 + }, + { + "epoch": 0.8913475177304965, + "grad_norm": 0.9259346723556519, + "loss": 5.0863, + "lr": 0.0006306293706293707, + "step": 3142, + "tokens_trained": 0.298599384 + }, + { + "epoch": 0.8919148936170213, + "grad_norm": 0.9437862634658813, + "loss": 5.1282, + "lr": 0.0006303496503496504, + "step": 3144, + "tokens_trained": 0.298789024 + }, + { + "epoch": 0.8924822695035461, + "grad_norm": 0.9849722981452942, + "loss": 5.0658, + "lr": 0.0006300699300699301, + "step": 3146, + "tokens_trained": 0.298979648 + }, + { + "epoch": 0.8930496453900709, + "grad_norm": 1.1129319667816162, + "loss": 5.0663, + "lr": 0.0006297902097902098, + "step": 3148, + "tokens_trained": 0.299170416 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 1.101006031036377, + "loss": 5.0394, + "lr": 0.0006295104895104896, + "step": 3150, + "tokens_trained": 0.299361408 + }, + { + "epoch": 0.8941843971631206, + "grad_norm": 1.0711042881011963, + "loss": 5.0696, + "lr": 0.0006292307692307693, + "step": 3152, + "tokens_trained": 0.29955124 + }, + { + "epoch": 0.8947517730496454, + "grad_norm": 1.0250879526138306, + "loss": 5.0645, + "lr": 0.0006289510489510489, + "step": 3154, + "tokens_trained": 0.299742168 + }, + { + "epoch": 0.8953191489361703, + "grad_norm": 1.0772818326950073, + "loss": 5.06, + "lr": 0.0006286713286713287, + "step": 3156, + "tokens_trained": 0.299931536 + }, + { + "epoch": 0.895886524822695, + "grad_norm": 1.1049630641937256, + "loss": 5.0823, + "lr": 0.0006283916083916083, + "step": 3158, + "tokens_trained": 0.300121944 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 1.0998307466506958, + "loss": 5.0334, + "lr": 0.0006281118881118882, + "step": 3160, + "tokens_trained": 0.300311336 + }, + { + "epoch": 0.8970212765957447, + "grad_norm": 1.0796667337417603, + "loss": 5.1029, + "lr": 0.0006278321678321679, + "step": 3162, + "tokens_trained": 0.300499712 + }, + { + "epoch": 0.8975886524822695, + "grad_norm": 1.054004192352295, + "loss": 5.0425, + "lr": 0.0006275524475524475, + "step": 3164, + "tokens_trained": 0.300689128 + }, + { + "epoch": 0.8981560283687944, + "grad_norm": 0.9226939082145691, + "loss": 5.0712, + "lr": 0.0006272727272727273, + "step": 3166, + "tokens_trained": 0.300878016 + }, + { + "epoch": 0.8987234042553192, + "grad_norm": 0.8905312418937683, + "loss": 5.0948, + "lr": 0.000626993006993007, + "step": 3168, + "tokens_trained": 0.301067672 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 0.92462557554245, + "loss": 5.0488, + "lr": 0.0006267132867132868, + "step": 3170, + "tokens_trained": 0.30125668 + }, + { + "epoch": 0.8998581560283688, + "grad_norm": 0.911163866519928, + "loss": 5.0655, + "lr": 0.0006264335664335664, + "step": 3172, + "tokens_trained": 0.301447736 + }, + { + "epoch": 0.9004255319148936, + "grad_norm": 1.0041508674621582, + "loss": 5.1074, + "lr": 0.0006261538461538462, + "step": 3174, + "tokens_trained": 0.301636976 + }, + { + "epoch": 0.9009929078014184, + "grad_norm": 1.1221826076507568, + "loss": 5.1076, + "lr": 0.0006258741258741258, + "step": 3176, + "tokens_trained": 0.301831152 + }, + { + "epoch": 0.9015602836879433, + "grad_norm": 1.0674721002578735, + "loss": 5.1029, + "lr": 0.0006255944055944057, + "step": 3178, + "tokens_trained": 0.302021192 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.9207814335823059, + "loss": 5.1237, + "lr": 0.0006253146853146854, + "step": 3180, + "tokens_trained": 0.302214408 + }, + { + "epoch": 0.902695035460993, + "grad_norm": 0.9445079565048218, + "loss": 5.0714, + "lr": 0.000625034965034965, + "step": 3182, + "tokens_trained": 0.302406056 + }, + { + "epoch": 0.9032624113475177, + "grad_norm": 0.930630624294281, + "loss": 5.0326, + "lr": 0.0006247552447552448, + "step": 3184, + "tokens_trained": 0.302596376 + }, + { + "epoch": 0.9038297872340425, + "grad_norm": 0.9014614224433899, + "loss": 5.0768, + "lr": 0.0006244755244755245, + "step": 3186, + "tokens_trained": 0.302787288 + }, + { + "epoch": 0.9043971631205674, + "grad_norm": 0.9306453466415405, + "loss": 5.139, + "lr": 0.0006241958041958043, + "step": 3188, + "tokens_trained": 0.302976344 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 0.9506230354309082, + "loss": 5.0932, + "lr": 0.0006239160839160839, + "step": 3190, + "tokens_trained": 0.303166928 + }, + { + "epoch": 0.9055319148936171, + "grad_norm": 0.8852939605712891, + "loss": 5.0761, + "lr": 0.0006236363636363636, + "step": 3192, + "tokens_trained": 0.303357176 + }, + { + "epoch": 0.9060992907801418, + "grad_norm": 0.9017012119293213, + "loss": 4.9965, + "lr": 0.0006233566433566433, + "step": 3194, + "tokens_trained": 0.303547344 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.8619939684867859, + "loss": 5.0892, + "lr": 0.0006230769230769231, + "step": 3196, + "tokens_trained": 0.303737392 + }, + { + "epoch": 0.9072340425531915, + "grad_norm": 0.8667910695075989, + "loss": 5.1222, + "lr": 0.0006227972027972028, + "step": 3198, + "tokens_trained": 0.303926592 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 0.9172303676605225, + "loss": 5.0891, + "lr": 0.0006225174825174825, + "step": 3200, + "tokens_trained": 0.304117744 + }, + { + "epoch": 0.9083687943262412, + "grad_norm": 0.9247593879699707, + "loss": 5.0528, + "lr": 0.0006222377622377623, + "step": 3202, + "tokens_trained": 0.304304792 + }, + { + "epoch": 0.908936170212766, + "grad_norm": 0.9245242476463318, + "loss": 5.027, + "lr": 0.000621958041958042, + "step": 3204, + "tokens_trained": 0.304496016 + }, + { + "epoch": 0.9095035460992907, + "grad_norm": 0.8890556693077087, + "loss": 5.0348, + "lr": 0.0006216783216783217, + "step": 3206, + "tokens_trained": 0.304685896 + }, + { + "epoch": 0.9100709219858156, + "grad_norm": 0.9343590140342712, + "loss": 5.103, + "lr": 0.0006213986013986014, + "step": 3208, + "tokens_trained": 0.304876864 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 0.9546332955360413, + "loss": 5.0456, + "lr": 0.0006211188811188811, + "step": 3210, + "tokens_trained": 0.305067744 + }, + { + "epoch": 0.9112056737588653, + "grad_norm": 0.9404990077018738, + "loss": 5.0357, + "lr": 0.0006208391608391608, + "step": 3212, + "tokens_trained": 0.305256552 + }, + { + "epoch": 0.9117730496453901, + "grad_norm": 0.8743602633476257, + "loss": 5.0564, + "lr": 0.0006205594405594406, + "step": 3214, + "tokens_trained": 0.305446568 + }, + { + "epoch": 0.9123404255319149, + "grad_norm": 0.9437069892883301, + "loss": 5.0703, + "lr": 0.0006202797202797203, + "step": 3216, + "tokens_trained": 0.305636344 + }, + { + "epoch": 0.9129078014184397, + "grad_norm": 0.970951497554779, + "loss": 5.0722, + "lr": 0.00062, + "step": 3218, + "tokens_trained": 0.305825936 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 0.9047942757606506, + "loss": 5.113, + "lr": 0.0006197202797202797, + "step": 3220, + "tokens_trained": 0.306016936 + }, + { + "epoch": 0.9140425531914894, + "grad_norm": 0.9751421213150024, + "loss": 5.0465, + "lr": 0.0006194405594405595, + "step": 3222, + "tokens_trained": 0.306207216 + }, + { + "epoch": 0.9146099290780142, + "grad_norm": 0.9317526817321777, + "loss": 5.0601, + "lr": 0.0006191608391608392, + "step": 3224, + "tokens_trained": 0.306396832 + }, + { + "epoch": 0.915177304964539, + "grad_norm": 0.9828630685806274, + "loss": 5.0857, + "lr": 0.0006188811188811189, + "step": 3226, + "tokens_trained": 0.30658724 + }, + { + "epoch": 0.9157446808510639, + "grad_norm": 0.9108901619911194, + "loss": 5.0525, + "lr": 0.0006186013986013986, + "step": 3228, + "tokens_trained": 0.30677856 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 0.8517162203788757, + "loss": 5.1157, + "lr": 0.0006183216783216783, + "step": 3230, + "tokens_trained": 0.3069698 + }, + { + "epoch": 0.9168794326241134, + "grad_norm": 0.9589570760726929, + "loss": 5.0823, + "lr": 0.0006180419580419581, + "step": 3232, + "tokens_trained": 0.307160952 + }, + { + "epoch": 0.9174468085106383, + "grad_norm": 1.0031661987304688, + "loss": 5.0808, + "lr": 0.0006177622377622377, + "step": 3234, + "tokens_trained": 0.307352776 + }, + { + "epoch": 0.9180141843971631, + "grad_norm": 0.9295787215232849, + "loss": 5.0699, + "lr": 0.0006174825174825175, + "step": 3236, + "tokens_trained": 0.3075432 + }, + { + "epoch": 0.918581560283688, + "grad_norm": 0.9967226982116699, + "loss": 5.0036, + "lr": 0.0006172027972027972, + "step": 3238, + "tokens_trained": 0.307735016 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 1.0219292640686035, + "loss": 5.1142, + "lr": 0.000616923076923077, + "step": 3240, + "tokens_trained": 0.307926624 + }, + { + "epoch": 0.9197163120567375, + "grad_norm": 1.0547230243682861, + "loss": 5.0429, + "lr": 0.0006166433566433567, + "step": 3242, + "tokens_trained": 0.30811696 + }, + { + "epoch": 0.9202836879432624, + "grad_norm": 1.0130624771118164, + "loss": 5.1345, + "lr": 0.0006163636363636364, + "step": 3244, + "tokens_trained": 0.30830848 + }, + { + "epoch": 0.9208510638297872, + "grad_norm": 0.8802092671394348, + "loss": 5.0404, + "lr": 0.0006160839160839161, + "step": 3246, + "tokens_trained": 0.308497688 + }, + { + "epoch": 0.9214184397163121, + "grad_norm": 0.970391571521759, + "loss": 5.0875, + "lr": 0.0006158041958041957, + "step": 3248, + "tokens_trained": 0.308686352 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 0.9314327239990234, + "loss": 5.0519, + "lr": 0.0006155244755244756, + "step": 3250, + "tokens_trained": 0.308875888 + }, + { + "epoch": 0.9219858156028369, + "eval_loss": 5.063432216644287, + "eval_runtime": 20.6963, + "step": 3250, + "tokens_trained": 0.308875888 + }, + { + "epoch": 0.9225531914893617, + "grad_norm": 0.875278890132904, + "loss": 4.9958, + "lr": 0.0006152447552447552, + "step": 3252, + "tokens_trained": 0.309068888 + }, + { + "epoch": 0.9231205673758865, + "grad_norm": 0.9115424156188965, + "loss": 4.9971, + "lr": 0.000614965034965035, + "step": 3254, + "tokens_trained": 0.309260656 + }, + { + "epoch": 0.9236879432624113, + "grad_norm": 0.9202569723129272, + "loss": 5.0103, + "lr": 0.0006146853146853147, + "step": 3256, + "tokens_trained": 0.309452672 + }, + { + "epoch": 0.9242553191489362, + "grad_norm": 0.9471083879470825, + "loss": 5.0429, + "lr": 0.0006144055944055945, + "step": 3258, + "tokens_trained": 0.30964252 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 0.9518803954124451, + "loss": 5.0143, + "lr": 0.0006141258741258742, + "step": 3260, + "tokens_trained": 0.309831288 + }, + { + "epoch": 0.9253900709219858, + "grad_norm": 0.9274792671203613, + "loss": 5.0121, + "lr": 0.0006138461538461538, + "step": 3262, + "tokens_trained": 0.310021056 + }, + { + "epoch": 0.9259574468085107, + "grad_norm": 0.9414265751838684, + "loss": 5.1362, + "lr": 0.0006135664335664336, + "step": 3264, + "tokens_trained": 0.310210328 + }, + { + "epoch": 0.9265248226950354, + "grad_norm": 0.968233048915863, + "loss": 4.9792, + "lr": 0.0006132867132867132, + "step": 3266, + "tokens_trained": 0.310399616 + }, + { + "epoch": 0.9270921985815603, + "grad_norm": 0.9223787784576416, + "loss": 5.0543, + "lr": 0.0006130069930069931, + "step": 3268, + "tokens_trained": 0.310588952 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 0.9317581653594971, + "loss": 5.0053, + "lr": 0.0006127272727272727, + "step": 3270, + "tokens_trained": 0.310779576 + }, + { + "epoch": 0.9282269503546099, + "grad_norm": 0.8910759687423706, + "loss": 5.1044, + "lr": 0.0006124475524475525, + "step": 3272, + "tokens_trained": 0.310970096 + }, + { + "epoch": 0.9287943262411348, + "grad_norm": 0.8903452157974243, + "loss": 5.093, + "lr": 0.0006121678321678322, + "step": 3274, + "tokens_trained": 0.311158808 + }, + { + "epoch": 0.9293617021276596, + "grad_norm": 0.9635697603225708, + "loss": 5.0149, + "lr": 0.0006118881118881118, + "step": 3276, + "tokens_trained": 0.311348672 + }, + { + "epoch": 0.9299290780141845, + "grad_norm": 1.0122349262237549, + "loss": 5.0353, + "lr": 0.0006116083916083917, + "step": 3278, + "tokens_trained": 0.31153696 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 0.9734505414962769, + "loss": 5.0531, + "lr": 0.0006113286713286713, + "step": 3280, + "tokens_trained": 0.311728288 + }, + { + "epoch": 0.931063829787234, + "grad_norm": 0.9433160424232483, + "loss": 5.0234, + "lr": 0.0006110489510489511, + "step": 3282, + "tokens_trained": 0.311917352 + }, + { + "epoch": 0.9316312056737589, + "grad_norm": 0.9984011054039001, + "loss": 5.0355, + "lr": 0.0006107692307692307, + "step": 3284, + "tokens_trained": 0.312108504 + }, + { + "epoch": 0.9321985815602837, + "grad_norm": 1.0186588764190674, + "loss": 4.9903, + "lr": 0.0006104895104895106, + "step": 3286, + "tokens_trained": 0.312300216 + }, + { + "epoch": 0.9327659574468085, + "grad_norm": 0.984987735748291, + "loss": 5.0188, + "lr": 0.0006102097902097902, + "step": 3288, + "tokens_trained": 0.312490928 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.9382873773574829, + "loss": 5.0223, + "lr": 0.0006099300699300699, + "step": 3290, + "tokens_trained": 0.312681928 + }, + { + "epoch": 0.9339007092198581, + "grad_norm": 0.8770633339881897, + "loss": 5.05, + "lr": 0.0006096503496503497, + "step": 3292, + "tokens_trained": 0.312870072 + }, + { + "epoch": 0.934468085106383, + "grad_norm": 0.9703201055526733, + "loss": 5.0905, + "lr": 0.0006093706293706293, + "step": 3294, + "tokens_trained": 0.313060608 + }, + { + "epoch": 0.9350354609929078, + "grad_norm": 0.9052334427833557, + "loss": 5.0416, + "lr": 0.0006090909090909092, + "step": 3296, + "tokens_trained": 0.313251584 + }, + { + "epoch": 0.9356028368794326, + "grad_norm": 0.949390709400177, + "loss": 4.9757, + "lr": 0.0006088111888111888, + "step": 3298, + "tokens_trained": 0.313440784 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.9845399260520935, + "loss": 5.0403, + "lr": 0.0006085314685314686, + "step": 3300, + "tokens_trained": 0.313631088 + }, + { + "epoch": 0.9367375886524822, + "grad_norm": 0.921394407749176, + "loss": 5.0464, + "lr": 0.0006082517482517482, + "step": 3302, + "tokens_trained": 0.313821704 + }, + { + "epoch": 0.9373049645390071, + "grad_norm": 0.9639559984207153, + "loss": 5.0658, + "lr": 0.000607972027972028, + "step": 3304, + "tokens_trained": 0.314011048 + }, + { + "epoch": 0.9378723404255319, + "grad_norm": 0.9921663403511047, + "loss": 5.0469, + "lr": 0.0006076923076923077, + "step": 3306, + "tokens_trained": 0.314199264 + }, + { + "epoch": 0.9384397163120567, + "grad_norm": 0.9891427159309387, + "loss": 5.0784, + "lr": 0.0006074125874125874, + "step": 3308, + "tokens_trained": 0.314388688 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 0.966525673866272, + "loss": 5.0759, + "lr": 0.0006071328671328672, + "step": 3310, + "tokens_trained": 0.31457712 + }, + { + "epoch": 0.9395744680851064, + "grad_norm": 0.9262145757675171, + "loss": 4.9822, + "lr": 0.0006068531468531468, + "step": 3312, + "tokens_trained": 0.314768096 + }, + { + "epoch": 0.9401418439716313, + "grad_norm": 0.9138565063476562, + "loss": 5.059, + "lr": 0.0006065734265734267, + "step": 3314, + "tokens_trained": 0.314959 + }, + { + "epoch": 0.940709219858156, + "grad_norm": 0.9083120226860046, + "loss": 5.0523, + "lr": 0.0006062937062937063, + "step": 3316, + "tokens_trained": 0.315148288 + }, + { + "epoch": 0.9412765957446808, + "grad_norm": 0.9483383893966675, + "loss": 5.0821, + "lr": 0.000606013986013986, + "step": 3318, + "tokens_trained": 0.31533864 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 0.8716344833374023, + "loss": 5.1046, + "lr": 0.0006057342657342657, + "step": 3320, + "tokens_trained": 0.31552972 + }, + { + "epoch": 0.9424113475177305, + "grad_norm": 0.9287091493606567, + "loss": 5.0531, + "lr": 0.0006054545454545455, + "step": 3322, + "tokens_trained": 0.315720136 + }, + { + "epoch": 0.9429787234042554, + "grad_norm": 0.9560433030128479, + "loss": 5.087, + "lr": 0.0006051748251748252, + "step": 3324, + "tokens_trained": 0.315911024 + }, + { + "epoch": 0.9435460992907801, + "grad_norm": 0.8612940311431885, + "loss": 5.1338, + "lr": 0.0006048951048951049, + "step": 3326, + "tokens_trained": 0.316102368 + }, + { + "epoch": 0.9441134751773049, + "grad_norm": 1.0215116739273071, + "loss": 5.034, + "lr": 0.0006046153846153846, + "step": 3328, + "tokens_trained": 0.316292296 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 1.0019500255584717, + "loss": 5.101, + "lr": 0.0006043356643356643, + "step": 3330, + "tokens_trained": 0.31648088 + }, + { + "epoch": 0.9452482269503546, + "grad_norm": 0.9435174465179443, + "loss": 5.0476, + "lr": 0.0006040559440559441, + "step": 3332, + "tokens_trained": 0.316672936 + }, + { + "epoch": 0.9458156028368795, + "grad_norm": 0.9211596846580505, + "loss": 5.039, + "lr": 0.0006037762237762238, + "step": 3334, + "tokens_trained": 0.31686408 + }, + { + "epoch": 0.9463829787234043, + "grad_norm": 0.9332453608512878, + "loss": 5.0857, + "lr": 0.0006034965034965035, + "step": 3336, + "tokens_trained": 0.317053896 + }, + { + "epoch": 0.946950354609929, + "grad_norm": 0.8761624097824097, + "loss": 5.0614, + "lr": 0.0006032167832167832, + "step": 3338, + "tokens_trained": 0.317245016 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 0.9113523364067078, + "loss": 5.0609, + "lr": 0.000602937062937063, + "step": 3340, + "tokens_trained": 0.317433592 + }, + { + "epoch": 0.9480851063829787, + "grad_norm": 1.0509337186813354, + "loss": 4.9984, + "lr": 0.0006026573426573426, + "step": 3342, + "tokens_trained": 0.317623392 + }, + { + "epoch": 0.9486524822695035, + "grad_norm": 0.9496453404426575, + "loss": 5.0632, + "lr": 0.0006023776223776224, + "step": 3344, + "tokens_trained": 0.317814848 + }, + { + "epoch": 0.9492198581560284, + "grad_norm": 0.913977861404419, + "loss": 5.0816, + "lr": 0.0006020979020979021, + "step": 3346, + "tokens_trained": 0.318003232 + }, + { + "epoch": 0.9497872340425532, + "grad_norm": 0.9476690292358398, + "loss": 5.1321, + "lr": 0.0006018181818181818, + "step": 3348, + "tokens_trained": 0.31819216 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 1.0221197605133057, + "loss": 5.0602, + "lr": 0.0006015384615384616, + "step": 3350, + "tokens_trained": 0.318379648 + }, + { + "epoch": 0.9509219858156028, + "grad_norm": 0.9944773316383362, + "loss": 5.0595, + "lr": 0.0006012587412587413, + "step": 3352, + "tokens_trained": 0.3185692 + }, + { + "epoch": 0.9514893617021276, + "grad_norm": 0.9641481041908264, + "loss": 5.0842, + "lr": 0.000600979020979021, + "step": 3354, + "tokens_trained": 0.318758464 + }, + { + "epoch": 0.9520567375886525, + "grad_norm": 0.8794710636138916, + "loss": 5.0385, + "lr": 0.0006006993006993006, + "step": 3356, + "tokens_trained": 0.318948528 + }, + { + "epoch": 0.9526241134751773, + "grad_norm": 0.9986928701400757, + "loss": 5.0325, + "lr": 0.0006004195804195805, + "step": 3358, + "tokens_trained": 0.319137168 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.9385401606559753, + "loss": 4.9957, + "lr": 0.0006001398601398601, + "step": 3360, + "tokens_trained": 0.319327992 + }, + { + "epoch": 0.953758865248227, + "grad_norm": 0.9591023921966553, + "loss": 5.0883, + "lr": 0.0005998601398601399, + "step": 3362, + "tokens_trained": 0.319518928 + }, + { + "epoch": 0.9543262411347517, + "grad_norm": 0.9454349279403687, + "loss": 5.0639, + "lr": 0.0005995804195804196, + "step": 3364, + "tokens_trained": 0.319711176 + }, + { + "epoch": 0.9548936170212766, + "grad_norm": 0.9882696270942688, + "loss": 5.0326, + "lr": 0.0005993006993006993, + "step": 3366, + "tokens_trained": 0.319901272 + }, + { + "epoch": 0.9554609929078014, + "grad_norm": 0.9254516959190369, + "loss": 5.0454, + "lr": 0.0005990209790209791, + "step": 3368, + "tokens_trained": 0.320091928 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 0.9193766117095947, + "loss": 4.9996, + "lr": 0.0005987412587412587, + "step": 3370, + "tokens_trained": 0.320282712 + }, + { + "epoch": 0.9565957446808511, + "grad_norm": 0.9373677372932434, + "loss": 5.1228, + "lr": 0.0005984615384615385, + "step": 3372, + "tokens_trained": 0.320472168 + }, + { + "epoch": 0.9571631205673758, + "grad_norm": 0.9842008352279663, + "loss": 5.0338, + "lr": 0.0005981818181818181, + "step": 3374, + "tokens_trained": 0.320662592 + }, + { + "epoch": 0.9574468085106383, + "eval_loss": 5.064303398132324, + "eval_runtime": 20.617, + "step": 3375, + "tokens_trained": 0.320758504 + }, + { + "epoch": 0.9577304964539007, + "grad_norm": 0.9934602379798889, + "loss": 5.0444, + "lr": 0.000597902097902098, + "step": 3376, + "tokens_trained": 0.320853552 + }, + { + "epoch": 0.9582978723404255, + "grad_norm": 0.9192136526107788, + "loss": 5.0502, + "lr": 0.0005976223776223776, + "step": 3378, + "tokens_trained": 0.321043072 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 0.9416385293006897, + "loss": 5.0676, + "lr": 0.0005973426573426574, + "step": 3380, + "tokens_trained": 0.321234024 + }, + { + "epoch": 0.9594326241134752, + "grad_norm": 0.87016761302948, + "loss": 5.0474, + "lr": 0.0005970629370629371, + "step": 3382, + "tokens_trained": 0.321423504 + }, + { + "epoch": 0.96, + "grad_norm": 0.9421593546867371, + "loss": 5.0148, + "lr": 0.0005967832167832167, + "step": 3384, + "tokens_trained": 0.32161436 + }, + { + "epoch": 0.9605673758865249, + "grad_norm": 0.9040830135345459, + "loss": 5.0065, + "lr": 0.0005965034965034966, + "step": 3386, + "tokens_trained": 0.321804688 + }, + { + "epoch": 0.9611347517730496, + "grad_norm": 0.9497122764587402, + "loss": 5.0882, + "lr": 0.0005962237762237762, + "step": 3388, + "tokens_trained": 0.321994728 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 0.9700525999069214, + "loss": 5.0462, + "lr": 0.000595944055944056, + "step": 3390, + "tokens_trained": 0.322186 + }, + { + "epoch": 0.9622695035460993, + "grad_norm": 0.9304616451263428, + "loss": 5.0781, + "lr": 0.0005956643356643356, + "step": 3392, + "tokens_trained": 0.322376408 + }, + { + "epoch": 0.9628368794326241, + "grad_norm": 0.8804431557655334, + "loss": 5.1449, + "lr": 0.0005953846153846155, + "step": 3394, + "tokens_trained": 0.322566024 + }, + { + "epoch": 0.963404255319149, + "grad_norm": 0.8852412700653076, + "loss": 5.0602, + "lr": 0.0005951048951048951, + "step": 3396, + "tokens_trained": 0.322758272 + }, + { + "epoch": 0.9639716312056738, + "grad_norm": 1.015409231185913, + "loss": 5.0753, + "lr": 0.0005948251748251748, + "step": 3398, + "tokens_trained": 0.322948904 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 0.9504172205924988, + "loss": 5.1003, + "lr": 0.0005945454545454546, + "step": 3400, + "tokens_trained": 0.323140944 + }, + { + "epoch": 0.9651063829787234, + "grad_norm": 0.8708069920539856, + "loss": 5.0316, + "lr": 0.0005942657342657342, + "step": 3402, + "tokens_trained": 0.323331864 + }, + { + "epoch": 0.9656737588652482, + "grad_norm": 0.8804038166999817, + "loss": 5.038, + "lr": 0.0005939860139860141, + "step": 3404, + "tokens_trained": 0.323521296 + }, + { + "epoch": 0.9662411347517731, + "grad_norm": 0.901345431804657, + "loss": 5.1247, + "lr": 0.0005937062937062937, + "step": 3406, + "tokens_trained": 0.323713464 + }, + { + "epoch": 0.9668085106382979, + "grad_norm": 0.8839131593704224, + "loss": 5.058, + "lr": 0.0005934265734265735, + "step": 3408, + "tokens_trained": 0.323903208 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 0.9157027006149292, + "loss": 5.007, + "lr": 0.0005931468531468531, + "step": 3410, + "tokens_trained": 0.324091224 + }, + { + "epoch": 0.9679432624113475, + "grad_norm": 0.9776141047477722, + "loss": 5.0244, + "lr": 0.000592867132867133, + "step": 3412, + "tokens_trained": 0.324281696 + }, + { + "epoch": 0.9685106382978723, + "grad_norm": 0.8768822550773621, + "loss": 5.0321, + "lr": 0.0005925874125874126, + "step": 3414, + "tokens_trained": 0.324471136 + }, + { + "epoch": 0.9690780141843972, + "grad_norm": 0.7926638722419739, + "loss": 5.0433, + "lr": 0.0005923076923076923, + "step": 3416, + "tokens_trained": 0.324661816 + }, + { + "epoch": 0.969645390070922, + "grad_norm": 0.8630013465881348, + "loss": 5.0876, + "lr": 0.0005920279720279721, + "step": 3418, + "tokens_trained": 0.324852152 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.8769309520721436, + "loss": 5.0611, + "lr": 0.0005917482517482517, + "step": 3420, + "tokens_trained": 0.325042144 + }, + { + "epoch": 0.9707801418439717, + "grad_norm": 0.8933807611465454, + "loss": 5.0288, + "lr": 0.0005914685314685316, + "step": 3422, + "tokens_trained": 0.325232376 + }, + { + "epoch": 0.9713475177304964, + "grad_norm": 0.9544175267219543, + "loss": 5.0038, + "lr": 0.0005911888111888112, + "step": 3424, + "tokens_trained": 0.325423784 + }, + { + "epoch": 0.9719148936170213, + "grad_norm": 0.9057655930519104, + "loss": 5.0613, + "lr": 0.0005909090909090909, + "step": 3426, + "tokens_trained": 0.325614744 + }, + { + "epoch": 0.9724822695035461, + "grad_norm": 0.8956878781318665, + "loss": 5.0327, + "lr": 0.0005906293706293706, + "step": 3428, + "tokens_trained": 0.325803144 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 0.8879814147949219, + "loss": 5.0018, + "lr": 0.0005903496503496504, + "step": 3430, + "tokens_trained": 0.3259944 + }, + { + "epoch": 0.9736170212765958, + "grad_norm": 0.8801882863044739, + "loss": 5.125, + "lr": 0.0005900699300699301, + "step": 3432, + "tokens_trained": 0.326185928 + }, + { + "epoch": 0.9741843971631206, + "grad_norm": 0.8586528301239014, + "loss": 5.0299, + "lr": 0.0005897902097902098, + "step": 3434, + "tokens_trained": 0.326378416 + }, + { + "epoch": 0.9747517730496454, + "grad_norm": 0.8574861884117126, + "loss": 5.0743, + "lr": 0.0005895104895104896, + "step": 3436, + "tokens_trained": 0.326569616 + }, + { + "epoch": 0.9753191489361702, + "grad_norm": 0.8478572368621826, + "loss": 5.0547, + "lr": 0.0005892307692307692, + "step": 3438, + "tokens_trained": 0.326759744 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 0.8645881414413452, + "loss": 5.0504, + "lr": 0.000588951048951049, + "step": 3440, + "tokens_trained": 0.3269478 + }, + { + "epoch": 0.9764539007092199, + "grad_norm": 0.8346559405326843, + "loss": 5.0472, + "lr": 0.0005886713286713287, + "step": 3442, + "tokens_trained": 0.32714012 + }, + { + "epoch": 0.9770212765957447, + "grad_norm": 0.8666026592254639, + "loss": 5.0557, + "lr": 0.0005883916083916084, + "step": 3444, + "tokens_trained": 0.327329992 + }, + { + "epoch": 0.9775886524822694, + "grad_norm": 0.9243910312652588, + "loss": 5.0326, + "lr": 0.0005881118881118881, + "step": 3446, + "tokens_trained": 0.327520664 + }, + { + "epoch": 0.9781560283687943, + "grad_norm": 0.8909792304039001, + "loss": 5.0948, + "lr": 0.0005878321678321679, + "step": 3448, + "tokens_trained": 0.327712056 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 0.8586627840995789, + "loss": 5.0587, + "lr": 0.0005875524475524476, + "step": 3450, + "tokens_trained": 0.327903456 + }, + { + "epoch": 0.979290780141844, + "grad_norm": 0.9551260471343994, + "loss": 5.0493, + "lr": 0.0005872727272727273, + "step": 3452, + "tokens_trained": 0.328093768 + }, + { + "epoch": 0.9798581560283688, + "grad_norm": 0.8501218557357788, + "loss": 5.0725, + "lr": 0.0005869930069930069, + "step": 3454, + "tokens_trained": 0.328281896 + }, + { + "epoch": 0.9804255319148936, + "grad_norm": 0.8573510646820068, + "loss": 5.057, + "lr": 0.0005867132867132867, + "step": 3456, + "tokens_trained": 0.32847448 + }, + { + "epoch": 0.9809929078014185, + "grad_norm": 0.8716034889221191, + "loss": 4.9833, + "lr": 0.0005864335664335665, + "step": 3458, + "tokens_trained": 0.328661304 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 0.8251221179962158, + "loss": 5.0059, + "lr": 0.0005861538461538462, + "step": 3460, + "tokens_trained": 0.328850496 + }, + { + "epoch": 0.9821276595744681, + "grad_norm": 0.8577293753623962, + "loss": 5.0385, + "lr": 0.0005858741258741259, + "step": 3462, + "tokens_trained": 0.329039896 + }, + { + "epoch": 0.9826950354609929, + "grad_norm": 0.9229962825775146, + "loss": 5.0115, + "lr": 0.0005855944055944055, + "step": 3464, + "tokens_trained": 0.329230472 + }, + { + "epoch": 0.9832624113475177, + "grad_norm": 0.8700546622276306, + "loss": 5.0319, + "lr": 0.0005853146853146854, + "step": 3466, + "tokens_trained": 0.32941888 + }, + { + "epoch": 0.9838297872340426, + "grad_norm": 0.8610907196998596, + "loss": 5.0327, + "lr": 0.000585034965034965, + "step": 3468, + "tokens_trained": 0.329611152 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 0.7971277236938477, + "loss": 5.0813, + "lr": 0.0005847552447552448, + "step": 3470, + "tokens_trained": 0.329800024 + }, + { + "epoch": 0.9849645390070922, + "grad_norm": 0.9169178009033203, + "loss": 4.9764, + "lr": 0.0005844755244755244, + "step": 3472, + "tokens_trained": 0.329991688 + }, + { + "epoch": 0.985531914893617, + "grad_norm": 0.9630699157714844, + "loss": 5.0263, + "lr": 0.0005841958041958042, + "step": 3474, + "tokens_trained": 0.33018312 + }, + { + "epoch": 0.9860992907801418, + "grad_norm": 0.9706154465675354, + "loss": 4.9928, + "lr": 0.000583916083916084, + "step": 3476, + "tokens_trained": 0.330372336 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.9754578471183777, + "loss": 5.0122, + "lr": 0.0005836363636363636, + "step": 3478, + "tokens_trained": 0.330564472 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.9906936287879944, + "loss": 5.0495, + "lr": 0.0005833566433566434, + "step": 3480, + "tokens_trained": 0.3307554 + }, + { + "epoch": 0.9878014184397164, + "grad_norm": 0.9739910960197449, + "loss": 4.9801, + "lr": 0.000583076923076923, + "step": 3482, + "tokens_trained": 0.330944608 + }, + { + "epoch": 0.9883687943262411, + "grad_norm": 1.0058059692382812, + "loss": 5.0974, + "lr": 0.0005827972027972029, + "step": 3484, + "tokens_trained": 0.331134752 + }, + { + "epoch": 0.9889361702127659, + "grad_norm": 1.0330032110214233, + "loss": 5.1054, + "lr": 0.0005825174825174825, + "step": 3486, + "tokens_trained": 0.331323744 + }, + { + "epoch": 0.9895035460992908, + "grad_norm": 0.9857019186019897, + "loss": 5.0417, + "lr": 0.0005822377622377623, + "step": 3488, + "tokens_trained": 0.33151316 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 0.8929789066314697, + "loss": 5.0753, + "lr": 0.0005819580419580419, + "step": 3490, + "tokens_trained": 0.331703136 + }, + { + "epoch": 0.9906382978723405, + "grad_norm": 0.9803673624992371, + "loss": 5.0748, + "lr": 0.0005816783216783216, + "step": 3492, + "tokens_trained": 0.331894376 + }, + { + "epoch": 0.9912056737588653, + "grad_norm": 1.0658507347106934, + "loss": 4.952, + "lr": 0.0005813986013986015, + "step": 3494, + "tokens_trained": 0.33208472 + }, + { + "epoch": 0.99177304964539, + "grad_norm": 0.9646208882331848, + "loss": 5.0638, + "lr": 0.0005811188811188811, + "step": 3496, + "tokens_trained": 0.332274704 + }, + { + "epoch": 0.9923404255319149, + "grad_norm": 0.9479737281799316, + "loss": 4.9608, + "lr": 0.0005808391608391609, + "step": 3498, + "tokens_trained": 0.332464656 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 0.9422057867050171, + "loss": 4.9805, + "lr": 0.0005805594405594405, + "step": 3500, + "tokens_trained": 0.332653056 + }, + { + "epoch": 0.9929078014184397, + "eval_loss": 5.051141738891602, + "eval_runtime": 20.5005, + "step": 3500, + "tokens_trained": 0.332653056 + }, + { + "epoch": 0.9934751773049645, + "grad_norm": 0.8606115579605103, + "loss": 5.014, + "lr": 0.0005802797202797204, + "step": 3502, + "tokens_trained": 0.33284184 + }, + { + "epoch": 0.9940425531914894, + "grad_norm": 0.9218055009841919, + "loss": 5.015, + "lr": 0.00058, + "step": 3504, + "tokens_trained": 0.333031504 + }, + { + "epoch": 0.9946099290780142, + "grad_norm": 0.8346299529075623, + "loss": 5.0793, + "lr": 0.0005797202797202797, + "step": 3506, + "tokens_trained": 0.333222184 + }, + { + "epoch": 0.995177304964539, + "grad_norm": 0.9426013231277466, + "loss": 5.0416, + "lr": 0.0005794405594405594, + "step": 3508, + "tokens_trained": 0.333413 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 0.973893940448761, + "loss": 5.0579, + "lr": 0.0005791608391608391, + "step": 3510, + "tokens_trained": 0.333602176 + }, + { + "epoch": 0.9963120567375886, + "grad_norm": 0.9642478823661804, + "loss": 5.1078, + "lr": 0.000578881118881119, + "step": 3512, + "tokens_trained": 0.333792992 + }, + { + "epoch": 0.9968794326241135, + "grad_norm": 0.9709126949310303, + "loss": 5.0379, + "lr": 0.0005786013986013986, + "step": 3514, + "tokens_trained": 0.333982568 + }, + { + "epoch": 0.9974468085106383, + "grad_norm": 0.9238979816436768, + "loss": 5.0391, + "lr": 0.0005783216783216784, + "step": 3516, + "tokens_trained": 0.334171688 + }, + { + "epoch": 0.9980141843971632, + "grad_norm": 0.884200930595398, + "loss": 5.0402, + "lr": 0.000578041958041958, + "step": 3518, + "tokens_trained": 0.334361968 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 0.9557647705078125, + "loss": 5.0816, + "lr": 0.0005777622377622377, + "step": 3520, + "tokens_trained": 0.3345518 + }, + { + "epoch": 0.9991489361702127, + "grad_norm": 0.963729202747345, + "loss": 5.0047, + "lr": 0.0005774825174825175, + "step": 3522, + "tokens_trained": 0.334743104 + }, + { + "epoch": 0.9997163120567376, + "grad_norm": 0.8432028889656067, + "loss": 5.0323, + "lr": 0.0005772027972027972, + "step": 3524, + "tokens_trained": 0.334932336 + }, + { + "epoch": 1.0002836879432624, + "grad_norm": 0.9493524432182312, + "loss": 5.0686, + "lr": 0.0005769230769230769, + "step": 3526, + "tokens_trained": 0.335119632 + }, + { + "epoch": 1.0008510638297872, + "grad_norm": 0.8715479969978333, + "loss": 4.9798, + "lr": 0.0005766433566433566, + "step": 3528, + "tokens_trained": 0.335308752 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 0.9414225816726685, + "loss": 5.0294, + "lr": 0.0005763636363636365, + "step": 3530, + "tokens_trained": 0.335499976 + }, + { + "epoch": 1.001985815602837, + "grad_norm": 0.9580470323562622, + "loss": 5.0097, + "lr": 0.0005760839160839161, + "step": 3532, + "tokens_trained": 0.335687704 + }, + { + "epoch": 1.0025531914893617, + "grad_norm": 0.8775055408477783, + "loss": 5.047, + "lr": 0.0005758041958041958, + "step": 3534, + "tokens_trained": 0.335877328 + }, + { + "epoch": 1.0031205673758865, + "grad_norm": 0.8149566054344177, + "loss": 5.0598, + "lr": 0.0005755244755244755, + "step": 3536, + "tokens_trained": 0.33606848 + }, + { + "epoch": 1.0036879432624113, + "grad_norm": 0.8992729783058167, + "loss": 4.9875, + "lr": 0.0005752447552447552, + "step": 3538, + "tokens_trained": 0.336259808 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.913520872592926, + "loss": 5.0254, + "lr": 0.000574965034965035, + "step": 3540, + "tokens_trained": 0.336449872 + }, + { + "epoch": 1.004822695035461, + "grad_norm": 0.9528400301933289, + "loss": 4.9949, + "lr": 0.0005746853146853147, + "step": 3542, + "tokens_trained": 0.336640192 + }, + { + "epoch": 1.0053900709219858, + "grad_norm": 0.933910071849823, + "loss": 5.0776, + "lr": 0.0005744055944055944, + "step": 3544, + "tokens_trained": 0.336829088 + }, + { + "epoch": 1.0059574468085106, + "grad_norm": 0.9097418785095215, + "loss": 5.0021, + "lr": 0.0005741258741258741, + "step": 3546, + "tokens_trained": 0.337021576 + }, + { + "epoch": 1.0065248226950354, + "grad_norm": 0.8718441724777222, + "loss": 5.0946, + "lr": 0.000573846153846154, + "step": 3548, + "tokens_trained": 0.337210208 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 0.887489378452301, + "loss": 4.9686, + "lr": 0.0005735664335664336, + "step": 3550, + "tokens_trained": 0.337401176 + }, + { + "epoch": 1.0076595744680852, + "grad_norm": 0.8851242065429688, + "loss": 5.0423, + "lr": 0.0005732867132867133, + "step": 3552, + "tokens_trained": 0.337589216 + }, + { + "epoch": 1.00822695035461, + "grad_norm": 0.8381972312927246, + "loss": 5.0645, + "lr": 0.000573006993006993, + "step": 3554, + "tokens_trained": 0.337777424 + }, + { + "epoch": 1.0087943262411347, + "grad_norm": 0.8307796716690063, + "loss": 5.036, + "lr": 0.0005727272727272727, + "step": 3556, + "tokens_trained": 0.337967088 + }, + { + "epoch": 1.0093617021276595, + "grad_norm": 0.9271431565284729, + "loss": 5.0384, + "lr": 0.0005724475524475525, + "step": 3558, + "tokens_trained": 0.33815904 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 0.9501886367797852, + "loss": 5.0929, + "lr": 0.0005721678321678322, + "step": 3560, + "tokens_trained": 0.338349184 + }, + { + "epoch": 1.0104964539007093, + "grad_norm": 0.9176658987998962, + "loss": 5.0721, + "lr": 0.0005718881118881118, + "step": 3562, + "tokens_trained": 0.338539664 + }, + { + "epoch": 1.011063829787234, + "grad_norm": 0.8755439519882202, + "loss": 5.0864, + "lr": 0.0005716083916083916, + "step": 3564, + "tokens_trained": 0.33872792 + }, + { + "epoch": 1.0116312056737589, + "grad_norm": 0.9178908467292786, + "loss": 5.035, + "lr": 0.0005713286713286714, + "step": 3566, + "tokens_trained": 0.33891592 + }, + { + "epoch": 1.0121985815602836, + "grad_norm": 0.9046779870986938, + "loss": 5.0286, + "lr": 0.0005710489510489511, + "step": 3568, + "tokens_trained": 0.3391062 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 0.8680547475814819, + "loss": 5.036, + "lr": 0.0005707692307692308, + "step": 3570, + "tokens_trained": 0.339295896 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 0.8271722793579102, + "loss": 5.0438, + "lr": 0.0005704895104895105, + "step": 3572, + "tokens_trained": 0.339487368 + }, + { + "epoch": 1.0139007092198582, + "grad_norm": 0.8582717180252075, + "loss": 5.1501, + "lr": 0.0005702097902097902, + "step": 3574, + "tokens_trained": 0.339678792 + }, + { + "epoch": 1.014468085106383, + "grad_norm": 0.9433448314666748, + "loss": 5.0575, + "lr": 0.0005699300699300699, + "step": 3576, + "tokens_trained": 0.33987056 + }, + { + "epoch": 1.0150354609929078, + "grad_norm": 0.8291800022125244, + "loss": 5.0284, + "lr": 0.0005696503496503497, + "step": 3578, + "tokens_trained": 0.340059304 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 0.8057491183280945, + "loss": 5.0737, + "lr": 0.0005693706293706293, + "step": 3580, + "tokens_trained": 0.34024912 + }, + { + "epoch": 1.0161702127659575, + "grad_norm": 0.8925788998603821, + "loss": 5.017, + "lr": 0.0005690909090909091, + "step": 3582, + "tokens_trained": 0.340439688 + }, + { + "epoch": 1.0167375886524823, + "grad_norm": 0.8613091707229614, + "loss": 5.0778, + "lr": 0.0005688111888111889, + "step": 3584, + "tokens_trained": 0.34063064 + }, + { + "epoch": 1.017304964539007, + "grad_norm": 0.9694734811782837, + "loss": 5.0831, + "lr": 0.0005685314685314686, + "step": 3586, + "tokens_trained": 0.340820944 + }, + { + "epoch": 1.0178723404255319, + "grad_norm": 0.9405204653739929, + "loss": 5.0819, + "lr": 0.0005682517482517483, + "step": 3588, + "tokens_trained": 0.341008368 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 0.9191365838050842, + "loss": 5.016, + "lr": 0.0005679720279720279, + "step": 3590, + "tokens_trained": 0.341198984 + }, + { + "epoch": 1.0190070921985817, + "grad_norm": 0.9363374710083008, + "loss": 5.0432, + "lr": 0.0005676923076923077, + "step": 3592, + "tokens_trained": 0.341391808 + }, + { + "epoch": 1.0195744680851064, + "grad_norm": 0.9394513368606567, + "loss": 5.0159, + "lr": 0.0005674125874125874, + "step": 3594, + "tokens_trained": 0.34158276 + }, + { + "epoch": 1.0201418439716312, + "grad_norm": 0.8832948803901672, + "loss": 5.1156, + "lr": 0.0005671328671328672, + "step": 3596, + "tokens_trained": 0.341772296 + }, + { + "epoch": 1.020709219858156, + "grad_norm": 0.8347297310829163, + "loss": 5.0666, + "lr": 0.0005668531468531468, + "step": 3598, + "tokens_trained": 0.341959528 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.8295504450798035, + "loss": 5.0179, + "lr": 0.0005665734265734265, + "step": 3600, + "tokens_trained": 0.342150464 + }, + { + "epoch": 1.0218439716312058, + "grad_norm": 0.9434390068054199, + "loss": 5.0127, + "lr": 0.0005662937062937064, + "step": 3602, + "tokens_trained": 0.342339448 + }, + { + "epoch": 1.0224113475177306, + "grad_norm": 0.9653499722480774, + "loss": 5.0665, + "lr": 0.000566013986013986, + "step": 3604, + "tokens_trained": 0.342530488 + }, + { + "epoch": 1.0229787234042553, + "grad_norm": 0.8737668991088867, + "loss": 5.0718, + "lr": 0.0005657342657342658, + "step": 3606, + "tokens_trained": 0.342719696 + }, + { + "epoch": 1.02354609929078, + "grad_norm": 0.8800668716430664, + "loss": 5.0302, + "lr": 0.0005654545454545454, + "step": 3608, + "tokens_trained": 0.342909824 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 0.904245913028717, + "loss": 5.0692, + "lr": 0.0005651748251748252, + "step": 3610, + "tokens_trained": 0.343098816 + }, + { + "epoch": 1.02468085106383, + "grad_norm": 0.8640607595443726, + "loss": 5.0146, + "lr": 0.0005648951048951049, + "step": 3612, + "tokens_trained": 0.343288344 + }, + { + "epoch": 1.0252482269503547, + "grad_norm": 0.9330228567123413, + "loss": 5.0123, + "lr": 0.0005646153846153847, + "step": 3614, + "tokens_trained": 0.34347712 + }, + { + "epoch": 1.0258156028368794, + "grad_norm": 0.8850971460342407, + "loss": 5.0718, + "lr": 0.0005643356643356643, + "step": 3616, + "tokens_trained": 0.343666264 + }, + { + "epoch": 1.0263829787234042, + "grad_norm": 0.9091493487358093, + "loss": 5.0508, + "lr": 0.000564055944055944, + "step": 3618, + "tokens_trained": 0.343854008 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 0.8939360976219177, + "loss": 5.0492, + "lr": 0.0005637762237762239, + "step": 3620, + "tokens_trained": 0.344046368 + }, + { + "epoch": 1.027517730496454, + "grad_norm": 0.9629043340682983, + "loss": 5.0234, + "lr": 0.0005634965034965035, + "step": 3622, + "tokens_trained": 0.344236592 + }, + { + "epoch": 1.0280851063829788, + "grad_norm": 0.955611526966095, + "loss": 4.9878, + "lr": 0.0005632167832167833, + "step": 3624, + "tokens_trained": 0.344425704 + }, + { + "epoch": 1.0283687943262412, + "eval_loss": 5.0450639724731445, + "eval_runtime": 20.6963, + "step": 3625, + "tokens_trained": 0.344518808 + }, + { + "epoch": 1.0286524822695036, + "grad_norm": 0.9501426815986633, + "loss": 5.0039, + "lr": 0.0005629370629370629, + "step": 3626, + "tokens_trained": 0.344612688 + }, + { + "epoch": 1.0292198581560283, + "grad_norm": 0.9446471333503723, + "loss": 5.0306, + "lr": 0.0005626573426573426, + "step": 3628, + "tokens_trained": 0.344802448 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 0.9773867726325989, + "loss": 5.0016, + "lr": 0.0005623776223776224, + "step": 3630, + "tokens_trained": 0.344992872 + }, + { + "epoch": 1.030354609929078, + "grad_norm": 0.8802851438522339, + "loss": 5.0263, + "lr": 0.0005620979020979021, + "step": 3632, + "tokens_trained": 0.345182064 + }, + { + "epoch": 1.030921985815603, + "grad_norm": 0.9009132385253906, + "loss": 4.9681, + "lr": 0.0005618181818181818, + "step": 3634, + "tokens_trained": 0.345372888 + }, + { + "epoch": 1.0314893617021277, + "grad_norm": 0.9252756834030151, + "loss": 4.9491, + "lr": 0.0005615384615384615, + "step": 3636, + "tokens_trained": 0.345563088 + }, + { + "epoch": 1.0320567375886525, + "grad_norm": 0.9195572733879089, + "loss": 5.0525, + "lr": 0.0005612587412587414, + "step": 3638, + "tokens_trained": 0.345753928 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 0.8032271862030029, + "loss": 5.0535, + "lr": 0.000560979020979021, + "step": 3640, + "tokens_trained": 0.345945664 + }, + { + "epoch": 1.033191489361702, + "grad_norm": 0.7840321660041809, + "loss": 4.9713, + "lr": 0.0005606993006993008, + "step": 3642, + "tokens_trained": 0.346134096 + }, + { + "epoch": 1.033758865248227, + "grad_norm": 0.8394534587860107, + "loss": 5.0695, + "lr": 0.0005604195804195804, + "step": 3644, + "tokens_trained": 0.346325368 + }, + { + "epoch": 1.0343262411347518, + "grad_norm": 0.8543218374252319, + "loss": 4.9826, + "lr": 0.0005601398601398601, + "step": 3646, + "tokens_trained": 0.346515088 + }, + { + "epoch": 1.0348936170212766, + "grad_norm": 0.8483793139457703, + "loss": 4.9956, + "lr": 0.0005598601398601399, + "step": 3648, + "tokens_trained": 0.346705304 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 0.8377392888069153, + "loss": 4.9123, + "lr": 0.0005595804195804196, + "step": 3650, + "tokens_trained": 0.34689744 + }, + { + "epoch": 1.0360283687943261, + "grad_norm": 0.902778685092926, + "loss": 5.0771, + "lr": 0.0005593006993006993, + "step": 3652, + "tokens_trained": 0.347086984 + }, + { + "epoch": 1.0365957446808511, + "grad_norm": 0.915446937084198, + "loss": 5.0235, + "lr": 0.000559020979020979, + "step": 3654, + "tokens_trained": 0.347278816 + }, + { + "epoch": 1.037163120567376, + "grad_norm": 0.803059458732605, + "loss": 5.0255, + "lr": 0.0005587412587412589, + "step": 3656, + "tokens_trained": 0.347468136 + }, + { + "epoch": 1.0377304964539007, + "grad_norm": 0.9930711984634399, + "loss": 5.0759, + "lr": 0.0005584615384615385, + "step": 3658, + "tokens_trained": 0.347659624 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.9266470670700073, + "loss": 5.0732, + "lr": 0.0005581818181818182, + "step": 3660, + "tokens_trained": 0.347848536 + }, + { + "epoch": 1.0388652482269503, + "grad_norm": 0.8442680835723877, + "loss": 5.0594, + "lr": 0.0005579020979020979, + "step": 3662, + "tokens_trained": 0.348039968 + }, + { + "epoch": 1.0394326241134753, + "grad_norm": 0.8922600746154785, + "loss": 4.9876, + "lr": 0.0005576223776223776, + "step": 3664, + "tokens_trained": 0.348229432 + }, + { + "epoch": 1.04, + "grad_norm": 0.8602802753448486, + "loss": 5.0332, + "lr": 0.0005573426573426574, + "step": 3666, + "tokens_trained": 0.348420184 + }, + { + "epoch": 1.0405673758865248, + "grad_norm": 0.8762813806533813, + "loss": 4.9641, + "lr": 0.0005570629370629371, + "step": 3668, + "tokens_trained": 0.348609504 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 0.8674803972244263, + "loss": 5.0701, + "lr": 0.0005567832167832167, + "step": 3670, + "tokens_trained": 0.348799384 + }, + { + "epoch": 1.0417021276595744, + "grad_norm": 0.8296146988868713, + "loss": 5.0432, + "lr": 0.0005565034965034965, + "step": 3672, + "tokens_trained": 0.34898944 + }, + { + "epoch": 1.0422695035460994, + "grad_norm": 0.7757400870323181, + "loss": 5.0742, + "lr": 0.0005562237762237763, + "step": 3674, + "tokens_trained": 0.349178752 + }, + { + "epoch": 1.0428368794326242, + "grad_norm": 0.8509469032287598, + "loss": 5.0754, + "lr": 0.000555944055944056, + "step": 3676, + "tokens_trained": 0.349369944 + }, + { + "epoch": 1.043404255319149, + "grad_norm": 0.8896392583847046, + "loss": 5.0305, + "lr": 0.0005556643356643357, + "step": 3678, + "tokens_trained": 0.3495604 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 0.8363154530525208, + "loss": 4.9969, + "lr": 0.0005553846153846154, + "step": 3680, + "tokens_trained": 0.349749488 + }, + { + "epoch": 1.0445390070921985, + "grad_norm": 0.8382596969604492, + "loss": 4.9747, + "lr": 0.0005551048951048951, + "step": 3682, + "tokens_trained": 0.349939408 + }, + { + "epoch": 1.0451063829787235, + "grad_norm": 0.9114118218421936, + "loss": 4.9993, + "lr": 0.0005548251748251748, + "step": 3684, + "tokens_trained": 0.350129704 + }, + { + "epoch": 1.0456737588652483, + "grad_norm": 0.8570284843444824, + "loss": 5.0509, + "lr": 0.0005545454545454546, + "step": 3686, + "tokens_trained": 0.350319608 + }, + { + "epoch": 1.046241134751773, + "grad_norm": 0.8100084066390991, + "loss": 4.9202, + "lr": 0.0005542657342657342, + "step": 3688, + "tokens_trained": 0.35051 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 0.9485516548156738, + "loss": 4.983, + "lr": 0.000553986013986014, + "step": 3690, + "tokens_trained": 0.350702976 + }, + { + "epoch": 1.0473758865248226, + "grad_norm": 0.9124506115913391, + "loss": 5.0354, + "lr": 0.0005537062937062938, + "step": 3692, + "tokens_trained": 0.350894824 + }, + { + "epoch": 1.0479432624113476, + "grad_norm": 0.9002963900566101, + "loss": 5.0171, + "lr": 0.0005534265734265735, + "step": 3694, + "tokens_trained": 0.351085672 + }, + { + "epoch": 1.0485106382978724, + "grad_norm": 0.8576133251190186, + "loss": 5.0411, + "lr": 0.0005531468531468532, + "step": 3696, + "tokens_trained": 0.351274576 + }, + { + "epoch": 1.0490780141843972, + "grad_norm": 0.8824317455291748, + "loss": 5.034, + "lr": 0.0005528671328671328, + "step": 3698, + "tokens_trained": 0.351465168 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 0.9119421243667603, + "loss": 5.0423, + "lr": 0.0005525874125874126, + "step": 3700, + "tokens_trained": 0.35165588 + }, + { + "epoch": 1.0502127659574467, + "grad_norm": 0.8260598182678223, + "loss": 5.0213, + "lr": 0.0005523076923076923, + "step": 3702, + "tokens_trained": 0.351846456 + }, + { + "epoch": 1.0507801418439717, + "grad_norm": 0.9968200922012329, + "loss": 4.9728, + "lr": 0.0005520279720279721, + "step": 3704, + "tokens_trained": 0.352036312 + }, + { + "epoch": 1.0513475177304965, + "grad_norm": 0.9910591840744019, + "loss": 5.0692, + "lr": 0.0005517482517482517, + "step": 3706, + "tokens_trained": 0.352227032 + }, + { + "epoch": 1.0519148936170213, + "grad_norm": 0.8656545877456665, + "loss": 5.0201, + "lr": 0.0005514685314685315, + "step": 3708, + "tokens_trained": 0.35241624 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 0.9561606049537659, + "loss": 5.055, + "lr": 0.0005511888111888111, + "step": 3710, + "tokens_trained": 0.352607936 + }, + { + "epoch": 1.0530496453900708, + "grad_norm": 0.9602318406105042, + "loss": 5.0372, + "lr": 0.0005509090909090909, + "step": 3712, + "tokens_trained": 0.352797584 + }, + { + "epoch": 1.0536170212765958, + "grad_norm": 0.9743978381156921, + "loss": 5.0101, + "lr": 0.0005506293706293707, + "step": 3714, + "tokens_trained": 0.352988184 + }, + { + "epoch": 1.0541843971631206, + "grad_norm": 0.9676964282989502, + "loss": 5.0518, + "lr": 0.0005503496503496503, + "step": 3716, + "tokens_trained": 0.353180088 + }, + { + "epoch": 1.0547517730496454, + "grad_norm": 0.8736178874969482, + "loss": 5.0278, + "lr": 0.0005500699300699301, + "step": 3718, + "tokens_trained": 0.353370808 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.8516846895217896, + "loss": 4.9893, + "lr": 0.0005497902097902098, + "step": 3720, + "tokens_trained": 0.35356052 + }, + { + "epoch": 1.055886524822695, + "grad_norm": 1.0038187503814697, + "loss": 5.0376, + "lr": 0.0005495104895104896, + "step": 3722, + "tokens_trained": 0.353752744 + }, + { + "epoch": 1.05645390070922, + "grad_norm": 0.9077925682067871, + "loss": 5.045, + "lr": 0.0005492307692307692, + "step": 3724, + "tokens_trained": 0.353944136 + }, + { + "epoch": 1.0570212765957447, + "grad_norm": 0.8750975728034973, + "loss": 5.0275, + "lr": 0.0005489510489510489, + "step": 3726, + "tokens_trained": 0.354135648 + }, + { + "epoch": 1.0575886524822695, + "grad_norm": 0.9059204459190369, + "loss": 5.0502, + "lr": 0.0005486713286713286, + "step": 3728, + "tokens_trained": 0.354325256 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 0.8883426189422607, + "loss": 5.0016, + "lr": 0.0005483916083916084, + "step": 3730, + "tokens_trained": 0.354517776 + }, + { + "epoch": 1.058723404255319, + "grad_norm": 0.911379873752594, + "loss": 5.0363, + "lr": 0.0005481118881118882, + "step": 3732, + "tokens_trained": 0.354706528 + }, + { + "epoch": 1.0592907801418439, + "grad_norm": 0.8956911563873291, + "loss": 5.0028, + "lr": 0.0005478321678321678, + "step": 3734, + "tokens_trained": 0.354896352 + }, + { + "epoch": 1.0598581560283689, + "grad_norm": 0.9133324027061462, + "loss": 5.0426, + "lr": 0.0005475524475524476, + "step": 3736, + "tokens_trained": 0.3550884 + }, + { + "epoch": 1.0604255319148936, + "grad_norm": 0.8321526050567627, + "loss": 4.9918, + "lr": 0.0005472727272727273, + "step": 3738, + "tokens_trained": 0.355277608 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 0.8607254028320312, + "loss": 5.021, + "lr": 0.000546993006993007, + "step": 3740, + "tokens_trained": 0.355467432 + }, + { + "epoch": 1.0615602836879432, + "grad_norm": 0.8457037806510925, + "loss": 5.037, + "lr": 0.0005467132867132867, + "step": 3742, + "tokens_trained": 0.355659088 + }, + { + "epoch": 1.0621276595744682, + "grad_norm": 0.9381092190742493, + "loss": 4.9878, + "lr": 0.0005464335664335664, + "step": 3744, + "tokens_trained": 0.35585168 + }, + { + "epoch": 1.062695035460993, + "grad_norm": 0.8678731918334961, + "loss": 5.0716, + "lr": 0.0005461538461538461, + "step": 3746, + "tokens_trained": 0.356040984 + }, + { + "epoch": 1.0632624113475178, + "grad_norm": 0.8570135235786438, + "loss": 5.0018, + "lr": 0.0005458741258741259, + "step": 3748, + "tokens_trained": 0.356230064 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 0.9624084234237671, + "loss": 5.0531, + "lr": 0.0005455944055944057, + "step": 3750, + "tokens_trained": 0.356419352 + }, + { + "epoch": 1.0638297872340425, + "eval_loss": 5.037150859832764, + "eval_runtime": 20.8153, + "step": 3750, + "tokens_trained": 0.356419352 + }, + { + "epoch": 1.0643971631205673, + "grad_norm": 0.9213569760322571, + "loss": 5.0228, + "lr": 0.0005453146853146853, + "step": 3752, + "tokens_trained": 0.356611128 + }, + { + "epoch": 1.064964539007092, + "grad_norm": 0.8769538998603821, + "loss": 5.0138, + "lr": 0.000545034965034965, + "step": 3754, + "tokens_trained": 0.356800248 + }, + { + "epoch": 1.065531914893617, + "grad_norm": 0.9480370879173279, + "loss": 5.056, + "lr": 0.0005447552447552448, + "step": 3756, + "tokens_trained": 0.35699148 + }, + { + "epoch": 1.0660992907801419, + "grad_norm": 0.8391848206520081, + "loss": 5.0256, + "lr": 0.0005444755244755245, + "step": 3758, + "tokens_trained": 0.357182168 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.85853111743927, + "loss": 5.0147, + "lr": 0.0005441958041958042, + "step": 3760, + "tokens_trained": 0.357373032 + }, + { + "epoch": 1.0672340425531914, + "grad_norm": 0.8469287753105164, + "loss": 4.9702, + "lr": 0.0005439160839160839, + "step": 3762, + "tokens_trained": 0.357562944 + }, + { + "epoch": 1.0678014184397162, + "grad_norm": 0.8880507349967957, + "loss": 5.0123, + "lr": 0.0005436363636363635, + "step": 3764, + "tokens_trained": 0.357753872 + }, + { + "epoch": 1.0683687943262412, + "grad_norm": 0.9235898852348328, + "loss": 4.9693, + "lr": 0.0005433566433566434, + "step": 3766, + "tokens_trained": 0.357944312 + }, + { + "epoch": 1.068936170212766, + "grad_norm": 0.8787907361984253, + "loss": 4.9987, + "lr": 0.0005430769230769231, + "step": 3768, + "tokens_trained": 0.35813388 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 0.8627321124076843, + "loss": 4.9938, + "lr": 0.0005427972027972028, + "step": 3770, + "tokens_trained": 0.35832436 + }, + { + "epoch": 1.0700709219858155, + "grad_norm": 0.8891534805297852, + "loss": 4.9982, + "lr": 0.0005425174825174825, + "step": 3772, + "tokens_trained": 0.35851672 + }, + { + "epoch": 1.0706382978723403, + "grad_norm": 0.947503387928009, + "loss": 5.0114, + "lr": 0.0005422377622377623, + "step": 3774, + "tokens_trained": 0.358705936 + }, + { + "epoch": 1.0712056737588653, + "grad_norm": 0.9056106805801392, + "loss": 5.0199, + "lr": 0.000541958041958042, + "step": 3776, + "tokens_trained": 0.358896904 + }, + { + "epoch": 1.07177304964539, + "grad_norm": 0.9422404766082764, + "loss": 5.0556, + "lr": 0.0005416783216783216, + "step": 3778, + "tokens_trained": 0.35908716 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.9013909101486206, + "loss": 5.0516, + "lr": 0.0005413986013986014, + "step": 3780, + "tokens_trained": 0.359276784 + }, + { + "epoch": 1.0729078014184397, + "grad_norm": 0.8561504483222961, + "loss": 4.973, + "lr": 0.000541118881118881, + "step": 3782, + "tokens_trained": 0.35946816 + }, + { + "epoch": 1.0734751773049644, + "grad_norm": 0.8561832308769226, + "loss": 5.053, + "lr": 0.0005408391608391609, + "step": 3784, + "tokens_trained": 0.3596616 + }, + { + "epoch": 1.0740425531914894, + "grad_norm": 0.7730107307434082, + "loss": 5.006, + "lr": 0.0005405594405594406, + "step": 3786, + "tokens_trained": 0.359853624 + }, + { + "epoch": 1.0746099290780142, + "grad_norm": 0.889777660369873, + "loss": 5.0275, + "lr": 0.0005402797202797203, + "step": 3788, + "tokens_trained": 0.360044616 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 0.8839589357376099, + "loss": 5.0635, + "lr": 0.00054, + "step": 3790, + "tokens_trained": 0.36023484 + }, + { + "epoch": 1.0757446808510638, + "grad_norm": 0.8542807102203369, + "loss": 5.0161, + "lr": 0.0005397202797202798, + "step": 3792, + "tokens_trained": 0.3604262 + }, + { + "epoch": 1.0763120567375886, + "grad_norm": 0.8976609706878662, + "loss": 5.0693, + "lr": 0.0005394405594405595, + "step": 3794, + "tokens_trained": 0.360615912 + }, + { + "epoch": 1.0768794326241136, + "grad_norm": 0.8138758540153503, + "loss": 4.9589, + "lr": 0.0005391608391608391, + "step": 3796, + "tokens_trained": 0.360807648 + }, + { + "epoch": 1.0774468085106383, + "grad_norm": 0.8604118227958679, + "loss": 5.0311, + "lr": 0.0005388811188811189, + "step": 3798, + "tokens_trained": 0.360998824 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 0.8839350342750549, + "loss": 5.0355, + "lr": 0.0005386013986013985, + "step": 3800, + "tokens_trained": 0.36119052 + }, + { + "epoch": 1.078581560283688, + "grad_norm": 0.9019435048103333, + "loss": 4.9899, + "lr": 0.0005383216783216784, + "step": 3802, + "tokens_trained": 0.361380456 + }, + { + "epoch": 1.0791489361702127, + "grad_norm": 0.8486269116401672, + "loss": 5.033, + "lr": 0.0005380419580419581, + "step": 3804, + "tokens_trained": 0.361569216 + }, + { + "epoch": 1.0797163120567377, + "grad_norm": 0.8133941888809204, + "loss": 5.0129, + "lr": 0.0005377622377622377, + "step": 3806, + "tokens_trained": 0.361761648 + }, + { + "epoch": 1.0802836879432625, + "grad_norm": 0.8590590357780457, + "loss": 5.0938, + "lr": 0.0005374825174825175, + "step": 3808, + "tokens_trained": 0.361950784 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 0.8362659215927124, + "loss": 5.0397, + "lr": 0.0005372027972027972, + "step": 3810, + "tokens_trained": 0.3621414 + }, + { + "epoch": 1.081418439716312, + "grad_norm": 0.912358820438385, + "loss": 4.9804, + "lr": 0.000536923076923077, + "step": 3812, + "tokens_trained": 0.362330072 + }, + { + "epoch": 1.0819858156028368, + "grad_norm": 0.9518508911132812, + "loss": 4.9975, + "lr": 0.0005366433566433566, + "step": 3814, + "tokens_trained": 0.362521472 + }, + { + "epoch": 1.0825531914893618, + "grad_norm": 0.8164550065994263, + "loss": 4.9745, + "lr": 0.0005363636363636364, + "step": 3816, + "tokens_trained": 0.362710744 + }, + { + "epoch": 1.0831205673758866, + "grad_norm": 0.9007307887077332, + "loss": 4.9715, + "lr": 0.000536083916083916, + "step": 3818, + "tokens_trained": 0.362900624 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 0.8775385022163391, + "loss": 5.0296, + "lr": 0.0005358041958041959, + "step": 3820, + "tokens_trained": 0.36309048 + }, + { + "epoch": 1.0842553191489361, + "grad_norm": 0.7864426970481873, + "loss": 4.9837, + "lr": 0.0005355244755244756, + "step": 3822, + "tokens_trained": 0.363280088 + }, + { + "epoch": 1.084822695035461, + "grad_norm": 0.7757525444030762, + "loss": 5.0445, + "lr": 0.0005352447552447552, + "step": 3824, + "tokens_trained": 0.363470768 + }, + { + "epoch": 1.085390070921986, + "grad_norm": 0.7588837146759033, + "loss": 5.0431, + "lr": 0.000534965034965035, + "step": 3826, + "tokens_trained": 0.363661176 + }, + { + "epoch": 1.0859574468085107, + "grad_norm": 0.8844705820083618, + "loss": 5.0192, + "lr": 0.0005346853146853147, + "step": 3828, + "tokens_trained": 0.363852544 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 0.8446291089057922, + "loss": 5.0647, + "lr": 0.0005344055944055945, + "step": 3830, + "tokens_trained": 0.364044088 + }, + { + "epoch": 1.0870921985815603, + "grad_norm": 0.8611181974411011, + "loss": 5.0475, + "lr": 0.0005341258741258741, + "step": 3832, + "tokens_trained": 0.364234688 + }, + { + "epoch": 1.087659574468085, + "grad_norm": 0.8670753240585327, + "loss": 5.0243, + "lr": 0.0005338461538461538, + "step": 3834, + "tokens_trained": 0.364424096 + }, + { + "epoch": 1.0882269503546098, + "grad_norm": 0.8563777208328247, + "loss": 5.0512, + "lr": 0.0005335664335664335, + "step": 3836, + "tokens_trained": 0.364611896 + }, + { + "epoch": 1.0887943262411348, + "grad_norm": 0.849647581577301, + "loss": 5.0089, + "lr": 0.0005332867132867133, + "step": 3838, + "tokens_trained": 0.364800808 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.8674852252006531, + "loss": 5.0018, + "lr": 0.0005330069930069931, + "step": 3840, + "tokens_trained": 0.364993432 + }, + { + "epoch": 1.0899290780141844, + "grad_norm": 0.8642079830169678, + "loss": 4.9989, + "lr": 0.0005327272727272727, + "step": 3842, + "tokens_trained": 0.365182432 + }, + { + "epoch": 1.0904964539007092, + "grad_norm": 0.8550288081169128, + "loss": 4.9855, + "lr": 0.0005324475524475525, + "step": 3844, + "tokens_trained": 0.365372416 + }, + { + "epoch": 1.0910638297872342, + "grad_norm": 0.901297926902771, + "loss": 5.0342, + "lr": 0.0005321678321678322, + "step": 3846, + "tokens_trained": 0.365564576 + }, + { + "epoch": 1.091631205673759, + "grad_norm": 0.8426658511161804, + "loss": 5.0301, + "lr": 0.000531888111888112, + "step": 3848, + "tokens_trained": 0.36575708 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 0.8530165553092957, + "loss": 5.071, + "lr": 0.0005316083916083916, + "step": 3850, + "tokens_trained": 0.365947984 + }, + { + "epoch": 1.0927659574468085, + "grad_norm": 0.9010403156280518, + "loss": 5.0279, + "lr": 0.0005313286713286713, + "step": 3852, + "tokens_trained": 0.366136392 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 0.9402730464935303, + "loss": 4.9896, + "lr": 0.000531048951048951, + "step": 3854, + "tokens_trained": 0.36632536 + }, + { + "epoch": 1.093900709219858, + "grad_norm": 0.8633377552032471, + "loss": 5.0093, + "lr": 0.0005307692307692308, + "step": 3856, + "tokens_trained": 0.366515056 + }, + { + "epoch": 1.094468085106383, + "grad_norm": 0.8778465390205383, + "loss": 4.9574, + "lr": 0.0005304895104895106, + "step": 3858, + "tokens_trained": 0.366705328 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 0.8562993407249451, + "loss": 4.9938, + "lr": 0.0005302097902097902, + "step": 3860, + "tokens_trained": 0.36689668 + }, + { + "epoch": 1.0956028368794326, + "grad_norm": 0.8061450719833374, + "loss": 5.0132, + "lr": 0.0005299300699300699, + "step": 3862, + "tokens_trained": 0.367087104 + }, + { + "epoch": 1.0961702127659574, + "grad_norm": 0.9253963232040405, + "loss": 4.9672, + "lr": 0.0005296503496503497, + "step": 3864, + "tokens_trained": 0.36727676 + }, + { + "epoch": 1.0967375886524824, + "grad_norm": 0.8339546918869019, + "loss": 4.9757, + "lr": 0.0005293706293706294, + "step": 3866, + "tokens_trained": 0.367467928 + }, + { + "epoch": 1.0973049645390072, + "grad_norm": 0.7303675413131714, + "loss": 4.9613, + "lr": 0.0005290909090909091, + "step": 3868, + "tokens_trained": 0.36765876 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 0.8417290449142456, + "loss": 5.0074, + "lr": 0.0005288111888111888, + "step": 3870, + "tokens_trained": 0.367848064 + }, + { + "epoch": 1.0984397163120567, + "grad_norm": 0.7773861289024353, + "loss": 4.9411, + "lr": 0.0005285314685314684, + "step": 3872, + "tokens_trained": 0.368038176 + }, + { + "epoch": 1.0990070921985815, + "grad_norm": 0.8101850152015686, + "loss": 5.0479, + "lr": 0.0005282517482517483, + "step": 3874, + "tokens_trained": 0.368228208 + }, + { + "epoch": 1.099290780141844, + "eval_loss": 5.027334690093994, + "eval_runtime": 20.6629, + "step": 3875, + "tokens_trained": 0.368324424 + }, + { + "epoch": 1.0995744680851063, + "grad_norm": 0.8131702542304993, + "loss": 5.045, + "lr": 0.000527972027972028, + "step": 3876, + "tokens_trained": 0.368421216 + }, + { + "epoch": 1.1001418439716313, + "grad_norm": 0.7819017171859741, + "loss": 5.0151, + "lr": 0.0005276923076923077, + "step": 3878, + "tokens_trained": 0.368612904 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 0.8118953108787537, + "loss": 5.0233, + "lr": 0.0005274125874125874, + "step": 3880, + "tokens_trained": 0.368803144 + }, + { + "epoch": 1.1012765957446808, + "grad_norm": 0.8203917741775513, + "loss": 4.9401, + "lr": 0.0005271328671328672, + "step": 3882, + "tokens_trained": 0.368993072 + }, + { + "epoch": 1.1018439716312056, + "grad_norm": 0.8229494690895081, + "loss": 5.0605, + "lr": 0.0005268531468531469, + "step": 3884, + "tokens_trained": 0.36918396 + }, + { + "epoch": 1.1024113475177304, + "grad_norm": 0.7224509119987488, + "loss": 5.03, + "lr": 0.0005265734265734266, + "step": 3886, + "tokens_trained": 0.36937192 + }, + { + "epoch": 1.1029787234042554, + "grad_norm": 0.8122052550315857, + "loss": 5.0416, + "lr": 0.0005262937062937063, + "step": 3888, + "tokens_trained": 0.36956204 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 0.8190508484840393, + "loss": 5.0106, + "lr": 0.0005260139860139859, + "step": 3890, + "tokens_trained": 0.369753208 + }, + { + "epoch": 1.104113475177305, + "grad_norm": 0.845341682434082, + "loss": 5.0, + "lr": 0.0005257342657342658, + "step": 3892, + "tokens_trained": 0.36994372 + }, + { + "epoch": 1.1046808510638297, + "grad_norm": 0.9587157964706421, + "loss": 5.0319, + "lr": 0.0005254545454545455, + "step": 3894, + "tokens_trained": 0.370133776 + }, + { + "epoch": 1.1052482269503545, + "grad_norm": 0.8882042169570923, + "loss": 4.9617, + "lr": 0.0005251748251748252, + "step": 3896, + "tokens_trained": 0.370322504 + }, + { + "epoch": 1.1058156028368795, + "grad_norm": 0.879010021686554, + "loss": 4.9197, + "lr": 0.0005248951048951049, + "step": 3898, + "tokens_trained": 0.370514864 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.9890635013580322, + "loss": 5.0381, + "lr": 0.0005246153846153847, + "step": 3900, + "tokens_trained": 0.370706568 + }, + { + "epoch": 1.106950354609929, + "grad_norm": 0.8491361737251282, + "loss": 5.0187, + "lr": 0.0005243356643356644, + "step": 3902, + "tokens_trained": 0.370899112 + }, + { + "epoch": 1.1075177304964539, + "grad_norm": 0.8746361136436462, + "loss": 5.0972, + "lr": 0.000524055944055944, + "step": 3904, + "tokens_trained": 0.37108932 + }, + { + "epoch": 1.1080851063829786, + "grad_norm": 0.9623220562934875, + "loss": 5.0143, + "lr": 0.0005237762237762238, + "step": 3906, + "tokens_trained": 0.371276808 + }, + { + "epoch": 1.1086524822695036, + "grad_norm": 0.8145681023597717, + "loss": 5.0081, + "lr": 0.0005234965034965034, + "step": 3908, + "tokens_trained": 0.3714666 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 0.8862302899360657, + "loss": 5.0613, + "lr": 0.0005232167832167833, + "step": 3910, + "tokens_trained": 0.371654632 + }, + { + "epoch": 1.1097872340425532, + "grad_norm": 0.8897994160652161, + "loss": 5.0447, + "lr": 0.000522937062937063, + "step": 3912, + "tokens_trained": 0.37184496 + }, + { + "epoch": 1.110354609929078, + "grad_norm": 0.9659616947174072, + "loss": 5.0852, + "lr": 0.0005226573426573427, + "step": 3914, + "tokens_trained": 0.372034032 + }, + { + "epoch": 1.1109219858156028, + "grad_norm": 0.8457762002944946, + "loss": 4.9992, + "lr": 0.0005223776223776224, + "step": 3916, + "tokens_trained": 0.372224056 + }, + { + "epoch": 1.1114893617021278, + "grad_norm": 0.8297874927520752, + "loss": 5.0416, + "lr": 0.0005220979020979021, + "step": 3918, + "tokens_trained": 0.372413992 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 0.8436822295188904, + "loss": 5.0201, + "lr": 0.0005218181818181819, + "step": 3920, + "tokens_trained": 0.372604784 + }, + { + "epoch": 1.1126241134751773, + "grad_norm": 0.8133619427680969, + "loss": 5.0074, + "lr": 0.0005215384615384615, + "step": 3922, + "tokens_trained": 0.372796808 + }, + { + "epoch": 1.113191489361702, + "grad_norm": 0.7879509925842285, + "loss": 5.0536, + "lr": 0.0005212587412587413, + "step": 3924, + "tokens_trained": 0.372988416 + }, + { + "epoch": 1.1137588652482269, + "grad_norm": 0.8212776780128479, + "loss": 5.0259, + "lr": 0.0005209790209790209, + "step": 3926, + "tokens_trained": 0.373178784 + }, + { + "epoch": 1.1143262411347519, + "grad_norm": 0.8426427245140076, + "loss": 5.0017, + "lr": 0.0005206993006993008, + "step": 3928, + "tokens_trained": 0.373367992 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 0.8375839591026306, + "loss": 4.9984, + "lr": 0.0005204195804195805, + "step": 3930, + "tokens_trained": 0.373558848 + }, + { + "epoch": 1.1154609929078014, + "grad_norm": 0.907742440700531, + "loss": 5.0629, + "lr": 0.0005201398601398601, + "step": 3932, + "tokens_trained": 0.373748656 + }, + { + "epoch": 1.1160283687943262, + "grad_norm": 0.8619366884231567, + "loss": 4.9702, + "lr": 0.0005198601398601399, + "step": 3934, + "tokens_trained": 0.373937232 + }, + { + "epoch": 1.116595744680851, + "grad_norm": 0.8558400273323059, + "loss": 4.9833, + "lr": 0.0005195804195804196, + "step": 3936, + "tokens_trained": 0.3741268 + }, + { + "epoch": 1.117163120567376, + "grad_norm": 0.7772043347358704, + "loss": 5.0636, + "lr": 0.0005193006993006994, + "step": 3938, + "tokens_trained": 0.374315752 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 0.9044018387794495, + "loss": 5.0419, + "lr": 0.000519020979020979, + "step": 3940, + "tokens_trained": 0.374504464 + }, + { + "epoch": 1.1182978723404255, + "grad_norm": 0.8944953083992004, + "loss": 4.961, + "lr": 0.0005187412587412588, + "step": 3942, + "tokens_trained": 0.374695528 + }, + { + "epoch": 1.1188652482269503, + "grad_norm": 0.8230746984481812, + "loss": 5.0148, + "lr": 0.0005184615384615384, + "step": 3944, + "tokens_trained": 0.374886128 + }, + { + "epoch": 1.119432624113475, + "grad_norm": 0.7891346216201782, + "loss": 4.9601, + "lr": 0.0005181818181818182, + "step": 3946, + "tokens_trained": 0.375074408 + }, + { + "epoch": 1.12, + "grad_norm": 0.8364359140396118, + "loss": 5.0317, + "lr": 0.000517902097902098, + "step": 3948, + "tokens_trained": 0.37526636 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 0.7932770252227783, + "loss": 4.9399, + "lr": 0.0005176223776223776, + "step": 3950, + "tokens_trained": 0.375455888 + }, + { + "epoch": 1.1211347517730497, + "grad_norm": 0.8276688456535339, + "loss": 4.9465, + "lr": 0.0005173426573426574, + "step": 3952, + "tokens_trained": 0.37564728 + }, + { + "epoch": 1.1217021276595744, + "grad_norm": 0.8242233991622925, + "loss": 5.0069, + "lr": 0.000517062937062937, + "step": 3954, + "tokens_trained": 0.375839296 + }, + { + "epoch": 1.1222695035460992, + "grad_norm": 0.8828895688056946, + "loss": 4.9488, + "lr": 0.0005167832167832169, + "step": 3956, + "tokens_trained": 0.376028744 + }, + { + "epoch": 1.122836879432624, + "grad_norm": 0.8730418682098389, + "loss": 4.9729, + "lr": 0.0005165034965034965, + "step": 3958, + "tokens_trained": 0.376217656 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.7701008915901184, + "loss": 4.9922, + "lr": 0.0005162237762237762, + "step": 3960, + "tokens_trained": 0.3764066 + }, + { + "epoch": 1.1239716312056738, + "grad_norm": 0.8723980784416199, + "loss": 4.9452, + "lr": 0.0005159440559440559, + "step": 3962, + "tokens_trained": 0.376594952 + }, + { + "epoch": 1.1245390070921986, + "grad_norm": 0.9300636053085327, + "loss": 5.0595, + "lr": 0.0005156643356643357, + "step": 3964, + "tokens_trained": 0.376785256 + }, + { + "epoch": 1.1251063829787233, + "grad_norm": 0.8684858083724976, + "loss": 5.0372, + "lr": 0.0005153846153846154, + "step": 3966, + "tokens_trained": 0.376975 + }, + { + "epoch": 1.1256737588652483, + "grad_norm": 0.8335841298103333, + "loss": 5.0636, + "lr": 0.0005151048951048951, + "step": 3968, + "tokens_trained": 0.377164552 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 0.8454932570457458, + "loss": 4.9603, + "lr": 0.0005148251748251748, + "step": 3970, + "tokens_trained": 0.377353968 + }, + { + "epoch": 1.126808510638298, + "grad_norm": 0.8978991508483887, + "loss": 5.0161, + "lr": 0.0005145454545454545, + "step": 3972, + "tokens_trained": 0.377543664 + }, + { + "epoch": 1.1273758865248227, + "grad_norm": 0.863207995891571, + "loss": 4.9949, + "lr": 0.0005142657342657343, + "step": 3974, + "tokens_trained": 0.37773332 + }, + { + "epoch": 1.1279432624113475, + "grad_norm": 0.8614553213119507, + "loss": 4.9812, + "lr": 0.000513986013986014, + "step": 3976, + "tokens_trained": 0.377921272 + }, + { + "epoch": 1.1285106382978722, + "grad_norm": 0.8703583478927612, + "loss": 4.9823, + "lr": 0.0005137062937062937, + "step": 3978, + "tokens_trained": 0.378112584 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 0.7951223254203796, + "loss": 4.9732, + "lr": 0.0005134265734265734, + "step": 3980, + "tokens_trained": 0.378302088 + }, + { + "epoch": 1.129645390070922, + "grad_norm": 0.8486145734786987, + "loss": 4.9422, + "lr": 0.0005131468531468532, + "step": 3982, + "tokens_trained": 0.37849388 + }, + { + "epoch": 1.1302127659574468, + "grad_norm": 0.8054757714271545, + "loss": 4.9911, + "lr": 0.0005128671328671328, + "step": 3984, + "tokens_trained": 0.378682928 + }, + { + "epoch": 1.1307801418439716, + "grad_norm": 0.83322674036026, + "loss": 5.0289, + "lr": 0.0005125874125874126, + "step": 3986, + "tokens_trained": 0.378874296 + }, + { + "epoch": 1.1313475177304966, + "grad_norm": 0.8249304890632629, + "loss": 5.0455, + "lr": 0.0005123076923076923, + "step": 3988, + "tokens_trained": 0.379067408 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 0.8258949518203735, + "loss": 4.9703, + "lr": 0.000512027972027972, + "step": 3990, + "tokens_trained": 0.379255328 + }, + { + "epoch": 1.1324822695035461, + "grad_norm": 0.8535506725311279, + "loss": 5.0652, + "lr": 0.0005117482517482518, + "step": 3992, + "tokens_trained": 0.379446152 + }, + { + "epoch": 1.133049645390071, + "grad_norm": 0.8468305468559265, + "loss": 5.0071, + "lr": 0.0005114685314685315, + "step": 3994, + "tokens_trained": 0.379637664 + }, + { + "epoch": 1.1336170212765957, + "grad_norm": 0.8334465026855469, + "loss": 5.043, + "lr": 0.0005111888111888112, + "step": 3996, + "tokens_trained": 0.379829408 + }, + { + "epoch": 1.1341843971631205, + "grad_norm": 0.8690851926803589, + "loss": 4.9637, + "lr": 0.0005109090909090908, + "step": 3998, + "tokens_trained": 0.380021248 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 0.7997336983680725, + "loss": 5.0168, + "lr": 0.0005106293706293707, + "step": 4000, + "tokens_trained": 0.380212256 + }, + { + "epoch": 1.1347517730496455, + "eval_loss": 5.021827697753906, + "eval_runtime": 20.8538, + "step": 4000, + "tokens_trained": 0.380212256 + }, + { + "epoch": 1.1353191489361703, + "grad_norm": 0.8898105621337891, + "loss": 4.9954, + "lr": 0.0005103496503496503, + "step": 4002, + "tokens_trained": 0.380403128 + }, + { + "epoch": 1.135886524822695, + "grad_norm": 0.8997061848640442, + "loss": 5.0087, + "lr": 0.0005100699300699301, + "step": 4004, + "tokens_trained": 0.3805902 + }, + { + "epoch": 1.1364539007092198, + "grad_norm": 0.8276216387748718, + "loss": 4.9579, + "lr": 0.0005097902097902098, + "step": 4006, + "tokens_trained": 0.380778288 + }, + { + "epoch": 1.1370212765957446, + "grad_norm": 0.8275374174118042, + "loss": 4.973, + "lr": 0.0005095104895104895, + "step": 4008, + "tokens_trained": 0.38096896 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 0.881206214427948, + "loss": 5.0027, + "lr": 0.0005092307692307693, + "step": 4010, + "tokens_trained": 0.381159008 + }, + { + "epoch": 1.1381560283687944, + "grad_norm": 0.8062921762466431, + "loss": 4.9771, + "lr": 0.0005089510489510489, + "step": 4012, + "tokens_trained": 0.381350336 + }, + { + "epoch": 1.1387234042553191, + "grad_norm": 0.8482317924499512, + "loss": 4.972, + "lr": 0.0005086713286713287, + "step": 4014, + "tokens_trained": 0.381540512 + }, + { + "epoch": 1.139290780141844, + "grad_norm": 0.8180603981018066, + "loss": 5.0052, + "lr": 0.0005083916083916083, + "step": 4016, + "tokens_trained": 0.38173168 + }, + { + "epoch": 1.1398581560283687, + "grad_norm": 0.7816891670227051, + "loss": 4.9689, + "lr": 0.0005081118881118882, + "step": 4018, + "tokens_trained": 0.381922056 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.831451952457428, + "loss": 4.9931, + "lr": 0.0005078321678321678, + "step": 4020, + "tokens_trained": 0.382111816 + }, + { + "epoch": 1.1409929078014185, + "grad_norm": 0.8557744026184082, + "loss": 5.0101, + "lr": 0.0005075524475524476, + "step": 4022, + "tokens_trained": 0.38230276 + }, + { + "epoch": 1.1415602836879433, + "grad_norm": 0.8070439696311951, + "loss": 5.0457, + "lr": 0.0005072727272727273, + "step": 4024, + "tokens_trained": 0.38249052 + }, + { + "epoch": 1.142127659574468, + "grad_norm": 0.9021100401878357, + "loss": 4.9979, + "lr": 0.0005069930069930069, + "step": 4026, + "tokens_trained": 0.382679696 + }, + { + "epoch": 1.1426950354609928, + "grad_norm": 0.8565911650657654, + "loss": 4.9828, + "lr": 0.0005067132867132868, + "step": 4028, + "tokens_trained": 0.382869408 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 0.8522788286209106, + "loss": 5.0306, + "lr": 0.0005064335664335664, + "step": 4030, + "tokens_trained": 0.383058416 + }, + { + "epoch": 1.1438297872340426, + "grad_norm": 0.79361891746521, + "loss": 5.0027, + "lr": 0.0005061538461538462, + "step": 4032, + "tokens_trained": 0.383248504 + }, + { + "epoch": 1.1443971631205674, + "grad_norm": 0.8457452654838562, + "loss": 4.9762, + "lr": 0.0005058741258741258, + "step": 4034, + "tokens_trained": 0.383439016 + }, + { + "epoch": 1.1449645390070922, + "grad_norm": 0.9091781377792358, + "loss": 5.0534, + "lr": 0.0005055944055944057, + "step": 4036, + "tokens_trained": 0.383630552 + }, + { + "epoch": 1.145531914893617, + "grad_norm": 0.8448526263237, + "loss": 5.0068, + "lr": 0.0005053146853146853, + "step": 4038, + "tokens_trained": 0.383817712 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 0.7852639555931091, + "loss": 4.9615, + "lr": 0.000505034965034965, + "step": 4040, + "tokens_trained": 0.384008192 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 0.7787274122238159, + "loss": 5.0035, + "lr": 0.0005047552447552448, + "step": 4042, + "tokens_trained": 0.38419848 + }, + { + "epoch": 1.1472340425531915, + "grad_norm": 0.9463234543800354, + "loss": 5.0284, + "lr": 0.0005044755244755244, + "step": 4044, + "tokens_trained": 0.384390448 + }, + { + "epoch": 1.1478014184397163, + "grad_norm": 0.9096873998641968, + "loss": 5.0104, + "lr": 0.0005041958041958043, + "step": 4046, + "tokens_trained": 0.384578688 + }, + { + "epoch": 1.148368794326241, + "grad_norm": 0.8237007856369019, + "loss": 5.0225, + "lr": 0.0005039160839160839, + "step": 4048, + "tokens_trained": 0.384769368 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 0.8391951322555542, + "loss": 4.9316, + "lr": 0.0005036363636363637, + "step": 4050, + "tokens_trained": 0.384959448 + }, + { + "epoch": 1.1495035460992908, + "grad_norm": 0.8555214405059814, + "loss": 5.0299, + "lr": 0.0005033566433566433, + "step": 4052, + "tokens_trained": 0.385148392 + }, + { + "epoch": 1.1500709219858156, + "grad_norm": 0.813484251499176, + "loss": 5.0792, + "lr": 0.0005030769230769231, + "step": 4054, + "tokens_trained": 0.385338144 + }, + { + "epoch": 1.1506382978723404, + "grad_norm": 0.8149204850196838, + "loss": 5.0607, + "lr": 0.0005027972027972028, + "step": 4056, + "tokens_trained": 0.385528776 + }, + { + "epoch": 1.1512056737588652, + "grad_norm": 0.8909300565719604, + "loss": 5.007, + "lr": 0.0005025174825174825, + "step": 4058, + "tokens_trained": 0.385717672 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 0.8447635173797607, + "loss": 5.024, + "lr": 0.0005022377622377623, + "step": 4060, + "tokens_trained": 0.3859074 + }, + { + "epoch": 1.152340425531915, + "grad_norm": 0.8429125547409058, + "loss": 4.9871, + "lr": 0.0005019580419580419, + "step": 4062, + "tokens_trained": 0.386096712 + }, + { + "epoch": 1.1529078014184397, + "grad_norm": 0.8532034158706665, + "loss": 4.9807, + "lr": 0.0005016783216783218, + "step": 4064, + "tokens_trained": 0.386290392 + }, + { + "epoch": 1.1534751773049645, + "grad_norm": 0.8414303064346313, + "loss": 5.0426, + "lr": 0.0005013986013986014, + "step": 4066, + "tokens_trained": 0.386484048 + }, + { + "epoch": 1.1540425531914893, + "grad_norm": 0.8659424185752869, + "loss": 4.9572, + "lr": 0.0005011188811188811, + "step": 4068, + "tokens_trained": 0.386670896 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 0.8472128510475159, + "loss": 4.9993, + "lr": 0.0005008391608391608, + "step": 4070, + "tokens_trained": 0.38686096 + }, + { + "epoch": 1.155177304964539, + "grad_norm": 0.7704010009765625, + "loss": 5.0267, + "lr": 0.0005005594405594406, + "step": 4072, + "tokens_trained": 0.387052256 + }, + { + "epoch": 1.1557446808510639, + "grad_norm": 0.8503726720809937, + "loss": 4.953, + "lr": 0.0005002797202797203, + "step": 4074, + "tokens_trained": 0.387241648 + }, + { + "epoch": 1.1563120567375886, + "grad_norm": 0.8159539699554443, + "loss": 5.0096, + "lr": 0.0005, + "step": 4076, + "tokens_trained": 0.387432368 + }, + { + "epoch": 1.1568794326241134, + "grad_norm": 0.7673088312149048, + "loss": 4.9996, + "lr": 0.0004997202797202798, + "step": 4078, + "tokens_trained": 0.387620656 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.8308261036872864, + "loss": 5.0114, + "lr": 0.0004994405594405594, + "step": 4080, + "tokens_trained": 0.387809712 + }, + { + "epoch": 1.1580141843971632, + "grad_norm": 0.8294357657432556, + "loss": 5.0508, + "lr": 0.0004991608391608391, + "step": 4082, + "tokens_trained": 0.387999152 + }, + { + "epoch": 1.158581560283688, + "grad_norm": 0.8797832727432251, + "loss": 4.9784, + "lr": 0.0004988811188811189, + "step": 4084, + "tokens_trained": 0.3881876 + }, + { + "epoch": 1.1591489361702128, + "grad_norm": 0.8250353932380676, + "loss": 4.959, + "lr": 0.0004986013986013986, + "step": 4086, + "tokens_trained": 0.38837592 + }, + { + "epoch": 1.1597163120567375, + "grad_norm": 0.8896451592445374, + "loss": 5.0103, + "lr": 0.0004983216783216784, + "step": 4088, + "tokens_trained": 0.388565768 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 0.7970037460327148, + "loss": 5.0534, + "lr": 0.0004980419580419581, + "step": 4090, + "tokens_trained": 0.388755536 + }, + { + "epoch": 1.1608510638297873, + "grad_norm": 0.8623605966567993, + "loss": 4.986, + "lr": 0.0004977622377622378, + "step": 4092, + "tokens_trained": 0.388947 + }, + { + "epoch": 1.161418439716312, + "grad_norm": 0.8195328712463379, + "loss": 5.0193, + "lr": 0.0004974825174825175, + "step": 4094, + "tokens_trained": 0.38913532 + }, + { + "epoch": 1.1619858156028369, + "grad_norm": 0.8058289885520935, + "loss": 5.0001, + "lr": 0.0004972027972027972, + "step": 4096, + "tokens_trained": 0.389325904 + }, + { + "epoch": 1.1625531914893616, + "grad_norm": 0.8325840830802917, + "loss": 5.0711, + "lr": 0.0004969230769230769, + "step": 4098, + "tokens_trained": 0.3895166 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 0.8684342503547668, + "loss": 4.9548, + "lr": 0.0004966433566433566, + "step": 4100, + "tokens_trained": 0.389704048 + }, + { + "epoch": 1.1636879432624114, + "grad_norm": 0.891304612159729, + "loss": 4.9711, + "lr": 0.0004963636363636364, + "step": 4102, + "tokens_trained": 0.389893816 + }, + { + "epoch": 1.1642553191489362, + "grad_norm": 0.8750278353691101, + "loss": 5.0493, + "lr": 0.0004960839160839161, + "step": 4104, + "tokens_trained": 0.390082752 + }, + { + "epoch": 1.164822695035461, + "grad_norm": 0.8391188383102417, + "loss": 4.9804, + "lr": 0.0004958041958041959, + "step": 4106, + "tokens_trained": 0.390272096 + }, + { + "epoch": 1.1653900709219858, + "grad_norm": 0.8190635442733765, + "loss": 5.0121, + "lr": 0.0004955244755244756, + "step": 4108, + "tokens_trained": 0.390462024 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 0.7800264954566956, + "loss": 4.9819, + "lr": 0.0004952447552447552, + "step": 4110, + "tokens_trained": 0.390651968 + }, + { + "epoch": 1.1665248226950355, + "grad_norm": 0.8210972547531128, + "loss": 4.9929, + "lr": 0.000494965034965035, + "step": 4112, + "tokens_trained": 0.390842776 + }, + { + "epoch": 1.1670921985815603, + "grad_norm": 0.9442235827445984, + "loss": 5.0133, + "lr": 0.0004946853146853147, + "step": 4114, + "tokens_trained": 0.391031856 + }, + { + "epoch": 1.167659574468085, + "grad_norm": 0.8627631068229675, + "loss": 4.9587, + "lr": 0.0004944055944055944, + "step": 4116, + "tokens_trained": 0.391223288 + }, + { + "epoch": 1.1682269503546099, + "grad_norm": 0.7751641869544983, + "loss": 4.9934, + "lr": 0.0004941258741258741, + "step": 4118, + "tokens_trained": 0.391412784 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 0.8243580460548401, + "loss": 5.0126, + "lr": 0.0004938461538461538, + "step": 4120, + "tokens_trained": 0.391603056 + }, + { + "epoch": 1.1693617021276597, + "grad_norm": 0.8990906476974487, + "loss": 5.0234, + "lr": 0.0004935664335664336, + "step": 4122, + "tokens_trained": 0.391793368 + }, + { + "epoch": 1.1699290780141844, + "grad_norm": 0.8721649050712585, + "loss": 4.997, + "lr": 0.0004932867132867133, + "step": 4124, + "tokens_trained": 0.39198508 + }, + { + "epoch": 1.1702127659574468, + "eval_loss": 5.014278411865234, + "eval_runtime": 21.0162, + "step": 4125, + "tokens_trained": 0.392082752 + }, + { + "epoch": 1.1704964539007092, + "grad_norm": 0.7662192583084106, + "loss": 4.9791, + "lr": 0.0004930069930069931, + "step": 4126, + "tokens_trained": 0.392179088 + }, + { + "epoch": 1.171063829787234, + "grad_norm": 0.9081931710243225, + "loss": 4.9882, + "lr": 0.0004927272727272727, + "step": 4128, + "tokens_trained": 0.392369312 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 0.8503204584121704, + "loss": 5.0403, + "lr": 0.0004924475524475525, + "step": 4130, + "tokens_trained": 0.392557944 + }, + { + "epoch": 1.1721985815602838, + "grad_norm": 0.8676162362098694, + "loss": 5.0716, + "lr": 0.0004921678321678322, + "step": 4132, + "tokens_trained": 0.39274924 + }, + { + "epoch": 1.1727659574468086, + "grad_norm": 0.8527748584747314, + "loss": 5.0416, + "lr": 0.0004918881118881118, + "step": 4134, + "tokens_trained": 0.392939672 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 0.8113415241241455, + "loss": 5.0525, + "lr": 0.0004916083916083916, + "step": 4136, + "tokens_trained": 0.393131152 + }, + { + "epoch": 1.1739007092198581, + "grad_norm": 0.8555265665054321, + "loss": 5.0734, + "lr": 0.0004913286713286713, + "step": 4138, + "tokens_trained": 0.39332136 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.9134076237678528, + "loss": 4.9742, + "lr": 0.0004910489510489511, + "step": 4140, + "tokens_trained": 0.393509376 + }, + { + "epoch": 1.1750354609929077, + "grad_norm": 0.8159533739089966, + "loss": 5.0728, + "lr": 0.0004907692307692308, + "step": 4142, + "tokens_trained": 0.393699616 + }, + { + "epoch": 1.1756028368794327, + "grad_norm": 0.8070579767227173, + "loss": 5.0032, + "lr": 0.0004904895104895106, + "step": 4144, + "tokens_trained": 0.393888176 + }, + { + "epoch": 1.1761702127659575, + "grad_norm": 0.8635644316673279, + "loss": 5.0564, + "lr": 0.0004902097902097902, + "step": 4146, + "tokens_trained": 0.39407804 + }, + { + "epoch": 1.1767375886524822, + "grad_norm": 0.8500214219093323, + "loss": 4.9698, + "lr": 0.00048993006993007, + "step": 4148, + "tokens_trained": 0.394268456 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 0.8485430479049683, + "loss": 4.9751, + "lr": 0.0004896503496503497, + "step": 4150, + "tokens_trained": 0.394459912 + }, + { + "epoch": 1.177872340425532, + "grad_norm": 0.8265682458877563, + "loss": 4.9703, + "lr": 0.0004893706293706293, + "step": 4152, + "tokens_trained": 0.394650984 + }, + { + "epoch": 1.1784397163120568, + "grad_norm": 0.7867625951766968, + "loss": 4.8901, + "lr": 0.0004890909090909091, + "step": 4154, + "tokens_trained": 0.394843184 + }, + { + "epoch": 1.1790070921985816, + "grad_norm": 0.8666532635688782, + "loss": 4.9144, + "lr": 0.0004888111888111888, + "step": 4156, + "tokens_trained": 0.39503568 + }, + { + "epoch": 1.1795744680851064, + "grad_norm": 0.862920880317688, + "loss": 4.9529, + "lr": 0.0004885314685314686, + "step": 4158, + "tokens_trained": 0.395225424 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 0.810485303401947, + "loss": 5.0165, + "lr": 0.0004882517482517483, + "step": 4160, + "tokens_trained": 0.395415632 + }, + { + "epoch": 1.180709219858156, + "grad_norm": 0.7997188568115234, + "loss": 5.0197, + "lr": 0.000487972027972028, + "step": 4162, + "tokens_trained": 0.39560452 + }, + { + "epoch": 1.181276595744681, + "grad_norm": 0.8133664727210999, + "loss": 5.0056, + "lr": 0.0004876923076923077, + "step": 4164, + "tokens_trained": 0.395794008 + }, + { + "epoch": 1.1818439716312057, + "grad_norm": 0.8120067119598389, + "loss": 4.913, + "lr": 0.00048741258741258743, + "step": 4166, + "tokens_trained": 0.395983296 + }, + { + "epoch": 1.1824113475177305, + "grad_norm": 0.8434014320373535, + "loss": 4.9777, + "lr": 0.0004871328671328671, + "step": 4168, + "tokens_trained": 0.396175216 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 0.8452426195144653, + "loss": 4.9693, + "lr": 0.00048685314685314687, + "step": 4170, + "tokens_trained": 0.3963634 + }, + { + "epoch": 1.1835460992907803, + "grad_norm": 0.8733723759651184, + "loss": 4.9757, + "lr": 0.00048657342657342656, + "step": 4172, + "tokens_trained": 0.39655404 + }, + { + "epoch": 1.184113475177305, + "grad_norm": 0.8372209072113037, + "loss": 4.9725, + "lr": 0.0004862937062937063, + "step": 4174, + "tokens_trained": 0.396744688 + }, + { + "epoch": 1.1846808510638298, + "grad_norm": 0.7722007632255554, + "loss": 5.0234, + "lr": 0.000486013986013986, + "step": 4176, + "tokens_trained": 0.396935848 + }, + { + "epoch": 1.1852482269503546, + "grad_norm": 0.8685297966003418, + "loss": 4.9777, + "lr": 0.0004857342657342658, + "step": 4178, + "tokens_trained": 0.39712576 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 0.8083483576774597, + "loss": 4.973, + "lr": 0.0004854545454545455, + "step": 4180, + "tokens_trained": 0.397315672 + }, + { + "epoch": 1.1863829787234041, + "grad_norm": 0.8481479287147522, + "loss": 5.0308, + "lr": 0.00048517482517482517, + "step": 4182, + "tokens_trained": 0.39750464 + }, + { + "epoch": 1.1869503546099291, + "grad_norm": 0.7996193170547485, + "loss": 4.9251, + "lr": 0.0004848951048951049, + "step": 4184, + "tokens_trained": 0.397693584 + }, + { + "epoch": 1.187517730496454, + "grad_norm": 0.811189591884613, + "loss": 5.0092, + "lr": 0.0004846153846153846, + "step": 4186, + "tokens_trained": 0.397883352 + }, + { + "epoch": 1.1880851063829787, + "grad_norm": 0.9195986390113831, + "loss": 4.961, + "lr": 0.00048433566433566435, + "step": 4188, + "tokens_trained": 0.398073712 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 0.8444050550460815, + "loss": 4.9707, + "lr": 0.00048405594405594404, + "step": 4190, + "tokens_trained": 0.398265744 + }, + { + "epoch": 1.1892198581560285, + "grad_norm": 0.859663724899292, + "loss": 5.0202, + "lr": 0.0004837762237762238, + "step": 4192, + "tokens_trained": 0.39845568 + }, + { + "epoch": 1.1897872340425533, + "grad_norm": 0.8403055667877197, + "loss": 4.9831, + "lr": 0.0004834965034965035, + "step": 4194, + "tokens_trained": 0.398647696 + }, + { + "epoch": 1.190354609929078, + "grad_norm": 0.8377063870429993, + "loss": 5.0545, + "lr": 0.0004832167832167833, + "step": 4196, + "tokens_trained": 0.398838432 + }, + { + "epoch": 1.1909219858156028, + "grad_norm": 0.8102120161056519, + "loss": 5.0068, + "lr": 0.00048293706293706297, + "step": 4198, + "tokens_trained": 0.399027968 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.8520330190658569, + "loss": 5.0102, + "lr": 0.00048265734265734266, + "step": 4200, + "tokens_trained": 0.3992202 + }, + { + "epoch": 1.1920567375886524, + "grad_norm": 0.8204303979873657, + "loss": 5.0303, + "lr": 0.0004823776223776224, + "step": 4202, + "tokens_trained": 0.399411656 + }, + { + "epoch": 1.1926241134751774, + "grad_norm": 0.8569766879081726, + "loss": 5.0097, + "lr": 0.0004820979020979021, + "step": 4204, + "tokens_trained": 0.399602136 + }, + { + "epoch": 1.1931914893617022, + "grad_norm": 0.8269557952880859, + "loss": 4.9694, + "lr": 0.00048181818181818184, + "step": 4206, + "tokens_trained": 0.399793544 + }, + { + "epoch": 1.193758865248227, + "grad_norm": 0.9124187231063843, + "loss": 4.9506, + "lr": 0.0004815384615384615, + "step": 4208, + "tokens_trained": 0.399982856 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 0.8813201189041138, + "loss": 4.9989, + "lr": 0.00048125874125874127, + "step": 4210, + "tokens_trained": 0.400173184 + }, + { + "epoch": 1.1948936170212765, + "grad_norm": 0.8605351448059082, + "loss": 5.0437, + "lr": 0.00048097902097902096, + "step": 4212, + "tokens_trained": 0.400363824 + }, + { + "epoch": 1.1954609929078015, + "grad_norm": 0.8277431726455688, + "loss": 5.0283, + "lr": 0.00048069930069930076, + "step": 4214, + "tokens_trained": 0.400554648 + }, + { + "epoch": 1.1960283687943263, + "grad_norm": 0.828187108039856, + "loss": 5.0573, + "lr": 0.00048041958041958045, + "step": 4216, + "tokens_trained": 0.400746632 + }, + { + "epoch": 1.196595744680851, + "grad_norm": 0.8459845781326294, + "loss": 5.0734, + "lr": 0.00048013986013986014, + "step": 4218, + "tokens_trained": 0.400937568 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 0.7948288321495056, + "loss": 5.011, + "lr": 0.0004798601398601399, + "step": 4220, + "tokens_trained": 0.401127024 + }, + { + "epoch": 1.1977304964539006, + "grad_norm": 0.8868036866188049, + "loss": 5.0318, + "lr": 0.0004795804195804196, + "step": 4222, + "tokens_trained": 0.401318248 + }, + { + "epoch": 1.1982978723404256, + "grad_norm": 0.7660478353500366, + "loss": 5.0656, + "lr": 0.0004793006993006993, + "step": 4224, + "tokens_trained": 0.401506136 + }, + { + "epoch": 1.1988652482269504, + "grad_norm": 0.779299259185791, + "loss": 4.9907, + "lr": 0.000479020979020979, + "step": 4226, + "tokens_trained": 0.401696856 + }, + { + "epoch": 1.1994326241134752, + "grad_norm": 0.7903150916099548, + "loss": 4.9744, + "lr": 0.00047874125874125875, + "step": 4228, + "tokens_trained": 0.401885744 + }, + { + "epoch": 1.2, + "grad_norm": 0.7829038500785828, + "loss": 4.9847, + "lr": 0.00047846153846153844, + "step": 4230, + "tokens_trained": 0.402075072 + }, + { + "epoch": 1.2005673758865247, + "grad_norm": 0.9025991559028625, + "loss": 4.9758, + "lr": 0.00047818181818181824, + "step": 4232, + "tokens_trained": 0.4022674 + }, + { + "epoch": 1.2011347517730497, + "grad_norm": 0.8891049027442932, + "loss": 4.9791, + "lr": 0.00047790209790209793, + "step": 4234, + "tokens_trained": 0.402459792 + }, + { + "epoch": 1.2017021276595745, + "grad_norm": 0.7566952109336853, + "loss": 5.0183, + "lr": 0.0004776223776223776, + "step": 4236, + "tokens_trained": 0.402649768 + }, + { + "epoch": 1.2022695035460993, + "grad_norm": 0.80048668384552, + "loss": 4.9493, + "lr": 0.00047734265734265737, + "step": 4238, + "tokens_trained": 0.4028382 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 0.7540125250816345, + "loss": 4.9685, + "lr": 0.00047706293706293706, + "step": 4240, + "tokens_trained": 0.403028848 + }, + { + "epoch": 1.2034042553191489, + "grad_norm": 0.7707799673080444, + "loss": 4.984, + "lr": 0.0004767832167832168, + "step": 4242, + "tokens_trained": 0.40321844 + }, + { + "epoch": 1.2039716312056739, + "grad_norm": 0.7681775093078613, + "loss": 4.9807, + "lr": 0.0004765034965034965, + "step": 4244, + "tokens_trained": 0.40340716 + }, + { + "epoch": 1.2045390070921986, + "grad_norm": 0.7557908892631531, + "loss": 4.9912, + "lr": 0.00047622377622377624, + "step": 4246, + "tokens_trained": 0.403600152 + }, + { + "epoch": 1.2051063829787234, + "grad_norm": 0.822948694229126, + "loss": 5.0144, + "lr": 0.00047594405594405593, + "step": 4248, + "tokens_trained": 0.403788 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 0.7625008225440979, + "loss": 4.8949, + "lr": 0.00047566433566433573, + "step": 4250, + "tokens_trained": 0.40397872 + }, + { + "epoch": 1.2056737588652482, + "eval_loss": 5.00390625, + "eval_runtime": 20.2421, + "step": 4250, + "tokens_trained": 0.40397872 + }, + { + "epoch": 1.206241134751773, + "grad_norm": 0.7532864212989807, + "loss": 5.0128, + "lr": 0.0004753846153846154, + "step": 4252, + "tokens_trained": 0.404169384 + }, + { + "epoch": 1.206808510638298, + "grad_norm": 0.69386887550354, + "loss": 4.9849, + "lr": 0.0004751048951048951, + "step": 4254, + "tokens_trained": 0.40435968 + }, + { + "epoch": 1.2073758865248228, + "grad_norm": 0.7845306992530823, + "loss": 5.0254, + "lr": 0.00047482517482517485, + "step": 4256, + "tokens_trained": 0.404549424 + }, + { + "epoch": 1.2079432624113475, + "grad_norm": 0.8036428093910217, + "loss": 4.9676, + "lr": 0.00047454545454545454, + "step": 4258, + "tokens_trained": 0.404739344 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.8440237045288086, + "loss": 4.9965, + "lr": 0.0004742657342657343, + "step": 4260, + "tokens_trained": 0.40492952 + }, + { + "epoch": 1.209078014184397, + "grad_norm": 0.7936769127845764, + "loss": 5.0458, + "lr": 0.000473986013986014, + "step": 4262, + "tokens_trained": 0.405117144 + }, + { + "epoch": 1.2096453900709219, + "grad_norm": 0.8117086291313171, + "loss": 5.0196, + "lr": 0.0004737062937062937, + "step": 4264, + "tokens_trained": 0.405310184 + }, + { + "epoch": 1.2102127659574469, + "grad_norm": 0.7395413517951965, + "loss": 4.9655, + "lr": 0.0004734265734265734, + "step": 4266, + "tokens_trained": 0.405498272 + }, + { + "epoch": 1.2107801418439716, + "grad_norm": 0.8879559636116028, + "loss": 4.9637, + "lr": 0.0004731468531468531, + "step": 4268, + "tokens_trained": 0.4056908 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 0.8651279211044312, + "loss": 4.945, + "lr": 0.0004728671328671329, + "step": 4270, + "tokens_trained": 0.405879384 + }, + { + "epoch": 1.2119148936170212, + "grad_norm": 0.8421851992607117, + "loss": 4.9391, + "lr": 0.0004725874125874126, + "step": 4272, + "tokens_trained": 0.406071432 + }, + { + "epoch": 1.2124822695035462, + "grad_norm": 0.815262496471405, + "loss": 5.0465, + "lr": 0.00047230769230769234, + "step": 4274, + "tokens_trained": 0.406262776 + }, + { + "epoch": 1.213049645390071, + "grad_norm": 0.8042894005775452, + "loss": 4.8908, + "lr": 0.00047202797202797203, + "step": 4276, + "tokens_trained": 0.406452656 + }, + { + "epoch": 1.2136170212765958, + "grad_norm": 0.8514822721481323, + "loss": 4.9961, + "lr": 0.00047174825174825177, + "step": 4278, + "tokens_trained": 0.406642224 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 0.7532519102096558, + "loss": 4.9658, + "lr": 0.00047146853146853146, + "step": 4280, + "tokens_trained": 0.406830288 + }, + { + "epoch": 1.2147517730496453, + "grad_norm": 0.7978721261024475, + "loss": 4.9477, + "lr": 0.0004711888111888112, + "step": 4282, + "tokens_trained": 0.4070214 + }, + { + "epoch": 1.21531914893617, + "grad_norm": 0.8998175859451294, + "loss": 5.0531, + "lr": 0.0004709090909090909, + "step": 4284, + "tokens_trained": 0.407211064 + }, + { + "epoch": 1.215886524822695, + "grad_norm": 0.7281949520111084, + "loss": 4.9474, + "lr": 0.0004706293706293706, + "step": 4286, + "tokens_trained": 0.40740104 + }, + { + "epoch": 1.2164539007092199, + "grad_norm": 0.7590287923812866, + "loss": 5.0104, + "lr": 0.0004703496503496504, + "step": 4288, + "tokens_trained": 0.40759144 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 0.8452118039131165, + "loss": 5.024, + "lr": 0.0004700699300699301, + "step": 4290, + "tokens_trained": 0.407780576 + }, + { + "epoch": 1.2175886524822694, + "grad_norm": 0.8062863945960999, + "loss": 5.0099, + "lr": 0.0004697902097902098, + "step": 4292, + "tokens_trained": 0.407971808 + }, + { + "epoch": 1.2181560283687944, + "grad_norm": 0.8372058272361755, + "loss": 5.0832, + "lr": 0.0004695104895104895, + "step": 4294, + "tokens_trained": 0.408162104 + }, + { + "epoch": 1.2187234042553192, + "grad_norm": 0.7989845871925354, + "loss": 4.971, + "lr": 0.00046923076923076926, + "step": 4296, + "tokens_trained": 0.408351392 + }, + { + "epoch": 1.219290780141844, + "grad_norm": 0.7519237399101257, + "loss": 4.9739, + "lr": 0.00046895104895104895, + "step": 4298, + "tokens_trained": 0.408541056 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 0.769143283367157, + "loss": 4.9483, + "lr": 0.0004686713286713287, + "step": 4300, + "tokens_trained": 0.408731728 + }, + { + "epoch": 1.2204255319148936, + "grad_norm": 0.7855169177055359, + "loss": 5.0007, + "lr": 0.0004683916083916084, + "step": 4302, + "tokens_trained": 0.408921824 + }, + { + "epoch": 1.2209929078014183, + "grad_norm": 0.8531661629676819, + "loss": 5.018, + "lr": 0.00046811188811188807, + "step": 4304, + "tokens_trained": 0.409112528 + }, + { + "epoch": 1.2215602836879433, + "grad_norm": 0.8178502321243286, + "loss": 4.9869, + "lr": 0.00046783216783216787, + "step": 4306, + "tokens_trained": 0.40930284 + }, + { + "epoch": 1.2221276595744681, + "grad_norm": 0.7806143164634705, + "loss": 4.9561, + "lr": 0.00046755244755244756, + "step": 4308, + "tokens_trained": 0.409492304 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 0.7506605982780457, + "loss": 4.937, + "lr": 0.0004672727272727273, + "step": 4310, + "tokens_trained": 0.409680208 + }, + { + "epoch": 1.2232624113475177, + "grad_norm": 0.8441674113273621, + "loss": 4.9137, + "lr": 0.000466993006993007, + "step": 4312, + "tokens_trained": 0.409869952 + }, + { + "epoch": 1.2238297872340427, + "grad_norm": 0.8911812901496887, + "loss": 5.0072, + "lr": 0.00046671328671328674, + "step": 4314, + "tokens_trained": 0.410058728 + }, + { + "epoch": 1.2243971631205675, + "grad_norm": 0.7732901573181152, + "loss": 4.9094, + "lr": 0.00046643356643356643, + "step": 4316, + "tokens_trained": 0.410249624 + }, + { + "epoch": 1.2249645390070922, + "grad_norm": 0.7372212409973145, + "loss": 4.9646, + "lr": 0.0004661538461538462, + "step": 4318, + "tokens_trained": 0.410440088 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.8266177177429199, + "loss": 5.01, + "lr": 0.00046587412587412587, + "step": 4320, + "tokens_trained": 0.410630384 + }, + { + "epoch": 1.2260992907801418, + "grad_norm": 0.7471604347229004, + "loss": 4.9741, + "lr": 0.00046559440559440556, + "step": 4322, + "tokens_trained": 0.410819312 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 0.8529990911483765, + "loss": 5.0115, + "lr": 0.00046531468531468536, + "step": 4324, + "tokens_trained": 0.411007776 + }, + { + "epoch": 1.2272340425531916, + "grad_norm": 0.8250638246536255, + "loss": 4.9974, + "lr": 0.00046503496503496505, + "step": 4326, + "tokens_trained": 0.4111994 + }, + { + "epoch": 1.2278014184397164, + "grad_norm": 0.7049713730812073, + "loss": 4.9412, + "lr": 0.0004647552447552448, + "step": 4328, + "tokens_trained": 0.411387512 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 0.8164275884628296, + "loss": 4.9696, + "lr": 0.0004644755244755245, + "step": 4330, + "tokens_trained": 0.411579192 + }, + { + "epoch": 1.228936170212766, + "grad_norm": 0.786007821559906, + "loss": 4.9015, + "lr": 0.0004641958041958042, + "step": 4332, + "tokens_trained": 0.411769256 + }, + { + "epoch": 1.2295035460992907, + "grad_norm": 0.7956440448760986, + "loss": 4.9864, + "lr": 0.0004639160839160839, + "step": 4334, + "tokens_trained": 0.411958112 + }, + { + "epoch": 1.2300709219858157, + "grad_norm": 0.7968415021896362, + "loss": 5.0563, + "lr": 0.00046363636363636366, + "step": 4336, + "tokens_trained": 0.412148936 + }, + { + "epoch": 1.2306382978723405, + "grad_norm": 0.9666130542755127, + "loss": 4.9907, + "lr": 0.00046335664335664335, + "step": 4338, + "tokens_trained": 0.412337728 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 0.9147318005561829, + "loss": 5.0003, + "lr": 0.00046307692307692304, + "step": 4340, + "tokens_trained": 0.412527736 + }, + { + "epoch": 1.23177304964539, + "grad_norm": 0.7779629230499268, + "loss": 4.9392, + "lr": 0.00046279720279720284, + "step": 4342, + "tokens_trained": 0.412717944 + }, + { + "epoch": 1.2323404255319148, + "grad_norm": 0.8160842061042786, + "loss": 4.9644, + "lr": 0.00046251748251748253, + "step": 4344, + "tokens_trained": 0.412909288 + }, + { + "epoch": 1.2329078014184398, + "grad_norm": 0.8430790305137634, + "loss": 4.9472, + "lr": 0.0004622377622377623, + "step": 4346, + "tokens_trained": 0.413097912 + }, + { + "epoch": 1.2334751773049646, + "grad_norm": 0.8291404843330383, + "loss": 4.9647, + "lr": 0.00046195804195804196, + "step": 4348, + "tokens_trained": 0.413290272 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 0.8272704482078552, + "loss": 4.9685, + "lr": 0.0004616783216783217, + "step": 4350, + "tokens_trained": 0.41348152 + }, + { + "epoch": 1.2346099290780141, + "grad_norm": 0.7785531282424927, + "loss": 5.0172, + "lr": 0.0004613986013986014, + "step": 4352, + "tokens_trained": 0.413670184 + }, + { + "epoch": 1.235177304964539, + "grad_norm": 0.8512988090515137, + "loss": 4.9727, + "lr": 0.00046111888111888114, + "step": 4354, + "tokens_trained": 0.413860232 + }, + { + "epoch": 1.235744680851064, + "grad_norm": 0.7373901009559631, + "loss": 4.9092, + "lr": 0.00046083916083916083, + "step": 4356, + "tokens_trained": 0.414051312 + }, + { + "epoch": 1.2363120567375887, + "grad_norm": 0.7716902494430542, + "loss": 4.9456, + "lr": 0.0004605594405594405, + "step": 4358, + "tokens_trained": 0.414239448 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 0.8303737044334412, + "loss": 4.9656, + "lr": 0.0004602797202797203, + "step": 4360, + "tokens_trained": 0.414430488 + }, + { + "epoch": 1.2374468085106383, + "grad_norm": 0.850261926651001, + "loss": 4.9407, + "lr": 0.00046, + "step": 4362, + "tokens_trained": 0.414620536 + }, + { + "epoch": 1.238014184397163, + "grad_norm": 0.8391888737678528, + "loss": 4.9772, + "lr": 0.00045972027972027976, + "step": 4364, + "tokens_trained": 0.4148106 + }, + { + "epoch": 1.2385815602836878, + "grad_norm": 0.8289617300033569, + "loss": 5.0061, + "lr": 0.00045944055944055945, + "step": 4366, + "tokens_trained": 0.414998608 + }, + { + "epoch": 1.2391489361702128, + "grad_norm": 0.801800549030304, + "loss": 5.0436, + "lr": 0.0004591608391608392, + "step": 4368, + "tokens_trained": 0.415190568 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 0.8448522686958313, + "loss": 4.9398, + "lr": 0.0004588811188811189, + "step": 4370, + "tokens_trained": 0.415378536 + }, + { + "epoch": 1.2402836879432624, + "grad_norm": 0.8992466330528259, + "loss": 4.9277, + "lr": 0.0004586013986013986, + "step": 4372, + "tokens_trained": 0.4155704 + }, + { + "epoch": 1.2408510638297872, + "grad_norm": 0.8534346222877502, + "loss": 4.8933, + "lr": 0.0004583216783216783, + "step": 4374, + "tokens_trained": 0.41575984 + }, + { + "epoch": 1.2411347517730495, + "eval_loss": 4.997620582580566, + "eval_runtime": 20.4786, + "step": 4375, + "tokens_trained": 0.415855704 + }, + { + "epoch": 1.2414184397163122, + "grad_norm": 0.8547607064247131, + "loss": 5.0331, + "lr": 0.000458041958041958, + "step": 4376, + "tokens_trained": 0.415951704 + }, + { + "epoch": 1.241985815602837, + "grad_norm": 0.7995121479034424, + "loss": 4.9727, + "lr": 0.0004577622377622378, + "step": 4378, + "tokens_trained": 0.416142464 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.7953593730926514, + "loss": 5.054, + "lr": 0.0004574825174825175, + "step": 4380, + "tokens_trained": 0.416331184 + }, + { + "epoch": 1.2431205673758865, + "grad_norm": 0.8307169079780579, + "loss": 4.9694, + "lr": 0.00045720279720279724, + "step": 4382, + "tokens_trained": 0.416522688 + }, + { + "epoch": 1.2436879432624113, + "grad_norm": 0.8380933403968811, + "loss": 4.9432, + "lr": 0.00045692307692307693, + "step": 4384, + "tokens_trained": 0.416712408 + }, + { + "epoch": 1.244255319148936, + "grad_norm": 0.8354132771492004, + "loss": 4.9649, + "lr": 0.0004566433566433567, + "step": 4386, + "tokens_trained": 0.416902056 + }, + { + "epoch": 1.244822695035461, + "grad_norm": 0.8815358877182007, + "loss": 4.9998, + "lr": 0.00045636363636363637, + "step": 4388, + "tokens_trained": 0.417090856 + }, + { + "epoch": 1.2453900709219858, + "grad_norm": 0.8799077868461609, + "loss": 4.984, + "lr": 0.00045608391608391606, + "step": 4390, + "tokens_trained": 0.417281408 + }, + { + "epoch": 1.2459574468085106, + "grad_norm": 0.9041373133659363, + "loss": 4.9209, + "lr": 0.0004558041958041958, + "step": 4392, + "tokens_trained": 0.41747192 + }, + { + "epoch": 1.2465248226950354, + "grad_norm": 0.8234816193580627, + "loss": 5.022, + "lr": 0.0004555244755244755, + "step": 4394, + "tokens_trained": 0.41766064 + }, + { + "epoch": 1.2470921985815604, + "grad_norm": 0.8067740797996521, + "loss": 5.0255, + "lr": 0.00045524475524475524, + "step": 4396, + "tokens_trained": 0.417852816 + }, + { + "epoch": 1.2476595744680852, + "grad_norm": 0.812566876411438, + "loss": 4.9524, + "lr": 0.000454965034965035, + "step": 4398, + "tokens_trained": 0.418043848 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 0.7977521419525146, + "loss": 5.023, + "lr": 0.0004546853146853147, + "step": 4400, + "tokens_trained": 0.418234224 + }, + { + "epoch": 1.2487943262411347, + "grad_norm": 0.7514439225196838, + "loss": 4.9909, + "lr": 0.0004544055944055944, + "step": 4402, + "tokens_trained": 0.418424576 + }, + { + "epoch": 1.2493617021276595, + "grad_norm": 0.7931577563285828, + "loss": 5.0128, + "lr": 0.00045412587412587416, + "step": 4404, + "tokens_trained": 0.418616776 + }, + { + "epoch": 1.2499290780141843, + "grad_norm": 0.787543773651123, + "loss": 4.9616, + "lr": 0.00045384615384615385, + "step": 4406, + "tokens_trained": 0.418805744 + }, + { + "epoch": 1.2504964539007093, + "grad_norm": 0.7384114861488342, + "loss": 5.0641, + "lr": 0.00045356643356643354, + "step": 4408, + "tokens_trained": 0.418997784 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 0.8014666438102722, + "loss": 4.9652, + "lr": 0.0004532867132867133, + "step": 4410, + "tokens_trained": 0.419187464 + }, + { + "epoch": 1.2516312056737589, + "grad_norm": 0.7648611068725586, + "loss": 4.9813, + "lr": 0.000453006993006993, + "step": 4412, + "tokens_trained": 0.419376864 + }, + { + "epoch": 1.2521985815602836, + "grad_norm": 0.7647461891174316, + "loss": 5.0052, + "lr": 0.0004527272727272727, + "step": 4414, + "tokens_trained": 0.419568352 + }, + { + "epoch": 1.2527659574468086, + "grad_norm": 0.7152479887008667, + "loss": 4.9851, + "lr": 0.00045244755244755247, + "step": 4416, + "tokens_trained": 0.419759464 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 0.7977505326271057, + "loss": 5.0082, + "lr": 0.0004521678321678322, + "step": 4418, + "tokens_trained": 0.419951 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 0.7556982040405273, + "loss": 5.0207, + "lr": 0.0004518881118881119, + "step": 4420, + "tokens_trained": 0.420141312 + }, + { + "epoch": 1.254468085106383, + "grad_norm": 0.8059271574020386, + "loss": 5.0286, + "lr": 0.00045160839160839165, + "step": 4422, + "tokens_trained": 0.420330672 + }, + { + "epoch": 1.2550354609929077, + "grad_norm": 0.836380660533905, + "loss": 4.9406, + "lr": 0.00045132867132867134, + "step": 4424, + "tokens_trained": 0.420519952 + }, + { + "epoch": 1.2556028368794325, + "grad_norm": 0.7693254947662354, + "loss": 4.9533, + "lr": 0.000451048951048951, + "step": 4426, + "tokens_trained": 0.42070948 + }, + { + "epoch": 1.2561702127659575, + "grad_norm": 0.8241584897041321, + "loss": 5.0407, + "lr": 0.00045076923076923077, + "step": 4428, + "tokens_trained": 0.420899504 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 0.7866604328155518, + "loss": 4.9119, + "lr": 0.00045048951048951046, + "step": 4430, + "tokens_trained": 0.421088352 + }, + { + "epoch": 1.257304964539007, + "grad_norm": 0.8286674618721008, + "loss": 5.016, + "lr": 0.0004502097902097902, + "step": 4432, + "tokens_trained": 0.421277528 + }, + { + "epoch": 1.2578723404255319, + "grad_norm": 0.7921491265296936, + "loss": 4.9991, + "lr": 0.00044993006993006995, + "step": 4434, + "tokens_trained": 0.421468272 + }, + { + "epoch": 1.2584397163120569, + "grad_norm": 0.807640016078949, + "loss": 5.042, + "lr": 0.0004496503496503497, + "step": 4436, + "tokens_trained": 0.421658096 + }, + { + "epoch": 1.2590070921985816, + "grad_norm": 0.7414442896842957, + "loss": 4.9647, + "lr": 0.0004493706293706294, + "step": 4438, + "tokens_trained": 0.421849712 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.8236945867538452, + "loss": 4.9562, + "lr": 0.00044909090909090913, + "step": 4440, + "tokens_trained": 0.422038344 + }, + { + "epoch": 1.2601418439716312, + "grad_norm": 0.7859675884246826, + "loss": 4.9568, + "lr": 0.0004488111888111888, + "step": 4442, + "tokens_trained": 0.422227928 + }, + { + "epoch": 1.260709219858156, + "grad_norm": 0.7467136383056641, + "loss": 4.9543, + "lr": 0.0004485314685314685, + "step": 4444, + "tokens_trained": 0.422415664 + }, + { + "epoch": 1.2612765957446808, + "grad_norm": 0.711588978767395, + "loss": 5.0494, + "lr": 0.00044825174825174826, + "step": 4446, + "tokens_trained": 0.422606696 + }, + { + "epoch": 1.2618439716312055, + "grad_norm": 0.750599205493927, + "loss": 4.9878, + "lr": 0.00044797202797202795, + "step": 4448, + "tokens_trained": 0.422796416 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 0.7823654413223267, + "loss": 4.947, + "lr": 0.0004476923076923077, + "step": 4450, + "tokens_trained": 0.422986968 + }, + { + "epoch": 1.2629787234042553, + "grad_norm": 0.8101715445518494, + "loss": 4.925, + "lr": 0.00044741258741258744, + "step": 4452, + "tokens_trained": 0.423174384 + }, + { + "epoch": 1.26354609929078, + "grad_norm": 0.8134462237358093, + "loss": 5.051, + "lr": 0.0004471328671328672, + "step": 4454, + "tokens_trained": 0.42336536 + }, + { + "epoch": 1.264113475177305, + "grad_norm": 0.8446463942527771, + "loss": 4.9789, + "lr": 0.00044685314685314687, + "step": 4456, + "tokens_trained": 0.423556136 + }, + { + "epoch": 1.2646808510638299, + "grad_norm": 0.7812824845314026, + "loss": 4.9819, + "lr": 0.0004465734265734266, + "step": 4458, + "tokens_trained": 0.423745736 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 0.7645587921142578, + "loss": 4.9824, + "lr": 0.0004462937062937063, + "step": 4460, + "tokens_trained": 0.423935408 + }, + { + "epoch": 1.2658156028368794, + "grad_norm": 0.8110623955726624, + "loss": 4.9664, + "lr": 0.000446013986013986, + "step": 4462, + "tokens_trained": 0.424125264 + }, + { + "epoch": 1.2663829787234042, + "grad_norm": 0.7860397696495056, + "loss": 4.9871, + "lr": 0.00044573426573426574, + "step": 4464, + "tokens_trained": 0.424314544 + }, + { + "epoch": 1.266950354609929, + "grad_norm": 0.7764657735824585, + "loss": 5.0335, + "lr": 0.00044545454545454543, + "step": 4466, + "tokens_trained": 0.424502264 + }, + { + "epoch": 1.2675177304964538, + "grad_norm": 0.7725886702537537, + "loss": 4.9705, + "lr": 0.0004451748251748252, + "step": 4468, + "tokens_trained": 0.424691888 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 0.8336632251739502, + "loss": 5.0535, + "lr": 0.0004448951048951049, + "step": 4470, + "tokens_trained": 0.424880992 + }, + { + "epoch": 1.2686524822695036, + "grad_norm": 0.7934354543685913, + "loss": 5.0105, + "lr": 0.00044461538461538466, + "step": 4472, + "tokens_trained": 0.425069536 + }, + { + "epoch": 1.2692198581560283, + "grad_norm": 0.7649230360984802, + "loss": 4.978, + "lr": 0.00044433566433566435, + "step": 4474, + "tokens_trained": 0.425259168 + }, + { + "epoch": 1.2697872340425531, + "grad_norm": 0.7798753976821899, + "loss": 5.0526, + "lr": 0.0004440559440559441, + "step": 4476, + "tokens_trained": 0.425450064 + }, + { + "epoch": 1.2703546099290781, + "grad_norm": 0.7455066442489624, + "loss": 4.9914, + "lr": 0.0004437762237762238, + "step": 4478, + "tokens_trained": 0.42564068 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 0.7951638698577881, + "loss": 5.0092, + "lr": 0.0004434965034965035, + "step": 4480, + "tokens_trained": 0.42583048 + }, + { + "epoch": 1.2714893617021277, + "grad_norm": 0.7585451602935791, + "loss": 5.016, + "lr": 0.0004432167832167832, + "step": 4482, + "tokens_trained": 0.42602172 + }, + { + "epoch": 1.2720567375886525, + "grad_norm": 0.8267669081687927, + "loss": 4.972, + "lr": 0.0004429370629370629, + "step": 4484, + "tokens_trained": 0.426212496 + }, + { + "epoch": 1.2726241134751772, + "grad_norm": 0.7738245129585266, + "loss": 5.0239, + "lr": 0.00044265734265734266, + "step": 4486, + "tokens_trained": 0.426401408 + }, + { + "epoch": 1.273191489361702, + "grad_norm": 0.9146332144737244, + "loss": 5.0361, + "lr": 0.0004423776223776224, + "step": 4488, + "tokens_trained": 0.426591056 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 0.8278553485870361, + "loss": 4.9512, + "lr": 0.00044209790209790215, + "step": 4490, + "tokens_trained": 0.42678144 + }, + { + "epoch": 1.2743262411347518, + "grad_norm": 0.7594732046127319, + "loss": 4.9391, + "lr": 0.00044181818181818184, + "step": 4492, + "tokens_trained": 0.426971472 + }, + { + "epoch": 1.2748936170212766, + "grad_norm": 0.8350242376327515, + "loss": 4.9151, + "lr": 0.00044153846153846153, + "step": 4494, + "tokens_trained": 0.427161504 + }, + { + "epoch": 1.2754609929078013, + "grad_norm": 0.85927414894104, + "loss": 4.9303, + "lr": 0.0004412587412587413, + "step": 4496, + "tokens_trained": 0.427351448 + }, + { + "epoch": 1.2760283687943264, + "grad_norm": 0.8133000135421753, + "loss": 4.9668, + "lr": 0.00044097902097902096, + "step": 4498, + "tokens_trained": 0.427539384 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.7529495358467102, + "loss": 4.9364, + "lr": 0.0004406993006993007, + "step": 4500, + "tokens_trained": 0.427730552 + }, + { + "epoch": 1.2765957446808511, + "eval_loss": 4.999549388885498, + "eval_runtime": 20.6764, + "step": 4500, + "tokens_trained": 0.427730552 + }, + { + "epoch": 1.277163120567376, + "grad_norm": 0.7332281470298767, + "loss": 5.024, + "lr": 0.0004404195804195804, + "step": 4502, + "tokens_trained": 0.427922016 + }, + { + "epoch": 1.2777304964539007, + "grad_norm": 0.7735735774040222, + "loss": 4.9235, + "lr": 0.00044013986013986014, + "step": 4504, + "tokens_trained": 0.428112824 + }, + { + "epoch": 1.2782978723404255, + "grad_norm": 0.8075562119483948, + "loss": 5.0712, + "lr": 0.0004398601398601399, + "step": 4506, + "tokens_trained": 0.428306056 + }, + { + "epoch": 1.2788652482269502, + "grad_norm": 0.8019667863845825, + "loss": 4.9597, + "lr": 0.00043958041958041963, + "step": 4508, + "tokens_trained": 0.428496768 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 0.7908930778503418, + "loss": 4.9471, + "lr": 0.0004393006993006993, + "step": 4510, + "tokens_trained": 0.428685312 + }, + { + "epoch": 1.28, + "grad_norm": 0.8128061890602112, + "loss": 4.9244, + "lr": 0.000439020979020979, + "step": 4512, + "tokens_trained": 0.428875184 + }, + { + "epoch": 1.2805673758865248, + "grad_norm": 0.7859349250793457, + "loss": 5.0096, + "lr": 0.00043874125874125876, + "step": 4514, + "tokens_trained": 0.429066688 + }, + { + "epoch": 1.2811347517730496, + "grad_norm": 0.7396280169487, + "loss": 4.9263, + "lr": 0.00043846153846153845, + "step": 4516, + "tokens_trained": 0.429254336 + }, + { + "epoch": 1.2817021276595746, + "grad_norm": 0.8057092428207397, + "loss": 4.9705, + "lr": 0.0004381818181818182, + "step": 4518, + "tokens_trained": 0.429446032 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 0.8460845351219177, + "loss": 4.9311, + "lr": 0.0004379020979020979, + "step": 4520, + "tokens_trained": 0.429636152 + }, + { + "epoch": 1.2828368794326241, + "grad_norm": 0.7627289891242981, + "loss": 4.9622, + "lr": 0.00043762237762237763, + "step": 4522, + "tokens_trained": 0.429825536 + }, + { + "epoch": 1.283404255319149, + "grad_norm": 0.7211505174636841, + "loss": 4.9851, + "lr": 0.0004373426573426573, + "step": 4524, + "tokens_trained": 0.430016616 + }, + { + "epoch": 1.2839716312056737, + "grad_norm": 0.7647969722747803, + "loss": 4.9708, + "lr": 0.0004370629370629371, + "step": 4526, + "tokens_trained": 0.430208336 + }, + { + "epoch": 1.2845390070921985, + "grad_norm": 0.7541454434394836, + "loss": 4.9404, + "lr": 0.0004367832167832168, + "step": 4528, + "tokens_trained": 0.430398968 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 0.7825188636779785, + "loss": 4.9741, + "lr": 0.0004365034965034965, + "step": 4530, + "tokens_trained": 0.430589112 + }, + { + "epoch": 1.2856737588652483, + "grad_norm": 0.7198429107666016, + "loss": 4.9745, + "lr": 0.00043622377622377624, + "step": 4532, + "tokens_trained": 0.43077964 + }, + { + "epoch": 1.286241134751773, + "grad_norm": 0.7174004912376404, + "loss": 5.037, + "lr": 0.00043594405594405593, + "step": 4534, + "tokens_trained": 0.43096964 + }, + { + "epoch": 1.2868085106382978, + "grad_norm": 0.7118927240371704, + "loss": 5.0456, + "lr": 0.0004356643356643357, + "step": 4536, + "tokens_trained": 0.431160024 + }, + { + "epoch": 1.2873758865248228, + "grad_norm": 0.7081615924835205, + "loss": 4.9763, + "lr": 0.00043538461538461537, + "step": 4538, + "tokens_trained": 0.431351344 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 0.7620618343353271, + "loss": 4.9863, + "lr": 0.0004351048951048951, + "step": 4540, + "tokens_trained": 0.43154232 + }, + { + "epoch": 1.2885106382978724, + "grad_norm": 0.8104450702667236, + "loss": 4.9903, + "lr": 0.0004348251748251748, + "step": 4542, + "tokens_trained": 0.431731592 + }, + { + "epoch": 1.2890780141843972, + "grad_norm": 0.7488150000572205, + "loss": 5.0189, + "lr": 0.0004345454545454546, + "step": 4544, + "tokens_trained": 0.431922608 + }, + { + "epoch": 1.289645390070922, + "grad_norm": 0.7956752181053162, + "loss": 4.9259, + "lr": 0.0004342657342657343, + "step": 4546, + "tokens_trained": 0.432113808 + }, + { + "epoch": 1.2902127659574467, + "grad_norm": 0.7799624800682068, + "loss": 5.0129, + "lr": 0.000433986013986014, + "step": 4548, + "tokens_trained": 0.432304088 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 0.792834997177124, + "loss": 5.0647, + "lr": 0.0004337062937062937, + "step": 4550, + "tokens_trained": 0.432493096 + }, + { + "epoch": 1.2913475177304965, + "grad_norm": 0.7479969263076782, + "loss": 4.9514, + "lr": 0.0004334265734265734, + "step": 4552, + "tokens_trained": 0.432680128 + }, + { + "epoch": 1.2919148936170213, + "grad_norm": 0.7381340861320496, + "loss": 4.9865, + "lr": 0.00043314685314685316, + "step": 4554, + "tokens_trained": 0.43287188 + }, + { + "epoch": 1.292482269503546, + "grad_norm": 0.7690939903259277, + "loss": 4.9704, + "lr": 0.00043286713286713285, + "step": 4556, + "tokens_trained": 0.43306148 + }, + { + "epoch": 1.293049645390071, + "grad_norm": 0.7883870005607605, + "loss": 4.9766, + "lr": 0.0004325874125874126, + "step": 4558, + "tokens_trained": 0.433252704 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.782208263874054, + "loss": 4.9967, + "lr": 0.0004323076923076923, + "step": 4560, + "tokens_trained": 0.433444272 + }, + { + "epoch": 1.2941843971631206, + "grad_norm": 0.7333335280418396, + "loss": 4.9237, + "lr": 0.0004320279720279721, + "step": 4562, + "tokens_trained": 0.433634264 + }, + { + "epoch": 1.2947517730496454, + "grad_norm": 0.7663769721984863, + "loss": 4.9961, + "lr": 0.0004317482517482518, + "step": 4564, + "tokens_trained": 0.433825632 + }, + { + "epoch": 1.2953191489361702, + "grad_norm": 0.75322026014328, + "loss": 4.9294, + "lr": 0.00043146853146853147, + "step": 4566, + "tokens_trained": 0.434015424 + }, + { + "epoch": 1.295886524822695, + "grad_norm": 0.7660694718360901, + "loss": 4.953, + "lr": 0.0004311888111888112, + "step": 4568, + "tokens_trained": 0.434208048 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 0.7548807859420776, + "loss": 4.9164, + "lr": 0.0004309090909090909, + "step": 4570, + "tokens_trained": 0.434397424 + }, + { + "epoch": 1.2970212765957447, + "grad_norm": 0.760160505771637, + "loss": 4.9748, + "lr": 0.00043062937062937065, + "step": 4572, + "tokens_trained": 0.434588752 + }, + { + "epoch": 1.2975886524822695, + "grad_norm": 0.8081098198890686, + "loss": 4.9596, + "lr": 0.00043034965034965034, + "step": 4574, + "tokens_trained": 0.434779696 + }, + { + "epoch": 1.2981560283687943, + "grad_norm": 0.7557078003883362, + "loss": 4.979, + "lr": 0.0004300699300699301, + "step": 4576, + "tokens_trained": 0.434971072 + }, + { + "epoch": 1.298723404255319, + "grad_norm": 0.7966912984848022, + "loss": 4.9257, + "lr": 0.00042979020979020977, + "step": 4578, + "tokens_trained": 0.435160496 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 0.8104644417762756, + "loss": 4.9675, + "lr": 0.00042951048951048957, + "step": 4580, + "tokens_trained": 0.435349392 + }, + { + "epoch": 1.2998581560283688, + "grad_norm": 0.711733877658844, + "loss": 4.929, + "lr": 0.00042923076923076926, + "step": 4582, + "tokens_trained": 0.435539752 + }, + { + "epoch": 1.3004255319148936, + "grad_norm": 0.7435249090194702, + "loss": 5.0012, + "lr": 0.00042895104895104895, + "step": 4584, + "tokens_trained": 0.435730112 + }, + { + "epoch": 1.3009929078014184, + "grad_norm": 0.8262581825256348, + "loss": 4.9065, + "lr": 0.0004286713286713287, + "step": 4586, + "tokens_trained": 0.435918552 + }, + { + "epoch": 1.3015602836879432, + "grad_norm": 0.7614077925682068, + "loss": 5.022, + "lr": 0.0004283916083916084, + "step": 4588, + "tokens_trained": 0.43611052 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 0.7792633175849915, + "loss": 4.9763, + "lr": 0.00042811188811188813, + "step": 4590, + "tokens_trained": 0.43629848 + }, + { + "epoch": 1.302695035460993, + "grad_norm": 0.748753011226654, + "loss": 4.9588, + "lr": 0.0004278321678321678, + "step": 4592, + "tokens_trained": 0.436487384 + }, + { + "epoch": 1.3032624113475177, + "grad_norm": 0.6770404577255249, + "loss": 5.0546, + "lr": 0.00042755244755244756, + "step": 4594, + "tokens_trained": 0.436677688 + }, + { + "epoch": 1.3038297872340425, + "grad_norm": 0.7595148682594299, + "loss": 4.9832, + "lr": 0.00042727272727272726, + "step": 4596, + "tokens_trained": 0.436866288 + }, + { + "epoch": 1.3043971631205673, + "grad_norm": 0.7239478230476379, + "loss": 4.9597, + "lr": 0.00042699300699300705, + "step": 4598, + "tokens_trained": 0.437057192 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 0.7907828092575073, + "loss": 5.0041, + "lr": 0.00042671328671328674, + "step": 4600, + "tokens_trained": 0.437247856 + }, + { + "epoch": 1.305531914893617, + "grad_norm": 0.6975818872451782, + "loss": 4.9256, + "lr": 0.00042643356643356643, + "step": 4602, + "tokens_trained": 0.43743696 + }, + { + "epoch": 1.3060992907801419, + "grad_norm": 0.7589024305343628, + "loss": 4.9781, + "lr": 0.0004261538461538462, + "step": 4604, + "tokens_trained": 0.437627408 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 0.7332574725151062, + "loss": 5.0012, + "lr": 0.00042587412587412587, + "step": 4606, + "tokens_trained": 0.43781732 + }, + { + "epoch": 1.3072340425531914, + "grad_norm": 0.8402982950210571, + "loss": 4.9202, + "lr": 0.0004255944055944056, + "step": 4608, + "tokens_trained": 0.438006368 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 0.8018138408660889, + "loss": 4.9518, + "lr": 0.0004253146853146853, + "step": 4610, + "tokens_trained": 0.438196728 + }, + { + "epoch": 1.3083687943262412, + "grad_norm": 0.8211417198181152, + "loss": 4.9916, + "lr": 0.00042503496503496505, + "step": 4612, + "tokens_trained": 0.43838568 + }, + { + "epoch": 1.308936170212766, + "grad_norm": 0.8054932355880737, + "loss": 4.9329, + "lr": 0.00042475524475524474, + "step": 4614, + "tokens_trained": 0.438577096 + }, + { + "epoch": 1.3095035460992908, + "grad_norm": 0.795623779296875, + "loss": 4.9572, + "lr": 0.0004244755244755245, + "step": 4616, + "tokens_trained": 0.438767032 + }, + { + "epoch": 1.3100709219858155, + "grad_norm": 0.7230743169784546, + "loss": 5.0013, + "lr": 0.00042419580419580423, + "step": 4618, + "tokens_trained": 0.438955216 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.7714941501617432, + "loss": 4.9493, + "lr": 0.0004239160839160839, + "step": 4620, + "tokens_trained": 0.439145848 + }, + { + "epoch": 1.3112056737588653, + "grad_norm": 0.7291305661201477, + "loss": 4.9792, + "lr": 0.00042363636363636366, + "step": 4622, + "tokens_trained": 0.439334768 + }, + { + "epoch": 1.31177304964539, + "grad_norm": 0.6893495321273804, + "loss": 4.9703, + "lr": 0.00042335664335664335, + "step": 4624, + "tokens_trained": 0.439524928 + }, + { + "epoch": 1.3120567375886525, + "eval_loss": 4.985546112060547, + "eval_runtime": 20.6802, + "step": 4625, + "tokens_trained": 0.439619056 + }, + { + "epoch": 1.3123404255319149, + "grad_norm": 0.7363048791885376, + "loss": 4.9635, + "lr": 0.0004230769230769231, + "step": 4626, + "tokens_trained": 0.439714232 + }, + { + "epoch": 1.3129078014184397, + "grad_norm": 0.7479920387268066, + "loss": 5.0308, + "lr": 0.0004227972027972028, + "step": 4628, + "tokens_trained": 0.439904056 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 0.7858623266220093, + "loss": 4.9504, + "lr": 0.00042251748251748253, + "step": 4630, + "tokens_trained": 0.440093304 + }, + { + "epoch": 1.3140425531914894, + "grad_norm": 0.7382465600967407, + "loss": 4.9397, + "lr": 0.0004222377622377622, + "step": 4632, + "tokens_trained": 0.440283584 + }, + { + "epoch": 1.3146099290780142, + "grad_norm": 0.7232691049575806, + "loss": 5.0304, + "lr": 0.00042195804195804197, + "step": 4634, + "tokens_trained": 0.440473064 + }, + { + "epoch": 1.315177304964539, + "grad_norm": 0.7827140092849731, + "loss": 5.0059, + "lr": 0.0004216783216783217, + "step": 4636, + "tokens_trained": 0.440664664 + }, + { + "epoch": 1.3157446808510638, + "grad_norm": 0.7799215316772461, + "loss": 4.9534, + "lr": 0.0004213986013986014, + "step": 4638, + "tokens_trained": 0.4408536 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 0.8065125346183777, + "loss": 4.99, + "lr": 0.00042111888111888115, + "step": 4640, + "tokens_trained": 0.441042616 + }, + { + "epoch": 1.3168794326241136, + "grad_norm": 0.7722545266151428, + "loss": 4.9687, + "lr": 0.00042083916083916084, + "step": 4642, + "tokens_trained": 0.441233296 + }, + { + "epoch": 1.3174468085106383, + "grad_norm": 0.7521271109580994, + "loss": 5.0357, + "lr": 0.0004205594405594406, + "step": 4644, + "tokens_trained": 0.441423976 + }, + { + "epoch": 1.3180141843971631, + "grad_norm": 0.7580513954162598, + "loss": 4.9353, + "lr": 0.00042027972027972027, + "step": 4646, + "tokens_trained": 0.441612488 + }, + { + "epoch": 1.318581560283688, + "grad_norm": 0.7603718638420105, + "loss": 5.0189, + "lr": 0.00042, + "step": 4648, + "tokens_trained": 0.441800944 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 0.7828201055526733, + "loss": 4.941, + "lr": 0.0004197202797202797, + "step": 4650, + "tokens_trained": 0.441990072 + }, + { + "epoch": 1.3197163120567375, + "grad_norm": 0.7227108478546143, + "loss": 4.9707, + "lr": 0.0004194405594405594, + "step": 4652, + "tokens_trained": 0.44218048 + }, + { + "epoch": 1.3202836879432625, + "grad_norm": 0.8121836185455322, + "loss": 4.91, + "lr": 0.0004191608391608392, + "step": 4654, + "tokens_trained": 0.442370728 + }, + { + "epoch": 1.3208510638297872, + "grad_norm": 0.6706936955451965, + "loss": 4.907, + "lr": 0.0004188811188811189, + "step": 4656, + "tokens_trained": 0.442560352 + }, + { + "epoch": 1.321418439716312, + "grad_norm": 0.7793337106704712, + "loss": 5.0206, + "lr": 0.00041860139860139863, + "step": 4658, + "tokens_trained": 0.442750192 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 0.7981981039047241, + "loss": 5.0155, + "lr": 0.0004183216783216783, + "step": 4660, + "tokens_trained": 0.442940848 + }, + { + "epoch": 1.3225531914893618, + "grad_norm": 0.7972844243049622, + "loss": 4.9879, + "lr": 0.00041804195804195807, + "step": 4662, + "tokens_trained": 0.443128896 + }, + { + "epoch": 1.3231205673758866, + "grad_norm": 0.8017681241035461, + "loss": 4.9746, + "lr": 0.00041776223776223776, + "step": 4664, + "tokens_trained": 0.443320528 + }, + { + "epoch": 1.3236879432624113, + "grad_norm": 0.7505584955215454, + "loss": 4.9819, + "lr": 0.0004174825174825175, + "step": 4666, + "tokens_trained": 0.443510888 + }, + { + "epoch": 1.3242553191489361, + "grad_norm": 0.772155225276947, + "loss": 5.0783, + "lr": 0.0004172027972027972, + "step": 4668, + "tokens_trained": 0.443701856 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 0.7051090598106384, + "loss": 4.9403, + "lr": 0.0004169230769230769, + "step": 4670, + "tokens_trained": 0.44389428 + }, + { + "epoch": 1.3253900709219857, + "grad_norm": 0.7992343902587891, + "loss": 4.9498, + "lr": 0.0004166433566433567, + "step": 4672, + "tokens_trained": 0.444087272 + }, + { + "epoch": 1.3259574468085107, + "grad_norm": 0.7696804404258728, + "loss": 5.0109, + "lr": 0.00041636363636363637, + "step": 4674, + "tokens_trained": 0.444274648 + }, + { + "epoch": 1.3265248226950355, + "grad_norm": 0.7982995510101318, + "loss": 4.9506, + "lr": 0.0004160839160839161, + "step": 4676, + "tokens_trained": 0.444464632 + }, + { + "epoch": 1.3270921985815602, + "grad_norm": 0.8207205533981323, + "loss": 4.9527, + "lr": 0.0004158041958041958, + "step": 4678, + "tokens_trained": 0.444655184 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.7874724268913269, + "loss": 4.9924, + "lr": 0.00041552447552447555, + "step": 4680, + "tokens_trained": 0.444845096 + }, + { + "epoch": 1.32822695035461, + "grad_norm": 0.7951269149780273, + "loss": 5.0061, + "lr": 0.00041524475524475524, + "step": 4682, + "tokens_trained": 0.445034912 + }, + { + "epoch": 1.3287943262411348, + "grad_norm": 0.7952069640159607, + "loss": 5.029, + "lr": 0.000414965034965035, + "step": 4684, + "tokens_trained": 0.445224664 + }, + { + "epoch": 1.3293617021276596, + "grad_norm": 0.7753441333770752, + "loss": 5.0353, + "lr": 0.0004146853146853147, + "step": 4686, + "tokens_trained": 0.44541256 + }, + { + "epoch": 1.3299290780141844, + "grad_norm": 0.7112265229225159, + "loss": 4.9221, + "lr": 0.00041440559440559437, + "step": 4688, + "tokens_trained": 0.445604696 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 0.7774649262428284, + "loss": 5.0125, + "lr": 0.00041412587412587417, + "step": 4690, + "tokens_trained": 0.445794752 + }, + { + "epoch": 1.331063829787234, + "grad_norm": 0.8355589509010315, + "loss": 4.9665, + "lr": 0.00041384615384615386, + "step": 4692, + "tokens_trained": 0.44598544 + }, + { + "epoch": 1.331631205673759, + "grad_norm": 0.7191185355186462, + "loss": 4.9798, + "lr": 0.0004135664335664336, + "step": 4694, + "tokens_trained": 0.44617436 + }, + { + "epoch": 1.3321985815602837, + "grad_norm": 0.7386505007743835, + "loss": 4.9756, + "lr": 0.0004132867132867133, + "step": 4696, + "tokens_trained": 0.446363384 + }, + { + "epoch": 1.3327659574468085, + "grad_norm": 0.7661808133125305, + "loss": 4.9374, + "lr": 0.00041300699300699304, + "step": 4698, + "tokens_trained": 0.44655264 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.7530731558799744, + "loss": 4.9681, + "lr": 0.0004127272727272727, + "step": 4700, + "tokens_trained": 0.446743016 + }, + { + "epoch": 1.3339007092198583, + "grad_norm": 0.7512504458427429, + "loss": 4.9827, + "lr": 0.00041244755244755247, + "step": 4702, + "tokens_trained": 0.446932608 + }, + { + "epoch": 1.334468085106383, + "grad_norm": 0.7335140109062195, + "loss": 4.9586, + "lr": 0.00041216783216783216, + "step": 4704, + "tokens_trained": 0.447122208 + }, + { + "epoch": 1.3350354609929078, + "grad_norm": 0.7327559590339661, + "loss": 4.9666, + "lr": 0.00041188811188811185, + "step": 4706, + "tokens_trained": 0.447312824 + }, + { + "epoch": 1.3356028368794326, + "grad_norm": 0.7450160980224609, + "loss": 4.9197, + "lr": 0.00041160839160839165, + "step": 4708, + "tokens_trained": 0.447500672 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 0.6740980744361877, + "loss": 5.0133, + "lr": 0.00041132867132867134, + "step": 4710, + "tokens_trained": 0.447689552 + }, + { + "epoch": 1.3367375886524822, + "grad_norm": 0.7320116758346558, + "loss": 4.9751, + "lr": 0.0004110489510489511, + "step": 4712, + "tokens_trained": 0.447880128 + }, + { + "epoch": 1.3373049645390072, + "grad_norm": 0.7833261489868164, + "loss": 4.9285, + "lr": 0.0004107692307692308, + "step": 4714, + "tokens_trained": 0.448069496 + }, + { + "epoch": 1.337872340425532, + "grad_norm": 0.7570978999137878, + "loss": 5.0047, + "lr": 0.0004104895104895105, + "step": 4716, + "tokens_trained": 0.448258184 + }, + { + "epoch": 1.3384397163120567, + "grad_norm": 0.7320883274078369, + "loss": 4.9751, + "lr": 0.0004102097902097902, + "step": 4718, + "tokens_trained": 0.448449488 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 0.7385469675064087, + "loss": 4.9712, + "lr": 0.0004099300699300699, + "step": 4720, + "tokens_trained": 0.448638776 + }, + { + "epoch": 1.3395744680851065, + "grad_norm": 0.7620404958724976, + "loss": 4.8906, + "lr": 0.00040965034965034964, + "step": 4722, + "tokens_trained": 0.448830528 + }, + { + "epoch": 1.3401418439716313, + "grad_norm": 0.7389976382255554, + "loss": 4.9994, + "lr": 0.00040937062937062934, + "step": 4724, + "tokens_trained": 0.449018952 + }, + { + "epoch": 1.340709219858156, + "grad_norm": 0.7150964140892029, + "loss": 4.9244, + "lr": 0.00040909090909090913, + "step": 4726, + "tokens_trained": 0.44920784 + }, + { + "epoch": 1.3412765957446808, + "grad_norm": 0.7163580060005188, + "loss": 5.0069, + "lr": 0.0004088111888111888, + "step": 4728, + "tokens_trained": 0.449396696 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 0.7657668590545654, + "loss": 4.9322, + "lr": 0.00040853146853146857, + "step": 4730, + "tokens_trained": 0.449585568 + }, + { + "epoch": 1.3424113475177304, + "grad_norm": 0.7743586301803589, + "loss": 4.9691, + "lr": 0.00040825174825174826, + "step": 4732, + "tokens_trained": 0.44977396 + }, + { + "epoch": 1.3429787234042554, + "grad_norm": 0.8050113320350647, + "loss": 4.9514, + "lr": 0.000407972027972028, + "step": 4734, + "tokens_trained": 0.449964656 + }, + { + "epoch": 1.3435460992907802, + "grad_norm": 0.7641178965568542, + "loss": 4.8956, + "lr": 0.0004076923076923077, + "step": 4736, + "tokens_trained": 0.450154896 + }, + { + "epoch": 1.344113475177305, + "grad_norm": 0.8350791931152344, + "loss": 4.9625, + "lr": 0.0004074125874125874, + "step": 4738, + "tokens_trained": 0.450344528 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.7148427367210388, + "loss": 4.9657, + "lr": 0.00040713286713286713, + "step": 4740, + "tokens_trained": 0.450535416 + }, + { + "epoch": 1.3452482269503547, + "grad_norm": 0.7961207032203674, + "loss": 4.897, + "lr": 0.0004068531468531468, + "step": 4742, + "tokens_trained": 0.450724968 + }, + { + "epoch": 1.3458156028368795, + "grad_norm": 0.8115900754928589, + "loss": 4.9171, + "lr": 0.0004065734265734266, + "step": 4744, + "tokens_trained": 0.450916104 + }, + { + "epoch": 1.3463829787234043, + "grad_norm": 0.7608439326286316, + "loss": 4.9817, + "lr": 0.0004062937062937063, + "step": 4746, + "tokens_trained": 0.451107272 + }, + { + "epoch": 1.346950354609929, + "grad_norm": 0.7412408590316772, + "loss": 5.0053, + "lr": 0.00040601398601398605, + "step": 4748, + "tokens_trained": 0.451297408 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 0.7785027623176575, + "loss": 5.0091, + "lr": 0.00040573426573426574, + "step": 4750, + "tokens_trained": 0.451488272 + }, + { + "epoch": 1.3475177304964538, + "eval_loss": 4.9776997566223145, + "eval_runtime": 20.4142, + "step": 4750, + "tokens_trained": 0.451488272 + }, + { + "epoch": 1.3480851063829786, + "grad_norm": 0.7034481763839722, + "loss": 5.0312, + "lr": 0.0004054545454545455, + "step": 4752, + "tokens_trained": 0.451678048 + }, + { + "epoch": 1.3486524822695036, + "grad_norm": 0.8021607398986816, + "loss": 4.9923, + "lr": 0.0004051748251748252, + "step": 4754, + "tokens_trained": 0.451867816 + }, + { + "epoch": 1.3492198581560284, + "grad_norm": 0.7409330606460571, + "loss": 4.9429, + "lr": 0.00040489510489510487, + "step": 4756, + "tokens_trained": 0.45205644 + }, + { + "epoch": 1.3497872340425532, + "grad_norm": 0.6617271900177002, + "loss": 5.0044, + "lr": 0.0004046153846153846, + "step": 4758, + "tokens_trained": 0.452247464 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 0.7742848992347717, + "loss": 4.9794, + "lr": 0.0004043356643356643, + "step": 4760, + "tokens_trained": 0.452437608 + }, + { + "epoch": 1.350921985815603, + "grad_norm": 0.7627806663513184, + "loss": 4.9562, + "lr": 0.0004040559440559441, + "step": 4762, + "tokens_trained": 0.452627568 + }, + { + "epoch": 1.3514893617021277, + "grad_norm": 0.8105679750442505, + "loss": 5.0514, + "lr": 0.0004037762237762238, + "step": 4764, + "tokens_trained": 0.452817176 + }, + { + "epoch": 1.3520567375886525, + "grad_norm": 0.7783811688423157, + "loss": 4.9414, + "lr": 0.00040349650349650354, + "step": 4766, + "tokens_trained": 0.4530078 + }, + { + "epoch": 1.3526241134751773, + "grad_norm": 0.7357584238052368, + "loss": 4.9184, + "lr": 0.00040321678321678323, + "step": 4768, + "tokens_trained": 0.453196856 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 0.79344242811203, + "loss": 4.8904, + "lr": 0.00040293706293706297, + "step": 4770, + "tokens_trained": 0.4533878 + }, + { + "epoch": 1.3537588652482269, + "grad_norm": 0.7372890710830688, + "loss": 4.9378, + "lr": 0.00040265734265734266, + "step": 4772, + "tokens_trained": 0.4535766 + }, + { + "epoch": 1.3543262411347516, + "grad_norm": 0.7920981049537659, + "loss": 4.9701, + "lr": 0.00040237762237762235, + "step": 4774, + "tokens_trained": 0.45376792 + }, + { + "epoch": 1.3548936170212766, + "grad_norm": 0.7568764686584473, + "loss": 5.0008, + "lr": 0.0004020979020979021, + "step": 4776, + "tokens_trained": 0.453958072 + }, + { + "epoch": 1.3554609929078014, + "grad_norm": 0.7389140129089355, + "loss": 4.9886, + "lr": 0.0004018181818181818, + "step": 4778, + "tokens_trained": 0.454147016 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 0.7528326511383057, + "loss": 4.9669, + "lr": 0.00040153846153846153, + "step": 4780, + "tokens_trained": 0.454338392 + }, + { + "epoch": 1.3565957446808512, + "grad_norm": 0.7838888764381409, + "loss": 5.0034, + "lr": 0.0004012587412587413, + "step": 4782, + "tokens_trained": 0.45452652 + }, + { + "epoch": 1.357163120567376, + "grad_norm": 0.8001760244369507, + "loss": 4.969, + "lr": 0.000400979020979021, + "step": 4784, + "tokens_trained": 0.454714896 + }, + { + "epoch": 1.3577304964539008, + "grad_norm": 0.7670722007751465, + "loss": 5.0728, + "lr": 0.0004006993006993007, + "step": 4786, + "tokens_trained": 0.45490428 + }, + { + "epoch": 1.3582978723404255, + "grad_norm": 0.7396910786628723, + "loss": 4.9123, + "lr": 0.00040041958041958046, + "step": 4788, + "tokens_trained": 0.45509412 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 0.8072660565376282, + "loss": 4.9988, + "lr": 0.00040013986013986015, + "step": 4790, + "tokens_trained": 0.455283592 + }, + { + "epoch": 1.359432624113475, + "grad_norm": 0.7714769840240479, + "loss": 4.9984, + "lr": 0.00039986013986013984, + "step": 4792, + "tokens_trained": 0.455476456 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.749272882938385, + "loss": 5.02, + "lr": 0.0003995804195804196, + "step": 4794, + "tokens_trained": 0.455666568 + }, + { + "epoch": 1.3605673758865249, + "grad_norm": 0.9460277557373047, + "loss": 5.0115, + "lr": 0.00039930069930069927, + "step": 4796, + "tokens_trained": 0.455855472 + }, + { + "epoch": 1.3611347517730497, + "grad_norm": 0.8013962507247925, + "loss": 4.8935, + "lr": 0.000399020979020979, + "step": 4798, + "tokens_trained": 0.456043256 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.8356024026870728, + "loss": 5.004, + "lr": 0.00039874125874125876, + "step": 4800, + "tokens_trained": 0.456232728 + }, + { + "epoch": 1.3622695035460992, + "grad_norm": 0.7791249752044678, + "loss": 4.9025, + "lr": 0.0003984615384615385, + "step": 4802, + "tokens_trained": 0.456422672 + }, + { + "epoch": 1.3628368794326242, + "grad_norm": 0.7426172494888306, + "loss": 4.9706, + "lr": 0.0003981818181818182, + "step": 4804, + "tokens_trained": 0.456612888 + }, + { + "epoch": 1.363404255319149, + "grad_norm": 0.8252729773521423, + "loss": 4.9679, + "lr": 0.00039790209790209794, + "step": 4806, + "tokens_trained": 0.456802432 + }, + { + "epoch": 1.3639716312056738, + "grad_norm": 0.7870017290115356, + "loss": 4.9609, + "lr": 0.00039762237762237763, + "step": 4808, + "tokens_trained": 0.456990752 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 0.815733790397644, + "loss": 4.9634, + "lr": 0.0003973426573426573, + "step": 4810, + "tokens_trained": 0.457181528 + }, + { + "epoch": 1.3651063829787233, + "grad_norm": 0.6886212825775146, + "loss": 4.954, + "lr": 0.00039706293706293707, + "step": 4812, + "tokens_trained": 0.457370064 + }, + { + "epoch": 1.365673758865248, + "grad_norm": 0.7102149724960327, + "loss": 4.8986, + "lr": 0.00039678321678321676, + "step": 4814, + "tokens_trained": 0.457559112 + }, + { + "epoch": 1.3662411347517731, + "grad_norm": 0.7671045064926147, + "loss": 4.9504, + "lr": 0.0003965034965034965, + "step": 4816, + "tokens_trained": 0.457749888 + }, + { + "epoch": 1.366808510638298, + "grad_norm": 0.7828851938247681, + "loss": 4.9522, + "lr": 0.00039622377622377625, + "step": 4818, + "tokens_trained": 0.457939616 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 0.7570793628692627, + "loss": 4.9273, + "lr": 0.000395944055944056, + "step": 4820, + "tokens_trained": 0.458131776 + }, + { + "epoch": 1.3679432624113474, + "grad_norm": 0.7246227860450745, + "loss": 5.0342, + "lr": 0.0003956643356643357, + "step": 4822, + "tokens_trained": 0.458323576 + }, + { + "epoch": 1.3685106382978725, + "grad_norm": 0.7387742400169373, + "loss": 4.976, + "lr": 0.0003953846153846154, + "step": 4824, + "tokens_trained": 0.4585148 + }, + { + "epoch": 1.3690780141843972, + "grad_norm": 0.7457069158554077, + "loss": 5.0033, + "lr": 0.0003951048951048951, + "step": 4826, + "tokens_trained": 0.458706352 + }, + { + "epoch": 1.369645390070922, + "grad_norm": 0.721156895160675, + "loss": 4.9869, + "lr": 0.0003948251748251748, + "step": 4828, + "tokens_trained": 0.45889584 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 0.7440138459205627, + "loss": 4.9304, + "lr": 0.00039454545454545455, + "step": 4830, + "tokens_trained": 0.459084496 + }, + { + "epoch": 1.3707801418439716, + "grad_norm": 0.7051060199737549, + "loss": 4.9388, + "lr": 0.00039426573426573424, + "step": 4832, + "tokens_trained": 0.459273328 + }, + { + "epoch": 1.3713475177304963, + "grad_norm": 0.7923696637153625, + "loss": 4.9632, + "lr": 0.000393986013986014, + "step": 4834, + "tokens_trained": 0.459461936 + }, + { + "epoch": 1.3719148936170213, + "grad_norm": 0.7542476654052734, + "loss": 4.9849, + "lr": 0.00039370629370629373, + "step": 4836, + "tokens_trained": 0.459654296 + }, + { + "epoch": 1.3724822695035461, + "grad_norm": 0.6460102200508118, + "loss": 4.9345, + "lr": 0.0003934265734265735, + "step": 4838, + "tokens_trained": 0.459840832 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 0.6898486614227295, + "loss": 4.9322, + "lr": 0.00039314685314685316, + "step": 4840, + "tokens_trained": 0.46003016 + }, + { + "epoch": 1.3736170212765957, + "grad_norm": 0.7820252776145935, + "loss": 4.9832, + "lr": 0.00039286713286713286, + "step": 4842, + "tokens_trained": 0.460220928 + }, + { + "epoch": 1.3741843971631207, + "grad_norm": 0.681734561920166, + "loss": 4.8975, + "lr": 0.0003925874125874126, + "step": 4844, + "tokens_trained": 0.460410064 + }, + { + "epoch": 1.3747517730496455, + "grad_norm": 0.7517859935760498, + "loss": 4.941, + "lr": 0.0003923076923076923, + "step": 4846, + "tokens_trained": 0.46059848 + }, + { + "epoch": 1.3753191489361702, + "grad_norm": 0.7375074625015259, + "loss": 4.9473, + "lr": 0.00039202797202797203, + "step": 4848, + "tokens_trained": 0.46078916 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 0.728672444820404, + "loss": 4.962, + "lr": 0.0003917482517482517, + "step": 4850, + "tokens_trained": 0.460977672 + }, + { + "epoch": 1.3764539007092198, + "grad_norm": 0.7166595458984375, + "loss": 4.9366, + "lr": 0.00039146853146853147, + "step": 4852, + "tokens_trained": 0.461166912 + }, + { + "epoch": 1.3770212765957446, + "grad_norm": 0.7807113528251648, + "loss": 4.9279, + "lr": 0.0003911888111888112, + "step": 4854, + "tokens_trained": 0.46135684 + }, + { + "epoch": 1.3775886524822696, + "grad_norm": 0.7296082973480225, + "loss": 4.9246, + "lr": 0.00039090909090909096, + "step": 4856, + "tokens_trained": 0.461546944 + }, + { + "epoch": 1.3781560283687944, + "grad_norm": 0.7450242638587952, + "loss": 4.9474, + "lr": 0.00039062937062937065, + "step": 4858, + "tokens_trained": 0.461736576 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.6994244456291199, + "loss": 4.9334, + "lr": 0.00039034965034965034, + "step": 4860, + "tokens_trained": 0.461925424 + }, + { + "epoch": 1.379290780141844, + "grad_norm": 0.7981341481208801, + "loss": 4.9785, + "lr": 0.0003900699300699301, + "step": 4862, + "tokens_trained": 0.462115912 + }, + { + "epoch": 1.379858156028369, + "grad_norm": 0.6945004463195801, + "loss": 4.9581, + "lr": 0.0003897902097902098, + "step": 4864, + "tokens_trained": 0.462306424 + }, + { + "epoch": 1.3804255319148937, + "grad_norm": 0.7116626501083374, + "loss": 4.947, + "lr": 0.0003895104895104895, + "step": 4866, + "tokens_trained": 0.462497352 + }, + { + "epoch": 1.3809929078014185, + "grad_norm": 0.7096779346466064, + "loss": 4.956, + "lr": 0.0003892307692307692, + "step": 4868, + "tokens_trained": 0.462686872 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 0.6993130445480347, + "loss": 4.9038, + "lr": 0.00038895104895104895, + "step": 4870, + "tokens_trained": 0.462877712 + }, + { + "epoch": 1.382127659574468, + "grad_norm": 0.7118195295333862, + "loss": 4.9709, + "lr": 0.0003886713286713287, + "step": 4872, + "tokens_trained": 0.463069304 + }, + { + "epoch": 1.3826950354609928, + "grad_norm": 0.760608971118927, + "loss": 4.9574, + "lr": 0.00038839160839160844, + "step": 4874, + "tokens_trained": 0.463260616 + }, + { + "epoch": 1.3829787234042552, + "eval_loss": 4.976211071014404, + "eval_runtime": 20.5866, + "step": 4875, + "tokens_trained": 0.463356504 + }, + { + "epoch": 1.3832624113475176, + "grad_norm": 0.7358114123344421, + "loss": 5.0454, + "lr": 0.00038811188811188813, + "step": 4876, + "tokens_trained": 0.4634498 + }, + { + "epoch": 1.3838297872340426, + "grad_norm": 0.7012422680854797, + "loss": 4.9396, + "lr": 0.0003878321678321678, + "step": 4878, + "tokens_trained": 0.463639736 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 0.7740567922592163, + "loss": 4.9148, + "lr": 0.00038755244755244757, + "step": 4880, + "tokens_trained": 0.463831816 + }, + { + "epoch": 1.3849645390070922, + "grad_norm": 0.7246590852737427, + "loss": 4.9392, + "lr": 0.00038727272727272726, + "step": 4882, + "tokens_trained": 0.464022656 + }, + { + "epoch": 1.3855319148936172, + "grad_norm": 0.7365467548370361, + "loss": 4.9912, + "lr": 0.000386993006993007, + "step": 4884, + "tokens_trained": 0.464212584 + }, + { + "epoch": 1.386099290780142, + "grad_norm": 0.7027139067649841, + "loss": 4.9846, + "lr": 0.0003867132867132867, + "step": 4886, + "tokens_trained": 0.464404256 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.7768815755844116, + "loss": 4.9042, + "lr": 0.00038643356643356644, + "step": 4888, + "tokens_trained": 0.46459544 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 0.7037492990493774, + "loss": 4.8755, + "lr": 0.0003861538461538462, + "step": 4890, + "tokens_trained": 0.464784224 + }, + { + "epoch": 1.3878014184397163, + "grad_norm": 0.8143949508666992, + "loss": 4.9742, + "lr": 0.00038587412587412593, + "step": 4892, + "tokens_trained": 0.464975456 + }, + { + "epoch": 1.388368794326241, + "grad_norm": 0.7223230600357056, + "loss": 4.9473, + "lr": 0.0003855944055944056, + "step": 4894, + "tokens_trained": 0.465164944 + }, + { + "epoch": 1.3889361702127658, + "grad_norm": 0.7167389988899231, + "loss": 4.9495, + "lr": 0.0003853146853146853, + "step": 4896, + "tokens_trained": 0.46535596 + }, + { + "epoch": 1.3895035460992908, + "grad_norm": 0.764140248298645, + "loss": 4.9759, + "lr": 0.00038503496503496505, + "step": 4898, + "tokens_trained": 0.465545192 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 0.7822412252426147, + "loss": 5.0296, + "lr": 0.00038475524475524474, + "step": 4900, + "tokens_trained": 0.465734568 + }, + { + "epoch": 1.3906382978723404, + "grad_norm": 0.7479943633079529, + "loss": 4.9059, + "lr": 0.0003844755244755245, + "step": 4902, + "tokens_trained": 0.465923464 + }, + { + "epoch": 1.3912056737588652, + "grad_norm": 0.7703482508659363, + "loss": 4.927, + "lr": 0.0003841958041958042, + "step": 4904, + "tokens_trained": 0.466114272 + }, + { + "epoch": 1.3917730496453902, + "grad_norm": 0.7773356437683105, + "loss": 4.9733, + "lr": 0.0003839160839160839, + "step": 4906, + "tokens_trained": 0.466305032 + }, + { + "epoch": 1.392340425531915, + "grad_norm": 0.7287682294845581, + "loss": 4.9497, + "lr": 0.0003836363636363636, + "step": 4908, + "tokens_trained": 0.466495336 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 0.7540012001991272, + "loss": 4.948, + "lr": 0.0003833566433566434, + "step": 4910, + "tokens_trained": 0.466686312 + }, + { + "epoch": 1.3934751773049645, + "grad_norm": 0.6999531388282776, + "loss": 5.0049, + "lr": 0.0003830769230769231, + "step": 4912, + "tokens_trained": 0.466878424 + }, + { + "epoch": 1.3940425531914893, + "grad_norm": 0.7895733714103699, + "loss": 4.9807, + "lr": 0.0003827972027972028, + "step": 4914, + "tokens_trained": 0.467068576 + }, + { + "epoch": 1.394609929078014, + "grad_norm": 0.8046857118606567, + "loss": 4.9518, + "lr": 0.00038251748251748254, + "step": 4916, + "tokens_trained": 0.467259704 + }, + { + "epoch": 1.395177304964539, + "grad_norm": 0.6962889432907104, + "loss": 4.993, + "lr": 0.0003822377622377622, + "step": 4918, + "tokens_trained": 0.46744908 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.7171238660812378, + "loss": 4.9422, + "lr": 0.00038195804195804197, + "step": 4920, + "tokens_trained": 0.467637408 + }, + { + "epoch": 1.3963120567375886, + "grad_norm": 0.7095980644226074, + "loss": 4.9518, + "lr": 0.00038167832167832166, + "step": 4922, + "tokens_trained": 0.467828128 + }, + { + "epoch": 1.3968794326241134, + "grad_norm": 0.798319399356842, + "loss": 4.9267, + "lr": 0.0003813986013986014, + "step": 4924, + "tokens_trained": 0.4680194 + }, + { + "epoch": 1.3974468085106384, + "grad_norm": 0.6752556562423706, + "loss": 4.9905, + "lr": 0.0003811188811188811, + "step": 4926, + "tokens_trained": 0.468209408 + }, + { + "epoch": 1.3980141843971632, + "grad_norm": 0.7536012530326843, + "loss": 4.938, + "lr": 0.0003808391608391609, + "step": 4928, + "tokens_trained": 0.468399016 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 0.7366868257522583, + "loss": 4.946, + "lr": 0.0003805594405594406, + "step": 4930, + "tokens_trained": 0.468591152 + }, + { + "epoch": 1.3991489361702127, + "grad_norm": 0.765252411365509, + "loss": 4.9843, + "lr": 0.0003802797202797203, + "step": 4932, + "tokens_trained": 0.468781688 + }, + { + "epoch": 1.3997163120567375, + "grad_norm": 0.6715340614318848, + "loss": 4.955, + "lr": 0.00038, + "step": 4934, + "tokens_trained": 0.468972624 + }, + { + "epoch": 1.4002836879432623, + "grad_norm": 0.7280968427658081, + "loss": 4.9459, + "lr": 0.0003797202797202797, + "step": 4936, + "tokens_trained": 0.46916264 + }, + { + "epoch": 1.4008510638297873, + "grad_norm": 0.7301554679870605, + "loss": 4.9083, + "lr": 0.00037944055944055946, + "step": 4938, + "tokens_trained": 0.469352208 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 0.7966684103012085, + "loss": 4.9554, + "lr": 0.00037916083916083915, + "step": 4940, + "tokens_trained": 0.469542584 + }, + { + "epoch": 1.4019858156028369, + "grad_norm": 0.7339959144592285, + "loss": 4.9614, + "lr": 0.0003788811188811189, + "step": 4942, + "tokens_trained": 0.4697328 + }, + { + "epoch": 1.4025531914893616, + "grad_norm": 0.7321662902832031, + "loss": 4.9159, + "lr": 0.0003786013986013986, + "step": 4944, + "tokens_trained": 0.469922768 + }, + { + "epoch": 1.4031205673758866, + "grad_norm": 0.7663842439651489, + "loss": 5.0158, + "lr": 0.0003783216783216784, + "step": 4946, + "tokens_trained": 0.470111912 + }, + { + "epoch": 1.4036879432624114, + "grad_norm": 0.6754962801933289, + "loss": 4.973, + "lr": 0.00037804195804195807, + "step": 4948, + "tokens_trained": 0.470303544 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 0.6885703802108765, + "loss": 4.9308, + "lr": 0.00037776223776223776, + "step": 4950, + "tokens_trained": 0.470493872 + }, + { + "epoch": 1.404822695035461, + "grad_norm": 0.7635725140571594, + "loss": 4.9281, + "lr": 0.0003774825174825175, + "step": 4952, + "tokens_trained": 0.47068516 + }, + { + "epoch": 1.4053900709219858, + "grad_norm": 0.6963970065116882, + "loss": 4.9579, + "lr": 0.0003772027972027972, + "step": 4954, + "tokens_trained": 0.470875848 + }, + { + "epoch": 1.4059574468085105, + "grad_norm": 0.7530264854431152, + "loss": 4.9418, + "lr": 0.00037692307692307694, + "step": 4956, + "tokens_trained": 0.47106516 + }, + { + "epoch": 1.4065248226950355, + "grad_norm": 0.707700788974762, + "loss": 5.0045, + "lr": 0.00037664335664335663, + "step": 4958, + "tokens_trained": 0.471252432 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 0.7403944134712219, + "loss": 4.9305, + "lr": 0.0003763636363636364, + "step": 4960, + "tokens_trained": 0.471442672 + }, + { + "epoch": 1.407659574468085, + "grad_norm": 0.753716230392456, + "loss": 4.9812, + "lr": 0.00037608391608391607, + "step": 4962, + "tokens_trained": 0.471631888 + }, + { + "epoch": 1.4082269503546099, + "grad_norm": 0.8004569411277771, + "loss": 4.9217, + "lr": 0.0003758041958041958, + "step": 4964, + "tokens_trained": 0.47182072 + }, + { + "epoch": 1.4087943262411349, + "grad_norm": 0.7715573906898499, + "loss": 4.9233, + "lr": 0.00037552447552447555, + "step": 4966, + "tokens_trained": 0.472011104 + }, + { + "epoch": 1.4093617021276597, + "grad_norm": 0.6821765303611755, + "loss": 4.9976, + "lr": 0.00037524475524475524, + "step": 4968, + "tokens_trained": 0.472201568 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 0.7360137701034546, + "loss": 4.9414, + "lr": 0.000374965034965035, + "step": 4970, + "tokens_trained": 0.472390656 + }, + { + "epoch": 1.4104964539007092, + "grad_norm": 0.6912544369697571, + "loss": 4.9692, + "lr": 0.0003746853146853147, + "step": 4972, + "tokens_trained": 0.4725808 + }, + { + "epoch": 1.411063829787234, + "grad_norm": 0.7245798110961914, + "loss": 4.9708, + "lr": 0.0003744055944055944, + "step": 4974, + "tokens_trained": 0.472768096 + }, + { + "epoch": 1.4116312056737588, + "grad_norm": 0.8210451602935791, + "loss": 4.9523, + "lr": 0.0003741258741258741, + "step": 4976, + "tokens_trained": 0.472957224 + }, + { + "epoch": 1.4121985815602836, + "grad_norm": 0.7312847971916199, + "loss": 4.948, + "lr": 0.00037384615384615386, + "step": 4978, + "tokens_trained": 0.47314768 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.7223467826843262, + "loss": 4.9328, + "lr": 0.00037356643356643355, + "step": 4980, + "tokens_trained": 0.47333896 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 0.7106639742851257, + "loss": 4.9244, + "lr": 0.0003732867132867133, + "step": 4982, + "tokens_trained": 0.473530608 + }, + { + "epoch": 1.413900709219858, + "grad_norm": 0.717099130153656, + "loss": 4.9213, + "lr": 0.00037300699300699304, + "step": 4984, + "tokens_trained": 0.473720968 + }, + { + "epoch": 1.414468085106383, + "grad_norm": 0.7603443264961243, + "loss": 4.9815, + "lr": 0.00037272727272727273, + "step": 4986, + "tokens_trained": 0.473911448 + }, + { + "epoch": 1.415035460992908, + "grad_norm": 0.7069094181060791, + "loss": 4.9781, + "lr": 0.0003724475524475525, + "step": 4988, + "tokens_trained": 0.474101576 + }, + { + "epoch": 1.4156028368794327, + "grad_norm": 0.6874499917030334, + "loss": 4.9678, + "lr": 0.00037216783216783216, + "step": 4990, + "tokens_trained": 0.474292264 + }, + { + "epoch": 1.4161702127659574, + "grad_norm": 0.7207010984420776, + "loss": 4.9998, + "lr": 0.0003718881118881119, + "step": 4992, + "tokens_trained": 0.474482208 + }, + { + "epoch": 1.4167375886524822, + "grad_norm": 0.7269707322120667, + "loss": 4.9148, + "lr": 0.0003716083916083916, + "step": 4994, + "tokens_trained": 0.474671136 + }, + { + "epoch": 1.417304964539007, + "grad_norm": 0.6694115400314331, + "loss": 4.9269, + "lr": 0.00037132867132867134, + "step": 4996, + "tokens_trained": 0.474862056 + }, + { + "epoch": 1.4178723404255318, + "grad_norm": 0.6479254364967346, + "loss": 4.9747, + "lr": 0.00037104895104895103, + "step": 4998, + "tokens_trained": 0.475053784 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 0.660739004611969, + "loss": 4.9034, + "lr": 0.0003707692307692308, + "step": 5000, + "tokens_trained": 0.47524568 + }, + { + "epoch": 1.4184397163120568, + "eval_loss": 4.966795444488525, + "eval_runtime": 21.018, + "step": 5000, + "tokens_trained": 0.47524568 + }, + { + "epoch": 1.4190070921985816, + "grad_norm": 0.7606148719787598, + "loss": 4.991, + "lr": 0.0003704895104895105, + "step": 5002, + "tokens_trained": 0.475437208 + }, + { + "epoch": 1.4195744680851063, + "grad_norm": 0.6917815208435059, + "loss": 4.9582, + "lr": 0.0003702097902097902, + "step": 5004, + "tokens_trained": 0.475625952 + }, + { + "epoch": 1.4201418439716311, + "grad_norm": 0.731756865978241, + "loss": 4.9908, + "lr": 0.00036993006993006996, + "step": 5006, + "tokens_trained": 0.475815792 + }, + { + "epoch": 1.4207092198581561, + "grad_norm": 0.7233264446258545, + "loss": 4.936, + "lr": 0.00036965034965034965, + "step": 5008, + "tokens_trained": 0.476005808 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 0.6983084082603455, + "loss": 4.9926, + "lr": 0.0003693706293706294, + "step": 5010, + "tokens_trained": 0.476195072 + }, + { + "epoch": 1.4218439716312057, + "grad_norm": 0.752465009689331, + "loss": 4.9064, + "lr": 0.0003690909090909091, + "step": 5012, + "tokens_trained": 0.47638492 + }, + { + "epoch": 1.4224113475177305, + "grad_norm": 0.7406246662139893, + "loss": 4.9646, + "lr": 0.00036881118881118883, + "step": 5014, + "tokens_trained": 0.476575672 + }, + { + "epoch": 1.4229787234042552, + "grad_norm": 0.7230610251426697, + "loss": 5.0192, + "lr": 0.0003685314685314685, + "step": 5016, + "tokens_trained": 0.476764648 + }, + { + "epoch": 1.42354609929078, + "grad_norm": 0.7906433343887329, + "loss": 4.9454, + "lr": 0.00036825174825174826, + "step": 5018, + "tokens_trained": 0.476955176 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 0.713800847530365, + "loss": 4.9439, + "lr": 0.000367972027972028, + "step": 5020, + "tokens_trained": 0.477145336 + }, + { + "epoch": 1.4246808510638298, + "grad_norm": 0.80546635389328, + "loss": 4.9492, + "lr": 0.0003676923076923077, + "step": 5022, + "tokens_trained": 0.477338024 + }, + { + "epoch": 1.4252482269503546, + "grad_norm": 0.831771969795227, + "loss": 4.9498, + "lr": 0.00036741258741258744, + "step": 5024, + "tokens_trained": 0.47752756 + }, + { + "epoch": 1.4258156028368794, + "grad_norm": 0.7554155588150024, + "loss": 4.9411, + "lr": 0.00036713286713286713, + "step": 5026, + "tokens_trained": 0.477717624 + }, + { + "epoch": 1.4263829787234044, + "grad_norm": 0.7594896554946899, + "loss": 4.9751, + "lr": 0.0003668531468531469, + "step": 5028, + "tokens_trained": 0.477908536 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 0.6471177339553833, + "loss": 4.8401, + "lr": 0.00036657342657342657, + "step": 5030, + "tokens_trained": 0.478097528 + }, + { + "epoch": 1.427517730496454, + "grad_norm": 0.7507487535476685, + "loss": 4.9596, + "lr": 0.0003662937062937063, + "step": 5032, + "tokens_trained": 0.47828772 + }, + { + "epoch": 1.4280851063829787, + "grad_norm": 0.7026324272155762, + "loss": 4.9324, + "lr": 0.000366013986013986, + "step": 5034, + "tokens_trained": 0.478478448 + }, + { + "epoch": 1.4286524822695035, + "grad_norm": 0.7535367012023926, + "loss": 4.8952, + "lr": 0.0003657342657342657, + "step": 5036, + "tokens_trained": 0.47866956 + }, + { + "epoch": 1.4292198581560283, + "grad_norm": 0.7286129593849182, + "loss": 4.9075, + "lr": 0.0003654545454545455, + "step": 5038, + "tokens_trained": 0.47885872 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.6919812560081482, + "loss": 4.9271, + "lr": 0.0003651748251748252, + "step": 5040, + "tokens_trained": 0.479048376 + }, + { + "epoch": 1.430354609929078, + "grad_norm": 0.7181224822998047, + "loss": 4.9257, + "lr": 0.0003648951048951049, + "step": 5042, + "tokens_trained": 0.479237688 + }, + { + "epoch": 1.4309219858156028, + "grad_norm": 0.7457099556922913, + "loss": 4.8927, + "lr": 0.0003646153846153846, + "step": 5044, + "tokens_trained": 0.479426584 + }, + { + "epoch": 1.4314893617021276, + "grad_norm": 0.7675755023956299, + "loss": 4.9287, + "lr": 0.00036433566433566436, + "step": 5046, + "tokens_trained": 0.479617216 + }, + { + "epoch": 1.4320567375886526, + "grad_norm": 0.7041569352149963, + "loss": 4.9276, + "lr": 0.00036405594405594405, + "step": 5048, + "tokens_trained": 0.479807304 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 0.7185978293418884, + "loss": 4.9652, + "lr": 0.00036377622377622374, + "step": 5050, + "tokens_trained": 0.479997504 + }, + { + "epoch": 1.4331914893617022, + "grad_norm": 0.7958099246025085, + "loss": 4.94, + "lr": 0.0003634965034965035, + "step": 5052, + "tokens_trained": 0.480189784 + }, + { + "epoch": 1.433758865248227, + "grad_norm": 0.6902858018875122, + "loss": 4.9827, + "lr": 0.0003632167832167832, + "step": 5054, + "tokens_trained": 0.480383144 + }, + { + "epoch": 1.4343262411347517, + "grad_norm": 0.6887302398681641, + "loss": 5.0039, + "lr": 0.000362937062937063, + "step": 5056, + "tokens_trained": 0.480570904 + }, + { + "epoch": 1.4348936170212765, + "grad_norm": 0.7241384983062744, + "loss": 4.9248, + "lr": 0.00036265734265734267, + "step": 5058, + "tokens_trained": 0.480759136 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 0.7790824770927429, + "loss": 4.9626, + "lr": 0.0003623776223776224, + "step": 5060, + "tokens_trained": 0.480949056 + }, + { + "epoch": 1.4360283687943263, + "grad_norm": 0.8010179400444031, + "loss": 4.9652, + "lr": 0.0003620979020979021, + "step": 5062, + "tokens_trained": 0.481139536 + }, + { + "epoch": 1.436595744680851, + "grad_norm": 0.7285072803497314, + "loss": 4.9414, + "lr": 0.00036181818181818185, + "step": 5064, + "tokens_trained": 0.481328504 + }, + { + "epoch": 1.4371631205673758, + "grad_norm": 0.7610006332397461, + "loss": 4.9742, + "lr": 0.00036153846153846154, + "step": 5066, + "tokens_trained": 0.481519352 + }, + { + "epoch": 1.4377304964539008, + "grad_norm": 0.6971138715744019, + "loss": 4.998, + "lr": 0.0003612587412587412, + "step": 5068, + "tokens_trained": 0.4817108 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 0.7477296590805054, + "loss": 4.9079, + "lr": 0.00036097902097902097, + "step": 5070, + "tokens_trained": 0.481902 + }, + { + "epoch": 1.4388652482269504, + "grad_norm": 0.7010331153869629, + "loss": 5.0261, + "lr": 0.00036069930069930066, + "step": 5072, + "tokens_trained": 0.48209196 + }, + { + "epoch": 1.4394326241134752, + "grad_norm": 0.7054550647735596, + "loss": 4.9881, + "lr": 0.00036041958041958046, + "step": 5074, + "tokens_trained": 0.48228332 + }, + { + "epoch": 1.44, + "grad_norm": 0.7022992968559265, + "loss": 4.9847, + "lr": 0.00036013986013986015, + "step": 5076, + "tokens_trained": 0.482474376 + }, + { + "epoch": 1.4405673758865247, + "grad_norm": 0.716465175151825, + "loss": 4.8994, + "lr": 0.0003598601398601399, + "step": 5078, + "tokens_trained": 0.48266244 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 0.6937554478645325, + "loss": 4.9677, + "lr": 0.0003595804195804196, + "step": 5080, + "tokens_trained": 0.482851648 + }, + { + "epoch": 1.4417021276595745, + "grad_norm": 0.7124615907669067, + "loss": 4.9537, + "lr": 0.00035930069930069933, + "step": 5082, + "tokens_trained": 0.483039216 + }, + { + "epoch": 1.4422695035460993, + "grad_norm": 0.6647019386291504, + "loss": 4.9518, + "lr": 0.000359020979020979, + "step": 5084, + "tokens_trained": 0.483229912 + }, + { + "epoch": 1.442836879432624, + "grad_norm": 0.7044801712036133, + "loss": 4.9696, + "lr": 0.0003587412587412587, + "step": 5086, + "tokens_trained": 0.483419848 + }, + { + "epoch": 1.443404255319149, + "grad_norm": 0.7027294039726257, + "loss": 4.9886, + "lr": 0.00035846153846153846, + "step": 5088, + "tokens_trained": 0.483608528 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 0.7377288341522217, + "loss": 4.8964, + "lr": 0.00035818181818181815, + "step": 5090, + "tokens_trained": 0.483799232 + }, + { + "epoch": 1.4445390070921986, + "grad_norm": 0.7174035310745239, + "loss": 4.9365, + "lr": 0.00035790209790209794, + "step": 5092, + "tokens_trained": 0.48398796 + }, + { + "epoch": 1.4451063829787234, + "grad_norm": 0.7121730446815491, + "loss": 4.936, + "lr": 0.00035762237762237763, + "step": 5094, + "tokens_trained": 0.484177608 + }, + { + "epoch": 1.4456737588652482, + "grad_norm": 0.7427595853805542, + "loss": 4.968, + "lr": 0.0003573426573426574, + "step": 5096, + "tokens_trained": 0.484368912 + }, + { + "epoch": 1.446241134751773, + "grad_norm": 0.7151100635528564, + "loss": 4.866, + "lr": 0.00035706293706293707, + "step": 5098, + "tokens_trained": 0.484558872 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.7151250243186951, + "loss": 4.9275, + "lr": 0.0003567832167832168, + "step": 5100, + "tokens_trained": 0.484746272 + }, + { + "epoch": 1.4473758865248227, + "grad_norm": 0.7005182504653931, + "loss": 4.956, + "lr": 0.0003565034965034965, + "step": 5102, + "tokens_trained": 0.484937928 + }, + { + "epoch": 1.4479432624113475, + "grad_norm": 0.7152060270309448, + "loss": 4.9544, + "lr": 0.0003562237762237762, + "step": 5104, + "tokens_trained": 0.485127264 + }, + { + "epoch": 1.4485106382978723, + "grad_norm": 0.7763362526893616, + "loss": 4.8919, + "lr": 0.00035594405594405594, + "step": 5106, + "tokens_trained": 0.485317256 + }, + { + "epoch": 1.4490780141843973, + "grad_norm": 0.7702814936637878, + "loss": 4.9243, + "lr": 0.00035566433566433563, + "step": 5108, + "tokens_trained": 0.485507224 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 0.7871324419975281, + "loss": 5.0174, + "lr": 0.00035538461538461543, + "step": 5110, + "tokens_trained": 0.485695736 + }, + { + "epoch": 1.4502127659574469, + "grad_norm": 0.7191143035888672, + "loss": 4.8973, + "lr": 0.0003551048951048951, + "step": 5112, + "tokens_trained": 0.485886528 + }, + { + "epoch": 1.4507801418439716, + "grad_norm": 0.6869152188301086, + "loss": 4.894, + "lr": 0.00035482517482517486, + "step": 5114, + "tokens_trained": 0.486074896 + }, + { + "epoch": 1.4513475177304964, + "grad_norm": 0.7272975444793701, + "loss": 4.9682, + "lr": 0.00035454545454545455, + "step": 5116, + "tokens_trained": 0.486265864 + }, + { + "epoch": 1.4519148936170212, + "grad_norm": 0.6644308567047119, + "loss": 4.9707, + "lr": 0.0003542657342657343, + "step": 5118, + "tokens_trained": 0.4864564 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 0.7381615042686462, + "loss": 4.9609, + "lr": 0.000353986013986014, + "step": 5120, + "tokens_trained": 0.486645736 + }, + { + "epoch": 1.453049645390071, + "grad_norm": 0.7426425814628601, + "loss": 4.9685, + "lr": 0.0003537062937062937, + "step": 5122, + "tokens_trained": 0.486836424 + }, + { + "epoch": 1.4536170212765958, + "grad_norm": 0.682476818561554, + "loss": 5.0294, + "lr": 0.0003534265734265734, + "step": 5124, + "tokens_trained": 0.48702528 + }, + { + "epoch": 1.4539007092198581, + "eval_loss": 4.962060451507568, + "eval_runtime": 20.8404, + "step": 5125, + "tokens_trained": 0.487120248 + }, + { + "epoch": 1.4541843971631205, + "grad_norm": 0.7397556900978088, + "loss": 4.9378, + "lr": 0.0003531468531468531, + "step": 5126, + "tokens_trained": 0.487216208 + }, + { + "epoch": 1.4547517730496453, + "grad_norm": 0.8119034171104431, + "loss": 4.9408, + "lr": 0.0003528671328671329, + "step": 5128, + "tokens_trained": 0.487405496 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 0.7072781324386597, + "loss": 4.9543, + "lr": 0.0003525874125874126, + "step": 5130, + "tokens_trained": 0.487596552 + }, + { + "epoch": 1.455886524822695, + "grad_norm": 0.839381217956543, + "loss": 4.9204, + "lr": 0.00035230769230769235, + "step": 5132, + "tokens_trained": 0.487786936 + }, + { + "epoch": 1.4564539007092199, + "grad_norm": 0.8116129636764526, + "loss": 4.8954, + "lr": 0.00035202797202797204, + "step": 5134, + "tokens_trained": 0.48797788 + }, + { + "epoch": 1.4570212765957447, + "grad_norm": 0.6917586326599121, + "loss": 4.9926, + "lr": 0.0003517482517482518, + "step": 5136, + "tokens_trained": 0.488167992 + }, + { + "epoch": 1.4575886524822694, + "grad_norm": 0.7610443830490112, + "loss": 4.9908, + "lr": 0.0003514685314685315, + "step": 5138, + "tokens_trained": 0.488357824 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 0.6879466772079468, + "loss": 4.9405, + "lr": 0.00035118881118881116, + "step": 5140, + "tokens_trained": 0.488546976 + }, + { + "epoch": 1.4587234042553192, + "grad_norm": 0.7296876311302185, + "loss": 4.9302, + "lr": 0.0003509090909090909, + "step": 5142, + "tokens_trained": 0.488736728 + }, + { + "epoch": 1.459290780141844, + "grad_norm": 0.726078987121582, + "loss": 4.953, + "lr": 0.0003506293706293706, + "step": 5144, + "tokens_trained": 0.488924216 + }, + { + "epoch": 1.4598581560283688, + "grad_norm": 0.7201434969902039, + "loss": 4.9273, + "lr": 0.0003503496503496504, + "step": 5146, + "tokens_trained": 0.489115432 + }, + { + "epoch": 1.4604255319148935, + "grad_norm": 0.7064175605773926, + "loss": 4.878, + "lr": 0.0003500699300699301, + "step": 5148, + "tokens_trained": 0.489305352 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 0.6968701481819153, + "loss": 4.9705, + "lr": 0.00034979020979020983, + "step": 5150, + "tokens_trained": 0.489495768 + }, + { + "epoch": 1.4615602836879433, + "grad_norm": 0.6772524118423462, + "loss": 4.8796, + "lr": 0.0003495104895104895, + "step": 5152, + "tokens_trained": 0.489685904 + }, + { + "epoch": 1.462127659574468, + "grad_norm": 0.6674978137016296, + "loss": 4.9093, + "lr": 0.00034923076923076927, + "step": 5154, + "tokens_trained": 0.489876272 + }, + { + "epoch": 1.4626950354609929, + "grad_norm": 0.7403509616851807, + "loss": 4.9312, + "lr": 0.00034895104895104896, + "step": 5156, + "tokens_trained": 0.490066952 + }, + { + "epoch": 1.4632624113475177, + "grad_norm": 0.7287116050720215, + "loss": 4.9127, + "lr": 0.00034867132867132865, + "step": 5158, + "tokens_trained": 0.490257024 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.7658629417419434, + "loss": 4.9002, + "lr": 0.0003483916083916084, + "step": 5160, + "tokens_trained": 0.490445832 + }, + { + "epoch": 1.4643971631205674, + "grad_norm": 0.7551032304763794, + "loss": 4.9557, + "lr": 0.0003481118881118881, + "step": 5162, + "tokens_trained": 0.490636 + }, + { + "epoch": 1.4649645390070922, + "grad_norm": 0.6556824445724487, + "loss": 4.968, + "lr": 0.0003478321678321678, + "step": 5164, + "tokens_trained": 0.490826024 + }, + { + "epoch": 1.465531914893617, + "grad_norm": 0.6782544255256653, + "loss": 4.9682, + "lr": 0.00034755244755244757, + "step": 5166, + "tokens_trained": 0.49101488 + }, + { + "epoch": 1.4660992907801418, + "grad_norm": 0.6604062914848328, + "loss": 4.9523, + "lr": 0.0003472727272727273, + "step": 5168, + "tokens_trained": 0.491205576 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.7062127590179443, + "loss": 4.9148, + "lr": 0.000346993006993007, + "step": 5170, + "tokens_trained": 0.491397208 + }, + { + "epoch": 1.4672340425531916, + "grad_norm": 0.7056037187576294, + "loss": 4.9568, + "lr": 0.00034671328671328675, + "step": 5172, + "tokens_trained": 0.491588976 + }, + { + "epoch": 1.4678014184397163, + "grad_norm": 0.6791336536407471, + "loss": 4.9242, + "lr": 0.00034643356643356644, + "step": 5174, + "tokens_trained": 0.491779592 + }, + { + "epoch": 1.4683687943262411, + "grad_norm": 0.694888710975647, + "loss": 4.9874, + "lr": 0.00034615384615384613, + "step": 5176, + "tokens_trained": 0.49197004 + }, + { + "epoch": 1.468936170212766, + "grad_norm": 0.7048712968826294, + "loss": 4.914, + "lr": 0.0003458741258741259, + "step": 5178, + "tokens_trained": 0.492157104 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 0.6525787711143494, + "loss": 4.9394, + "lr": 0.00034559440559440557, + "step": 5180, + "tokens_trained": 0.492347352 + }, + { + "epoch": 1.4700709219858157, + "grad_norm": 0.719822883605957, + "loss": 4.9142, + "lr": 0.0003453146853146853, + "step": 5182, + "tokens_trained": 0.492540024 + }, + { + "epoch": 1.4706382978723405, + "grad_norm": 0.6324074268341064, + "loss": 5.0143, + "lr": 0.00034503496503496506, + "step": 5184, + "tokens_trained": 0.49273128 + }, + { + "epoch": 1.4712056737588652, + "grad_norm": 0.7017198204994202, + "loss": 4.9955, + "lr": 0.0003447552447552448, + "step": 5186, + "tokens_trained": 0.492921624 + }, + { + "epoch": 1.47177304964539, + "grad_norm": 0.6721644997596741, + "loss": 4.8586, + "lr": 0.0003444755244755245, + "step": 5188, + "tokens_trained": 0.493110224 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 0.6856983304023743, + "loss": 4.9897, + "lr": 0.0003441958041958042, + "step": 5190, + "tokens_trained": 0.493301296 + }, + { + "epoch": 1.4729078014184398, + "grad_norm": 0.7391275763511658, + "loss": 4.9807, + "lr": 0.0003439160839160839, + "step": 5192, + "tokens_trained": 0.493490904 + }, + { + "epoch": 1.4734751773049646, + "grad_norm": 0.7362062931060791, + "loss": 4.9474, + "lr": 0.0003436363636363636, + "step": 5194, + "tokens_trained": 0.493680368 + }, + { + "epoch": 1.4740425531914894, + "grad_norm": 0.7283117175102234, + "loss": 4.9375, + "lr": 0.00034335664335664336, + "step": 5196, + "tokens_trained": 0.493869344 + }, + { + "epoch": 1.4746099290780141, + "grad_norm": 0.6644704937934875, + "loss": 5.0159, + "lr": 0.00034307692307692305, + "step": 5198, + "tokens_trained": 0.494059664 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 0.7252303957939148, + "loss": 4.9307, + "lr": 0.0003427972027972028, + "step": 5200, + "tokens_trained": 0.494250216 + }, + { + "epoch": 1.4757446808510637, + "grad_norm": 0.6854642033576965, + "loss": 4.9407, + "lr": 0.00034251748251748254, + "step": 5202, + "tokens_trained": 0.494437472 + }, + { + "epoch": 1.4763120567375887, + "grad_norm": 0.7645247578620911, + "loss": 4.9242, + "lr": 0.0003422377622377623, + "step": 5204, + "tokens_trained": 0.494627872 + }, + { + "epoch": 1.4768794326241135, + "grad_norm": 0.7982824444770813, + "loss": 4.9241, + "lr": 0.000341958041958042, + "step": 5206, + "tokens_trained": 0.494819832 + }, + { + "epoch": 1.4774468085106383, + "grad_norm": 0.7241318225860596, + "loss": 4.9255, + "lr": 0.00034167832167832167, + "step": 5208, + "tokens_trained": 0.495010952 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 0.7253429293632507, + "loss": 4.9332, + "lr": 0.0003413986013986014, + "step": 5210, + "tokens_trained": 0.49520108 + }, + { + "epoch": 1.478581560283688, + "grad_norm": 0.7978675365447998, + "loss": 4.9115, + "lr": 0.0003411188811188811, + "step": 5212, + "tokens_trained": 0.495390568 + }, + { + "epoch": 1.4791489361702128, + "grad_norm": 0.7228849530220032, + "loss": 4.9242, + "lr": 0.00034083916083916084, + "step": 5214, + "tokens_trained": 0.495579944 + }, + { + "epoch": 1.4797163120567376, + "grad_norm": 0.6821274757385254, + "loss": 4.9012, + "lr": 0.00034055944055944054, + "step": 5216, + "tokens_trained": 0.495770832 + }, + { + "epoch": 1.4802836879432624, + "grad_norm": 0.7085686922073364, + "loss": 4.8695, + "lr": 0.0003402797202797203, + "step": 5218, + "tokens_trained": 0.495959744 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.6809284090995789, + "loss": 4.9484, + "lr": 0.00034, + "step": 5220, + "tokens_trained": 0.496149904 + }, + { + "epoch": 1.481418439716312, + "grad_norm": 0.8035463690757751, + "loss": 4.9195, + "lr": 0.00033972027972027977, + "step": 5222, + "tokens_trained": 0.496339944 + }, + { + "epoch": 1.481985815602837, + "grad_norm": 0.6803924441337585, + "loss": 5.0083, + "lr": 0.00033944055944055946, + "step": 5224, + "tokens_trained": 0.49653112 + }, + { + "epoch": 1.4825531914893617, + "grad_norm": 0.7047116756439209, + "loss": 4.9358, + "lr": 0.00033916083916083915, + "step": 5226, + "tokens_trained": 0.496722816 + }, + { + "epoch": 1.4831205673758865, + "grad_norm": 0.6624785661697388, + "loss": 4.9749, + "lr": 0.0003388811188811189, + "step": 5228, + "tokens_trained": 0.496914168 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 0.7224833965301514, + "loss": 4.9043, + "lr": 0.0003386013986013986, + "step": 5230, + "tokens_trained": 0.497105128 + }, + { + "epoch": 1.4842553191489363, + "grad_norm": 0.7224262952804565, + "loss": 4.9132, + "lr": 0.00033832167832167833, + "step": 5232, + "tokens_trained": 0.497294728 + }, + { + "epoch": 1.484822695035461, + "grad_norm": 0.7377181053161621, + "loss": 4.9376, + "lr": 0.000338041958041958, + "step": 5234, + "tokens_trained": 0.49748684 + }, + { + "epoch": 1.4853900709219858, + "grad_norm": 0.6763118505477905, + "loss": 4.8603, + "lr": 0.00033776223776223776, + "step": 5236, + "tokens_trained": 0.49767768 + }, + { + "epoch": 1.4859574468085106, + "grad_norm": 0.6546086668968201, + "loss": 4.9397, + "lr": 0.0003374825174825175, + "step": 5238, + "tokens_trained": 0.49786732 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 0.6710076928138733, + "loss": 4.9352, + "lr": 0.00033720279720279725, + "step": 5240, + "tokens_trained": 0.49805788 + }, + { + "epoch": 1.4870921985815602, + "grad_norm": 0.6867020726203918, + "loss": 4.9736, + "lr": 0.00033692307692307694, + "step": 5242, + "tokens_trained": 0.498249072 + }, + { + "epoch": 1.4876595744680852, + "grad_norm": 0.7198293209075928, + "loss": 4.951, + "lr": 0.00033664335664335663, + "step": 5244, + "tokens_trained": 0.498438568 + }, + { + "epoch": 1.48822695035461, + "grad_norm": 0.7505615949630737, + "loss": 4.9478, + "lr": 0.0003363636363636364, + "step": 5246, + "tokens_trained": 0.498628128 + }, + { + "epoch": 1.4887943262411347, + "grad_norm": 0.7085391879081726, + "loss": 4.9528, + "lr": 0.00033608391608391607, + "step": 5248, + "tokens_trained": 0.498817976 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 0.6769458651542664, + "loss": 4.9422, + "lr": 0.0003358041958041958, + "step": 5250, + "tokens_trained": 0.499007344 + }, + { + "epoch": 1.4893617021276595, + "eval_loss": 4.961794376373291, + "eval_runtime": 20.5965, + "step": 5250, + "tokens_trained": 0.499007344 + } + ], + "logging_steps": 2, + "max_steps": 7650, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 125, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}