diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15321 @@ +{ + "best_metric": 0.26657968759536743, + "best_model_checkpoint": "/scratch/czm5kz/llama2-7b_8_500_0.0003_sg_finetuned_combined/checkpoint-8480", + "epoch": 500.0, + "eval_steps": 20, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.29, + "grad_norm": 5.123568058013916, + "learning_rate": 0.0002998588235294117, + "loss": 5.4502, + "step": 5 + }, + { + "epoch": 0.59, + "grad_norm": 1.3123226165771484, + "learning_rate": 0.00029975294117647055, + "loss": 5.0514, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 2.6887731552124023, + "learning_rate": 0.0002995764705882353, + "loss": 4.4102, + "step": 15 + }, + { + "epoch": 1.18, + "grad_norm": 2.039410352706909, + "learning_rate": 0.00029939999999999996, + "loss": 3.7543, + "step": 20 + }, + { + "epoch": 1.18, + "eval_loss": 3.3241162300109863, + "eval_runtime": 1.9296, + "eval_samples_per_second": 69.444, + "eval_steps_per_second": 8.81, + "step": 20 + }, + { + "epoch": 1.47, + "grad_norm": 2.350649833679199, + "learning_rate": 0.0002992235294117647, + "loss": 3.0935, + "step": 25 + }, + { + "epoch": 1.76, + "grad_norm": 1.9300975799560547, + "learning_rate": 0.00029904705882352937, + "loss": 2.7956, + "step": 30 + }, + { + "epoch": 2.06, + "grad_norm": 2.085968017578125, + "learning_rate": 0.0002988705882352941, + "loss": 2.5441, + "step": 35 + }, + { + "epoch": 2.35, + "grad_norm": 2.5128731727600098, + "learning_rate": 0.0002986941176470588, + "loss": 2.3348, + "step": 40 + }, + { + "epoch": 2.35, + "eval_loss": 2.127983331680298, + "eval_runtime": 1.9339, + "eval_samples_per_second": 69.29, + "eval_steps_per_second": 8.791, + "step": 40 + }, + { + "epoch": 2.65, + "grad_norm": 3.9887444972991943, + "learning_rate": 0.0002985176470588235, + "loss": 2.0991, + "step": 45 + }, + { + "epoch": 2.94, + "grad_norm": 2.811306953430176, + "learning_rate": 0.00029834117647058823, + "loss": 2.007, + "step": 50 + }, + { + "epoch": 3.24, + "grad_norm": 4.2949604988098145, + "learning_rate": 0.0002981647058823529, + "loss": 1.5835, + "step": 55 + }, + { + "epoch": 3.53, + "grad_norm": 3.780212163925171, + "learning_rate": 0.0002979882352941176, + "loss": 1.5016, + "step": 60 + }, + { + "epoch": 3.53, + "eval_loss": 1.2937872409820557, + "eval_runtime": 1.9306, + "eval_samples_per_second": 69.407, + "eval_steps_per_second": 8.805, + "step": 60 + }, + { + "epoch": 3.82, + "grad_norm": 4.925411701202393, + "learning_rate": 0.0002978117647058823, + "loss": 1.484, + "step": 65 + }, + { + "epoch": 4.12, + "grad_norm": 4.242525577545166, + "learning_rate": 0.00029763529411764705, + "loss": 1.1715, + "step": 70 + }, + { + "epoch": 4.41, + "grad_norm": 6.573999881744385, + "learning_rate": 0.0002974588235294117, + "loss": 0.9396, + "step": 75 + }, + { + "epoch": 4.71, + "grad_norm": 4.906064510345459, + "learning_rate": 0.00029728235294117645, + "loss": 0.9192, + "step": 80 + }, + { + "epoch": 4.71, + "eval_loss": 0.7313734292984009, + "eval_runtime": 1.9355, + "eval_samples_per_second": 69.234, + "eval_steps_per_second": 8.783, + "step": 80 + }, + { + "epoch": 5.0, + "grad_norm": 4.575896739959717, + "learning_rate": 0.00029710588235294113, + "loss": 0.9155, + "step": 85 + }, + { + "epoch": 5.29, + "grad_norm": 4.148166179656982, + "learning_rate": 0.00029692941176470586, + "loss": 0.6521, + "step": 90 + }, + { + "epoch": 5.59, + "grad_norm": 5.4380974769592285, + "learning_rate": 0.0002967529411764706, + "loss": 0.7093, + "step": 95 + }, + { + "epoch": 5.88, + "grad_norm": 3.571953535079956, + "learning_rate": 0.00029657647058823526, + "loss": 0.6178, + "step": 100 + }, + { + "epoch": 5.88, + "eval_loss": 0.5012210011482239, + "eval_runtime": 1.9386, + "eval_samples_per_second": 69.121, + "eval_steps_per_second": 8.769, + "step": 100 + }, + { + "epoch": 6.18, + "grad_norm": 3.71437406539917, + "learning_rate": 0.0002964, + "loss": 0.6074, + "step": 105 + }, + { + "epoch": 6.47, + "grad_norm": 4.273383617401123, + "learning_rate": 0.00029622352941176467, + "loss": 0.4938, + "step": 110 + }, + { + "epoch": 6.76, + "grad_norm": 4.195381164550781, + "learning_rate": 0.00029604705882352935, + "loss": 0.5625, + "step": 115 + }, + { + "epoch": 7.06, + "grad_norm": 2.1271698474884033, + "learning_rate": 0.0002958705882352941, + "loss": 0.5118, + "step": 120 + }, + { + "epoch": 7.06, + "eval_loss": 0.4166089594364166, + "eval_runtime": 1.941, + "eval_samples_per_second": 69.035, + "eval_steps_per_second": 8.758, + "step": 120 + }, + { + "epoch": 7.35, + "grad_norm": 2.2126407623291016, + "learning_rate": 0.0002956941176470588, + "loss": 0.4213, + "step": 125 + }, + { + "epoch": 7.65, + "grad_norm": 3.431960105895996, + "learning_rate": 0.0002955176470588235, + "loss": 0.443, + "step": 130 + }, + { + "epoch": 7.94, + "grad_norm": 2.0391957759857178, + "learning_rate": 0.0002953411764705882, + "loss": 0.4688, + "step": 135 + }, + { + "epoch": 8.24, + "grad_norm": 3.118192434310913, + "learning_rate": 0.0002951647058823529, + "loss": 0.4485, + "step": 140 + }, + { + "epoch": 8.24, + "eval_loss": 0.3702048063278198, + "eval_runtime": 1.9422, + "eval_samples_per_second": 68.993, + "eval_steps_per_second": 8.753, + "step": 140 + }, + { + "epoch": 8.53, + "grad_norm": 2.033215284347534, + "learning_rate": 0.0002949882352941176, + "loss": 0.3868, + "step": 145 + }, + { + "epoch": 8.82, + "grad_norm": 3.0348260402679443, + "learning_rate": 0.00029481176470588235, + "loss": 0.4512, + "step": 150 + }, + { + "epoch": 9.12, + "grad_norm": 1.7389098405838013, + "learning_rate": 0.000294635294117647, + "loss": 0.4194, + "step": 155 + }, + { + "epoch": 9.41, + "grad_norm": 3.17331862449646, + "learning_rate": 0.00029445882352941176, + "loss": 0.3609, + "step": 160 + }, + { + "epoch": 9.41, + "eval_loss": 0.3844582736492157, + "eval_runtime": 1.9445, + "eval_samples_per_second": 68.912, + "eval_steps_per_second": 8.743, + "step": 160 + }, + { + "epoch": 9.71, + "grad_norm": 2.272548198699951, + "learning_rate": 0.00029428235294117643, + "loss": 0.4528, + "step": 165 + }, + { + "epoch": 10.0, + "grad_norm": 2.1458122730255127, + "learning_rate": 0.0002941058823529411, + "loss": 0.4474, + "step": 170 + }, + { + "epoch": 10.29, + "grad_norm": 4.103682041168213, + "learning_rate": 0.0002939294117647059, + "loss": 0.3669, + "step": 175 + }, + { + "epoch": 10.59, + "grad_norm": 5.2478508949279785, + "learning_rate": 0.00029375294117647057, + "loss": 0.404, + "step": 180 + }, + { + "epoch": 10.59, + "eval_loss": 0.35352790355682373, + "eval_runtime": 1.9457, + "eval_samples_per_second": 68.869, + "eval_steps_per_second": 8.737, + "step": 180 + }, + { + "epoch": 10.88, + "grad_norm": 2.0928146839141846, + "learning_rate": 0.00029357647058823525, + "loss": 0.4123, + "step": 185 + }, + { + "epoch": 11.18, + "grad_norm": 1.1831574440002441, + "learning_rate": 0.0002934, + "loss": 0.3525, + "step": 190 + }, + { + "epoch": 11.47, + "grad_norm": 2.2243642807006836, + "learning_rate": 0.00029322352941176465, + "loss": 0.363, + "step": 195 + }, + { + "epoch": 11.76, + "grad_norm": 2.043579578399658, + "learning_rate": 0.0002930470588235294, + "loss": 0.3756, + "step": 200 + }, + { + "epoch": 11.76, + "eval_loss": 0.32814812660217285, + "eval_runtime": 1.9447, + "eval_samples_per_second": 68.905, + "eval_steps_per_second": 8.742, + "step": 200 + }, + { + "epoch": 12.06, + "grad_norm": 1.6755690574645996, + "learning_rate": 0.0002928705882352941, + "loss": 0.435, + "step": 205 + }, + { + "epoch": 12.35, + "grad_norm": 1.916886329650879, + "learning_rate": 0.0002926941176470588, + "loss": 0.3436, + "step": 210 + }, + { + "epoch": 12.65, + "grad_norm": 1.5790131092071533, + "learning_rate": 0.0002925176470588235, + "loss": 0.3899, + "step": 215 + }, + { + "epoch": 12.94, + "grad_norm": 1.0257099866867065, + "learning_rate": 0.0002923411764705882, + "loss": 0.3519, + "step": 220 + }, + { + "epoch": 12.94, + "eval_loss": 0.3028455078601837, + "eval_runtime": 1.9479, + "eval_samples_per_second": 68.792, + "eval_steps_per_second": 8.727, + "step": 220 + }, + { + "epoch": 13.24, + "grad_norm": 1.0522886514663696, + "learning_rate": 0.0002921647058823529, + "loss": 0.3456, + "step": 225 + }, + { + "epoch": 13.53, + "grad_norm": 1.4017285108566284, + "learning_rate": 0.00029198823529411766, + "loss": 0.335, + "step": 230 + }, + { + "epoch": 13.82, + "grad_norm": 1.4035940170288086, + "learning_rate": 0.00029181176470588233, + "loss": 0.368, + "step": 235 + }, + { + "epoch": 14.12, + "grad_norm": 1.5863103866577148, + "learning_rate": 0.000291635294117647, + "loss": 0.3771, + "step": 240 + }, + { + "epoch": 14.12, + "eval_loss": 0.3220466077327728, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.656, + "eval_steps_per_second": 8.71, + "step": 240 + }, + { + "epoch": 14.41, + "grad_norm": 1.2461905479431152, + "learning_rate": 0.00029145882352941174, + "loss": 0.3568, + "step": 245 + }, + { + "epoch": 14.71, + "grad_norm": 1.529754877090454, + "learning_rate": 0.0002912823529411764, + "loss": 0.3526, + "step": 250 + }, + { + "epoch": 15.0, + "grad_norm": 1.448266863822937, + "learning_rate": 0.00029110588235294114, + "loss": 0.3577, + "step": 255 + }, + { + "epoch": 15.29, + "grad_norm": 1.3801586627960205, + "learning_rate": 0.0002909294117647059, + "loss": 0.3383, + "step": 260 + }, + { + "epoch": 15.29, + "eval_loss": 0.31107404828071594, + "eval_runtime": 1.9498, + "eval_samples_per_second": 68.725, + "eval_steps_per_second": 8.719, + "step": 260 + }, + { + "epoch": 15.59, + "grad_norm": 1.309239149093628, + "learning_rate": 0.00029075294117647055, + "loss": 0.3184, + "step": 265 + }, + { + "epoch": 15.88, + "grad_norm": 2.740022897720337, + "learning_rate": 0.0002905764705882353, + "loss": 0.3344, + "step": 270 + }, + { + "epoch": 16.18, + "grad_norm": 0.921286940574646, + "learning_rate": 0.00029039999999999996, + "loss": 0.3644, + "step": 275 + }, + { + "epoch": 16.47, + "grad_norm": 1.0704553127288818, + "learning_rate": 0.0002902235294117647, + "loss": 0.3167, + "step": 280 + }, + { + "epoch": 16.47, + "eval_loss": 0.30516672134399414, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.674, + "eval_steps_per_second": 8.712, + "step": 280 + }, + { + "epoch": 16.76, + "grad_norm": 1.0932073593139648, + "learning_rate": 0.0002900470588235294, + "loss": 0.3273, + "step": 285 + }, + { + "epoch": 17.06, + "grad_norm": 2.395887613296509, + "learning_rate": 0.0002898705882352941, + "loss": 0.3804, + "step": 290 + }, + { + "epoch": 17.35, + "grad_norm": 1.1885228157043457, + "learning_rate": 0.0002896941176470588, + "loss": 0.2905, + "step": 295 + }, + { + "epoch": 17.65, + "grad_norm": 1.0925136804580688, + "learning_rate": 0.0002895176470588235, + "loss": 0.3446, + "step": 300 + }, + { + "epoch": 17.65, + "eval_loss": 0.2982863187789917, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.629, + "eval_steps_per_second": 8.707, + "step": 300 + }, + { + "epoch": 17.94, + "grad_norm": 0.7705384492874146, + "learning_rate": 0.0002893411764705882, + "loss": 0.3229, + "step": 305 + }, + { + "epoch": 18.24, + "grad_norm": 2.2011444568634033, + "learning_rate": 0.0002891647058823529, + "loss": 0.3189, + "step": 310 + }, + { + "epoch": 18.53, + "grad_norm": 0.9696877002716064, + "learning_rate": 0.00028898823529411764, + "loss": 0.3199, + "step": 315 + }, + { + "epoch": 18.82, + "grad_norm": 0.8738492131233215, + "learning_rate": 0.0002888117647058823, + "loss": 0.3446, + "step": 320 + }, + { + "epoch": 18.82, + "eval_loss": 0.2908709645271301, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.66, + "eval_steps_per_second": 8.711, + "step": 320 + }, + { + "epoch": 19.12, + "grad_norm": 0.704424262046814, + "learning_rate": 0.00028863529411764704, + "loss": 0.327, + "step": 325 + }, + { + "epoch": 19.41, + "grad_norm": 0.8467631936073303, + "learning_rate": 0.0002884588235294117, + "loss": 0.2996, + "step": 330 + }, + { + "epoch": 19.71, + "grad_norm": 0.9181163907051086, + "learning_rate": 0.00028828235294117645, + "loss": 0.3622, + "step": 335 + }, + { + "epoch": 20.0, + "grad_norm": 1.2167885303497314, + "learning_rate": 0.0002881058823529412, + "loss": 0.3641, + "step": 340 + }, + { + "epoch": 20.0, + "eval_loss": 0.29205262660980225, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.652, + "eval_steps_per_second": 8.71, + "step": 340 + }, + { + "epoch": 20.29, + "grad_norm": 0.892876923084259, + "learning_rate": 0.00028792941176470586, + "loss": 0.3, + "step": 345 + }, + { + "epoch": 20.59, + "grad_norm": 0.9650051593780518, + "learning_rate": 0.0002877529411764706, + "loss": 0.3102, + "step": 350 + }, + { + "epoch": 20.88, + "grad_norm": 1.0151336193084717, + "learning_rate": 0.00028757647058823526, + "loss": 0.3238, + "step": 355 + }, + { + "epoch": 21.18, + "grad_norm": 0.8064928650856018, + "learning_rate": 0.00028739999999999994, + "loss": 0.3063, + "step": 360 + }, + { + "epoch": 21.18, + "eval_loss": 0.284064382314682, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.708, + "step": 360 + }, + { + "epoch": 21.47, + "grad_norm": 0.7681912183761597, + "learning_rate": 0.0002872235294117647, + "loss": 0.3043, + "step": 365 + }, + { + "epoch": 21.76, + "grad_norm": 0.8123140931129456, + "learning_rate": 0.0002870470588235294, + "loss": 0.3112, + "step": 370 + }, + { + "epoch": 22.06, + "grad_norm": 0.7973850965499878, + "learning_rate": 0.0002868705882352941, + "loss": 0.3505, + "step": 375 + }, + { + "epoch": 22.35, + "grad_norm": 1.6452568769454956, + "learning_rate": 0.0002866941176470588, + "loss": 0.2959, + "step": 380 + }, + { + "epoch": 22.35, + "eval_loss": 0.28800833225250244, + "eval_runtime": 1.9722, + "eval_samples_per_second": 67.945, + "eval_steps_per_second": 8.62, + "step": 380 + }, + { + "epoch": 22.65, + "grad_norm": 0.9361231327056885, + "learning_rate": 0.0002865176470588235, + "loss": 0.2984, + "step": 385 + }, + { + "epoch": 22.94, + "grad_norm": 0.6799980998039246, + "learning_rate": 0.0002863411764705882, + "loss": 0.329, + "step": 390 + }, + { + "epoch": 23.24, + "grad_norm": 2.908402919769287, + "learning_rate": 0.00028616470588235294, + "loss": 0.2983, + "step": 395 + }, + { + "epoch": 23.53, + "grad_norm": 0.6607904434204102, + "learning_rate": 0.0002859882352941176, + "loss": 0.317, + "step": 400 + }, + { + "epoch": 23.53, + "eval_loss": 0.28526613116264343, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 8.708, + "step": 400 + }, + { + "epoch": 23.82, + "grad_norm": 1.24152672290802, + "learning_rate": 0.00028581176470588235, + "loss": 0.3026, + "step": 405 + }, + { + "epoch": 24.12, + "grad_norm": 0.6178376078605652, + "learning_rate": 0.000285635294117647, + "loss": 0.3154, + "step": 410 + }, + { + "epoch": 24.41, + "grad_norm": 0.6836390495300293, + "learning_rate": 0.00028545882352941175, + "loss": 0.317, + "step": 415 + }, + { + "epoch": 24.71, + "grad_norm": 0.7111595273017883, + "learning_rate": 0.0002852823529411765, + "loss": 0.3171, + "step": 420 + }, + { + "epoch": 24.71, + "eval_loss": 0.2867472469806671, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.709, + "step": 420 + }, + { + "epoch": 25.0, + "grad_norm": 3.3374156951904297, + "learning_rate": 0.00028510588235294116, + "loss": 0.3041, + "step": 425 + }, + { + "epoch": 25.29, + "grad_norm": 0.8345808386802673, + "learning_rate": 0.00028492941176470584, + "loss": 0.2897, + "step": 430 + }, + { + "epoch": 25.59, + "grad_norm": 1.0496177673339844, + "learning_rate": 0.00028475294117647057, + "loss": 0.2931, + "step": 435 + }, + { + "epoch": 25.88, + "grad_norm": 1.5688951015472412, + "learning_rate": 0.00028457647058823524, + "loss": 0.3332, + "step": 440 + }, + { + "epoch": 25.88, + "eval_loss": 0.2801815867424011, + "eval_runtime": 1.9536, + "eval_samples_per_second": 68.591, + "eval_steps_per_second": 8.702, + "step": 440 + }, + { + "epoch": 26.18, + "grad_norm": 0.9920473098754883, + "learning_rate": 0.0002844, + "loss": 0.3088, + "step": 445 + }, + { + "epoch": 26.47, + "grad_norm": 0.7296784520149231, + "learning_rate": 0.0002842235294117647, + "loss": 0.2866, + "step": 450 + }, + { + "epoch": 26.76, + "grad_norm": 0.5926627516746521, + "learning_rate": 0.0002840470588235294, + "loss": 0.3044, + "step": 455 + }, + { + "epoch": 27.06, + "grad_norm": 5.869891166687012, + "learning_rate": 0.0002838705882352941, + "loss": 0.3191, + "step": 460 + }, + { + "epoch": 27.06, + "eval_loss": 0.28161150217056274, + "eval_runtime": 1.9639, + "eval_samples_per_second": 68.231, + "eval_steps_per_second": 8.656, + "step": 460 + }, + { + "epoch": 27.35, + "grad_norm": 0.6099969148635864, + "learning_rate": 0.0002836941176470588, + "loss": 0.2823, + "step": 465 + }, + { + "epoch": 27.65, + "grad_norm": 0.8896517157554626, + "learning_rate": 0.0002835176470588235, + "loss": 0.3265, + "step": 470 + }, + { + "epoch": 27.94, + "grad_norm": 0.9603505730628967, + "learning_rate": 0.00028334117647058825, + "loss": 0.3238, + "step": 475 + }, + { + "epoch": 28.24, + "grad_norm": 1.5040206909179688, + "learning_rate": 0.0002831647058823529, + "loss": 0.325, + "step": 480 + }, + { + "epoch": 28.24, + "eval_loss": 0.2924734652042389, + "eval_runtime": 1.9531, + "eval_samples_per_second": 68.61, + "eval_steps_per_second": 8.704, + "step": 480 + }, + { + "epoch": 28.53, + "grad_norm": 0.844178318977356, + "learning_rate": 0.00028298823529411765, + "loss": 0.312, + "step": 485 + }, + { + "epoch": 28.82, + "grad_norm": 0.6218575835227966, + "learning_rate": 0.00028281176470588233, + "loss": 0.2908, + "step": 490 + }, + { + "epoch": 29.12, + "grad_norm": 0.6082823276519775, + "learning_rate": 0.000282635294117647, + "loss": 0.3179, + "step": 495 + }, + { + "epoch": 29.41, + "grad_norm": 0.6815407872200012, + "learning_rate": 0.00028245882352941174, + "loss": 0.3009, + "step": 500 + }, + { + "epoch": 29.41, + "eval_loss": 0.2948806583881378, + "eval_runtime": 1.9574, + "eval_samples_per_second": 68.457, + "eval_steps_per_second": 8.685, + "step": 500 + }, + { + "epoch": 29.71, + "grad_norm": 2.2132604122161865, + "learning_rate": 0.00028228235294117647, + "loss": 0.3173, + "step": 505 + }, + { + "epoch": 30.0, + "grad_norm": 0.98082435131073, + "learning_rate": 0.00028210588235294114, + "loss": 0.3383, + "step": 510 + }, + { + "epoch": 30.29, + "grad_norm": 0.7199916243553162, + "learning_rate": 0.00028192941176470587, + "loss": 0.2971, + "step": 515 + }, + { + "epoch": 30.59, + "grad_norm": 0.8390836715698242, + "learning_rate": 0.00028175294117647055, + "loss": 0.2972, + "step": 520 + }, + { + "epoch": 30.59, + "eval_loss": 0.2906944453716278, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.651, + "eval_steps_per_second": 8.709, + "step": 520 + }, + { + "epoch": 30.88, + "grad_norm": 2.17277193069458, + "learning_rate": 0.0002815764705882353, + "loss": 0.3437, + "step": 525 + }, + { + "epoch": 31.18, + "grad_norm": 0.6228814721107483, + "learning_rate": 0.00028139999999999996, + "loss": 0.2722, + "step": 530 + }, + { + "epoch": 31.47, + "grad_norm": 0.6469265222549438, + "learning_rate": 0.0002812235294117647, + "loss": 0.2897, + "step": 535 + }, + { + "epoch": 31.76, + "grad_norm": 0.8024590015411377, + "learning_rate": 0.0002810470588235294, + "loss": 0.33, + "step": 540 + }, + { + "epoch": 31.76, + "eval_loss": 0.2866993844509125, + "eval_runtime": 1.9637, + "eval_samples_per_second": 68.239, + "eval_steps_per_second": 8.657, + "step": 540 + }, + { + "epoch": 32.06, + "grad_norm": 0.8460168838500977, + "learning_rate": 0.0002808705882352941, + "loss": 0.3225, + "step": 545 + }, + { + "epoch": 32.35, + "grad_norm": 0.5334822535514832, + "learning_rate": 0.00028069411764705877, + "loss": 0.2754, + "step": 550 + }, + { + "epoch": 32.65, + "grad_norm": 0.7353460788726807, + "learning_rate": 0.0002805176470588235, + "loss": 0.303, + "step": 555 + }, + { + "epoch": 32.94, + "grad_norm": 1.0487838983535767, + "learning_rate": 0.00028034117647058823, + "loss": 0.3085, + "step": 560 + }, + { + "epoch": 32.94, + "eval_loss": 0.28210318088531494, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.669, + "eval_steps_per_second": 8.712, + "step": 560 + }, + { + "epoch": 33.24, + "grad_norm": 0.6352580189704895, + "learning_rate": 0.0002801647058823529, + "loss": 0.2946, + "step": 565 + }, + { + "epoch": 33.53, + "grad_norm": 0.8723613023757935, + "learning_rate": 0.00027998823529411763, + "loss": 0.2857, + "step": 570 + }, + { + "epoch": 33.82, + "grad_norm": 1.4288231134414673, + "learning_rate": 0.0002798117647058823, + "loss": 0.3502, + "step": 575 + }, + { + "epoch": 34.12, + "grad_norm": 0.585587203502655, + "learning_rate": 0.00027963529411764704, + "loss": 0.2954, + "step": 580 + }, + { + "epoch": 34.12, + "eval_loss": 0.28595367074012756, + "eval_runtime": 1.956, + "eval_samples_per_second": 68.508, + "eval_steps_per_second": 8.691, + "step": 580 + }, + { + "epoch": 34.41, + "grad_norm": 1.0765429735183716, + "learning_rate": 0.0002794588235294117, + "loss": 0.2998, + "step": 585 + }, + { + "epoch": 34.71, + "grad_norm": 0.6804926991462708, + "learning_rate": 0.00027928235294117645, + "loss": 0.291, + "step": 590 + }, + { + "epoch": 35.0, + "grad_norm": 0.44599294662475586, + "learning_rate": 0.0002791058823529412, + "loss": 0.3057, + "step": 595 + }, + { + "epoch": 35.29, + "grad_norm": 0.7748059034347534, + "learning_rate": 0.00027892941176470585, + "loss": 0.2933, + "step": 600 + }, + { + "epoch": 35.29, + "eval_loss": 0.2836548388004303, + "eval_runtime": 1.9536, + "eval_samples_per_second": 68.59, + "eval_steps_per_second": 8.702, + "step": 600 + }, + { + "epoch": 35.59, + "grad_norm": 0.5564882159233093, + "learning_rate": 0.00027875294117647053, + "loss": 0.3043, + "step": 605 + }, + { + "epoch": 35.88, + "grad_norm": 0.5883165001869202, + "learning_rate": 0.00027857647058823526, + "loss": 0.2938, + "step": 610 + }, + { + "epoch": 36.18, + "grad_norm": 0.42169690132141113, + "learning_rate": 0.0002784, + "loss": 0.302, + "step": 615 + }, + { + "epoch": 36.47, + "grad_norm": 0.4851395785808563, + "learning_rate": 0.00027822352941176467, + "loss": 0.2838, + "step": 620 + }, + { + "epoch": 36.47, + "eval_loss": 0.284923791885376, + "eval_runtime": 1.9512, + "eval_samples_per_second": 68.677, + "eval_steps_per_second": 8.713, + "step": 620 + }, + { + "epoch": 36.76, + "grad_norm": 0.6681527495384216, + "learning_rate": 0.0002780470588235294, + "loss": 0.3358, + "step": 625 + }, + { + "epoch": 37.06, + "grad_norm": 0.483582079410553, + "learning_rate": 0.0002778705882352941, + "loss": 0.3146, + "step": 630 + }, + { + "epoch": 37.35, + "grad_norm": 0.6509492993354797, + "learning_rate": 0.0002776941176470588, + "loss": 0.282, + "step": 635 + }, + { + "epoch": 37.65, + "grad_norm": 0.6812987327575684, + "learning_rate": 0.0002775176470588235, + "loss": 0.2995, + "step": 640 + }, + { + "epoch": 37.65, + "eval_loss": 0.2773895263671875, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.629, + "eval_steps_per_second": 8.707, + "step": 640 + }, + { + "epoch": 37.94, + "grad_norm": 0.6441000699996948, + "learning_rate": 0.0002773411764705882, + "loss": 0.3043, + "step": 645 + }, + { + "epoch": 38.24, + "grad_norm": 0.5422846674919128, + "learning_rate": 0.00027716470588235294, + "loss": 0.2581, + "step": 650 + }, + { + "epoch": 38.53, + "grad_norm": 0.6752066612243652, + "learning_rate": 0.0002769882352941176, + "loss": 0.3076, + "step": 655 + }, + { + "epoch": 38.82, + "grad_norm": 0.6421564817428589, + "learning_rate": 0.00027681176470588235, + "loss": 0.3149, + "step": 660 + }, + { + "epoch": 38.82, + "eval_loss": 0.2785627245903015, + "eval_runtime": 1.9548, + "eval_samples_per_second": 68.551, + "eval_steps_per_second": 8.697, + "step": 660 + }, + { + "epoch": 39.12, + "grad_norm": 1.1004048585891724, + "learning_rate": 0.000276635294117647, + "loss": 0.2719, + "step": 665 + }, + { + "epoch": 39.41, + "grad_norm": 0.5431386828422546, + "learning_rate": 0.00027645882352941175, + "loss": 0.2767, + "step": 670 + }, + { + "epoch": 39.71, + "grad_norm": 0.47728753089904785, + "learning_rate": 0.00027628235294117643, + "loss": 0.279, + "step": 675 + }, + { + "epoch": 40.0, + "grad_norm": 1.076058268547058, + "learning_rate": 0.00027610588235294116, + "loss": 0.3405, + "step": 680 + }, + { + "epoch": 40.0, + "eval_loss": 0.2742210626602173, + "eval_runtime": 1.9541, + "eval_samples_per_second": 68.575, + "eval_steps_per_second": 8.7, + "step": 680 + }, + { + "epoch": 40.29, + "grad_norm": 0.6353278160095215, + "learning_rate": 0.00027592941176470584, + "loss": 0.2748, + "step": 685 + }, + { + "epoch": 40.59, + "grad_norm": 0.6093863248825073, + "learning_rate": 0.00027575294117647057, + "loss": 0.3009, + "step": 690 + }, + { + "epoch": 40.88, + "grad_norm": 0.5480519533157349, + "learning_rate": 0.00027557647058823524, + "loss": 0.2991, + "step": 695 + }, + { + "epoch": 41.18, + "grad_norm": 0.3784177303314209, + "learning_rate": 0.00027539999999999997, + "loss": 0.2837, + "step": 700 + }, + { + "epoch": 41.18, + "eval_loss": 0.27853742241859436, + "eval_runtime": 1.9502, + "eval_samples_per_second": 68.712, + "eval_steps_per_second": 8.717, + "step": 700 + }, + { + "epoch": 41.47, + "grad_norm": 0.5141330361366272, + "learning_rate": 0.0002752235294117647, + "loss": 0.2714, + "step": 705 + }, + { + "epoch": 41.76, + "grad_norm": 0.6757667064666748, + "learning_rate": 0.0002750470588235294, + "loss": 0.3072, + "step": 710 + }, + { + "epoch": 42.06, + "grad_norm": 0.5985231995582581, + "learning_rate": 0.0002748705882352941, + "loss": 0.307, + "step": 715 + }, + { + "epoch": 42.35, + "grad_norm": 0.6961740851402283, + "learning_rate": 0.0002746941176470588, + "loss": 0.3036, + "step": 720 + }, + { + "epoch": 42.35, + "eval_loss": 0.2767024338245392, + "eval_runtime": 1.9536, + "eval_samples_per_second": 68.593, + "eval_steps_per_second": 8.702, + "step": 720 + }, + { + "epoch": 42.65, + "grad_norm": 0.5391381978988647, + "learning_rate": 0.0002745176470588235, + "loss": 0.279, + "step": 725 + }, + { + "epoch": 42.94, + "grad_norm": 0.5576377511024475, + "learning_rate": 0.00027434117647058825, + "loss": 0.3033, + "step": 730 + }, + { + "epoch": 43.24, + "grad_norm": 0.5860000848770142, + "learning_rate": 0.0002741647058823529, + "loss": 0.2788, + "step": 735 + }, + { + "epoch": 43.53, + "grad_norm": 0.4636955261230469, + "learning_rate": 0.0002739882352941176, + "loss": 0.2806, + "step": 740 + }, + { + "epoch": 43.53, + "eval_loss": 0.27654823660850525, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.69, + "eval_steps_per_second": 8.714, + "step": 740 + }, + { + "epoch": 43.82, + "grad_norm": 0.6484056115150452, + "learning_rate": 0.00027381176470588233, + "loss": 0.2992, + "step": 745 + }, + { + "epoch": 44.12, + "grad_norm": 0.6432157158851624, + "learning_rate": 0.000273635294117647, + "loss": 0.3008, + "step": 750 + }, + { + "epoch": 44.41, + "grad_norm": 0.596067488193512, + "learning_rate": 0.00027345882352941173, + "loss": 0.2835, + "step": 755 + }, + { + "epoch": 44.71, + "grad_norm": 0.564532458782196, + "learning_rate": 0.00027328235294117646, + "loss": 0.2872, + "step": 760 + }, + { + "epoch": 44.71, + "eval_loss": 0.2745668292045593, + "eval_runtime": 1.9536, + "eval_samples_per_second": 68.59, + "eval_steps_per_second": 8.702, + "step": 760 + }, + { + "epoch": 45.0, + "grad_norm": 0.7748390436172485, + "learning_rate": 0.00027310588235294114, + "loss": 0.291, + "step": 765 + }, + { + "epoch": 45.29, + "grad_norm": 0.5681774616241455, + "learning_rate": 0.00027292941176470587, + "loss": 0.276, + "step": 770 + }, + { + "epoch": 45.59, + "grad_norm": 0.5081149339675903, + "learning_rate": 0.00027275294117647055, + "loss": 0.2913, + "step": 775 + }, + { + "epoch": 45.88, + "grad_norm": 0.5528217554092407, + "learning_rate": 0.0002725764705882353, + "loss": 0.304, + "step": 780 + }, + { + "epoch": 45.88, + "eval_loss": 0.2734004855155945, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.634, + "eval_steps_per_second": 8.707, + "step": 780 + }, + { + "epoch": 46.18, + "grad_norm": 0.5401836633682251, + "learning_rate": 0.0002724, + "loss": 0.2785, + "step": 785 + }, + { + "epoch": 46.47, + "grad_norm": 0.6658024191856384, + "learning_rate": 0.0002722235294117647, + "loss": 0.2796, + "step": 790 + }, + { + "epoch": 46.76, + "grad_norm": 0.4573308825492859, + "learning_rate": 0.00027204705882352936, + "loss": 0.3, + "step": 795 + }, + { + "epoch": 47.06, + "grad_norm": 0.3817196190357208, + "learning_rate": 0.0002718705882352941, + "loss": 0.2778, + "step": 800 + }, + { + "epoch": 47.06, + "eval_loss": 0.27235957980155945, + "eval_runtime": 1.955, + "eval_samples_per_second": 68.541, + "eval_steps_per_second": 8.696, + "step": 800 + }, + { + "epoch": 47.35, + "grad_norm": 0.5332309603691101, + "learning_rate": 0.00027169411764705877, + "loss": 0.2783, + "step": 805 + }, + { + "epoch": 47.65, + "grad_norm": 0.5310131311416626, + "learning_rate": 0.0002715176470588235, + "loss": 0.268, + "step": 810 + }, + { + "epoch": 47.94, + "grad_norm": 0.7193430066108704, + "learning_rate": 0.0002713411764705882, + "loss": 0.3268, + "step": 815 + }, + { + "epoch": 48.24, + "grad_norm": 0.4694163501262665, + "learning_rate": 0.0002711647058823529, + "loss": 0.2796, + "step": 820 + }, + { + "epoch": 48.24, + "eval_loss": 0.27389633655548096, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.637, + "eval_steps_per_second": 8.708, + "step": 820 + }, + { + "epoch": 48.53, + "grad_norm": 0.4959717094898224, + "learning_rate": 0.00027098823529411763, + "loss": 0.2807, + "step": 825 + }, + { + "epoch": 48.82, + "grad_norm": 0.5667008757591248, + "learning_rate": 0.0002708117647058823, + "loss": 0.2953, + "step": 830 + }, + { + "epoch": 49.12, + "grad_norm": 0.4282715916633606, + "learning_rate": 0.00027063529411764704, + "loss": 0.2978, + "step": 835 + }, + { + "epoch": 49.41, + "grad_norm": 0.5538773536682129, + "learning_rate": 0.00027045882352941177, + "loss": 0.285, + "step": 840 + }, + { + "epoch": 49.41, + "eval_loss": 0.2780298888683319, + "eval_runtime": 1.954, + "eval_samples_per_second": 68.579, + "eval_steps_per_second": 8.7, + "step": 840 + }, + { + "epoch": 49.71, + "grad_norm": 0.6341338157653809, + "learning_rate": 0.00027028235294117645, + "loss": 0.2958, + "step": 845 + }, + { + "epoch": 50.0, + "grad_norm": 0.561484158039093, + "learning_rate": 0.0002701058823529412, + "loss": 0.2914, + "step": 850 + }, + { + "epoch": 50.29, + "grad_norm": 0.5157304406166077, + "learning_rate": 0.00026992941176470585, + "loss": 0.2644, + "step": 855 + }, + { + "epoch": 50.59, + "grad_norm": 0.6163771152496338, + "learning_rate": 0.00026975294117647053, + "loss": 0.3025, + "step": 860 + }, + { + "epoch": 50.59, + "eval_loss": 0.27420225739479065, + "eval_runtime": 1.9541, + "eval_samples_per_second": 68.574, + "eval_steps_per_second": 8.7, + "step": 860 + }, + { + "epoch": 50.88, + "grad_norm": 0.459975928068161, + "learning_rate": 0.00026957647058823526, + "loss": 0.2946, + "step": 865 + }, + { + "epoch": 51.18, + "grad_norm": 0.48077988624572754, + "learning_rate": 0.0002694, + "loss": 0.2789, + "step": 870 + }, + { + "epoch": 51.47, + "grad_norm": 0.4449954628944397, + "learning_rate": 0.00026922352941176466, + "loss": 0.2861, + "step": 875 + }, + { + "epoch": 51.76, + "grad_norm": 0.41058850288391113, + "learning_rate": 0.0002690470588235294, + "loss": 0.3099, + "step": 880 + }, + { + "epoch": 51.76, + "eval_loss": 0.2756434679031372, + "eval_runtime": 1.9546, + "eval_samples_per_second": 68.556, + "eval_steps_per_second": 8.697, + "step": 880 + }, + { + "epoch": 52.06, + "grad_norm": 0.46851596236228943, + "learning_rate": 0.00026887058823529407, + "loss": 0.2753, + "step": 885 + }, + { + "epoch": 52.35, + "grad_norm": 0.42595791816711426, + "learning_rate": 0.0002686941176470588, + "loss": 0.2618, + "step": 890 + }, + { + "epoch": 52.65, + "grad_norm": 0.4970473647117615, + "learning_rate": 0.00026851764705882353, + "loss": 0.2791, + "step": 895 + }, + { + "epoch": 52.94, + "grad_norm": 0.7363806366920471, + "learning_rate": 0.0002683411764705882, + "loss": 0.3301, + "step": 900 + }, + { + "epoch": 52.94, + "eval_loss": 0.2720503509044647, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.664, + "eval_steps_per_second": 8.711, + "step": 900 + }, + { + "epoch": 53.24, + "grad_norm": 0.4821537733078003, + "learning_rate": 0.00026816470588235294, + "loss": 0.272, + "step": 905 + }, + { + "epoch": 53.53, + "grad_norm": 0.46624499559402466, + "learning_rate": 0.0002679882352941176, + "loss": 0.2691, + "step": 910 + }, + { + "epoch": 53.82, + "grad_norm": 0.46243298053741455, + "learning_rate": 0.0002678117647058823, + "loss": 0.3059, + "step": 915 + }, + { + "epoch": 54.12, + "grad_norm": 0.5359281301498413, + "learning_rate": 0.000267635294117647, + "loss": 0.2912, + "step": 920 + }, + { + "epoch": 54.12, + "eval_loss": 0.27168896794319153, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.646, + "eval_steps_per_second": 8.709, + "step": 920 + }, + { + "epoch": 54.41, + "grad_norm": 0.5566356182098389, + "learning_rate": 0.00026745882352941175, + "loss": 0.2745, + "step": 925 + }, + { + "epoch": 54.71, + "grad_norm": 0.5852227210998535, + "learning_rate": 0.00026728235294117643, + "loss": 0.2875, + "step": 930 + }, + { + "epoch": 55.0, + "grad_norm": 0.6479737758636475, + "learning_rate": 0.00026710588235294116, + "loss": 0.3031, + "step": 935 + }, + { + "epoch": 55.29, + "grad_norm": 0.4862912595272064, + "learning_rate": 0.00026692941176470583, + "loss": 0.2759, + "step": 940 + }, + { + "epoch": 55.29, + "eval_loss": 0.27377134561538696, + "eval_runtime": 1.9548, + "eval_samples_per_second": 68.551, + "eval_steps_per_second": 8.697, + "step": 940 + }, + { + "epoch": 55.59, + "grad_norm": 0.38759374618530273, + "learning_rate": 0.00026675294117647056, + "loss": 0.2671, + "step": 945 + }, + { + "epoch": 55.88, + "grad_norm": 0.4383208751678467, + "learning_rate": 0.0002665764705882353, + "loss": 0.2939, + "step": 950 + }, + { + "epoch": 56.18, + "grad_norm": 0.5393241047859192, + "learning_rate": 0.00026639999999999997, + "loss": 0.3155, + "step": 955 + }, + { + "epoch": 56.47, + "grad_norm": 0.44239819049835205, + "learning_rate": 0.0002662235294117647, + "loss": 0.2905, + "step": 960 + }, + { + "epoch": 56.47, + "eval_loss": 0.27207040786743164, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.614, + "eval_steps_per_second": 8.705, + "step": 960 + }, + { + "epoch": 56.76, + "grad_norm": 0.3884721100330353, + "learning_rate": 0.0002660470588235294, + "loss": 0.2843, + "step": 965 + }, + { + "epoch": 57.06, + "grad_norm": 0.4377553164958954, + "learning_rate": 0.00026587058823529405, + "loss": 0.2758, + "step": 970 + }, + { + "epoch": 57.35, + "grad_norm": 0.4321759343147278, + "learning_rate": 0.00026569411764705884, + "loss": 0.2694, + "step": 975 + }, + { + "epoch": 57.65, + "grad_norm": 0.6309795379638672, + "learning_rate": 0.0002655176470588235, + "loss": 0.3099, + "step": 980 + }, + { + "epoch": 57.65, + "eval_loss": 0.2731949985027313, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.611, + "eval_steps_per_second": 8.704, + "step": 980 + }, + { + "epoch": 57.94, + "grad_norm": 0.5466398596763611, + "learning_rate": 0.0002653411764705882, + "loss": 0.2886, + "step": 985 + }, + { + "epoch": 58.24, + "grad_norm": 0.4965384304523468, + "learning_rate": 0.0002651647058823529, + "loss": 0.2715, + "step": 990 + }, + { + "epoch": 58.53, + "grad_norm": 0.5091078281402588, + "learning_rate": 0.0002649882352941176, + "loss": 0.2744, + "step": 995 + }, + { + "epoch": 58.82, + "grad_norm": 0.3896671235561371, + "learning_rate": 0.0002648117647058823, + "loss": 0.2673, + "step": 1000 + }, + { + "epoch": 58.82, + "eval_loss": 0.273307740688324, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.658, + "eval_steps_per_second": 8.71, + "step": 1000 + }, + { + "epoch": 59.12, + "grad_norm": 0.3623935878276825, + "learning_rate": 0.00026463529411764706, + "loss": 0.3197, + "step": 1005 + }, + { + "epoch": 59.41, + "grad_norm": 0.34094226360321045, + "learning_rate": 0.00026445882352941173, + "loss": 0.2598, + "step": 1010 + }, + { + "epoch": 59.71, + "grad_norm": 0.47524309158325195, + "learning_rate": 0.00026428235294117646, + "loss": 0.2951, + "step": 1015 + }, + { + "epoch": 60.0, + "grad_norm": 0.5470165014266968, + "learning_rate": 0.00026410588235294114, + "loss": 0.2984, + "step": 1020 + }, + { + "epoch": 60.0, + "eval_loss": 0.27134189009666443, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.614, + "eval_steps_per_second": 8.705, + "step": 1020 + }, + { + "epoch": 60.29, + "grad_norm": 0.5740846991539001, + "learning_rate": 0.00026392941176470587, + "loss": 0.2845, + "step": 1025 + }, + { + "epoch": 60.59, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0002637529411764706, + "loss": 0.2737, + "step": 1030 + }, + { + "epoch": 60.88, + "grad_norm": 0.39359235763549805, + "learning_rate": 0.0002635764705882353, + "loss": 0.2924, + "step": 1035 + }, + { + "epoch": 61.18, + "grad_norm": 0.420967698097229, + "learning_rate": 0.00026339999999999995, + "loss": 0.2788, + "step": 1040 + }, + { + "epoch": 61.18, + "eval_loss": 0.2706230580806732, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.657, + "eval_steps_per_second": 8.71, + "step": 1040 + }, + { + "epoch": 61.47, + "grad_norm": 0.42066413164138794, + "learning_rate": 0.0002632235294117647, + "loss": 0.2691, + "step": 1045 + }, + { + "epoch": 61.76, + "grad_norm": 0.5592681765556335, + "learning_rate": 0.00026304705882352936, + "loss": 0.3034, + "step": 1050 + }, + { + "epoch": 62.06, + "grad_norm": 0.3660678565502167, + "learning_rate": 0.0002628705882352941, + "loss": 0.2879, + "step": 1055 + }, + { + "epoch": 62.35, + "grad_norm": 0.5801591277122498, + "learning_rate": 0.0002626941176470588, + "loss": 0.2848, + "step": 1060 + }, + { + "epoch": 62.35, + "eval_loss": 0.2737047076225281, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.686, + "eval_steps_per_second": 8.714, + "step": 1060 + }, + { + "epoch": 62.65, + "grad_norm": 0.4952467381954193, + "learning_rate": 0.0002625176470588235, + "loss": 0.301, + "step": 1065 + }, + { + "epoch": 62.94, + "grad_norm": 0.47585591673851013, + "learning_rate": 0.0002623411764705882, + "loss": 0.2767, + "step": 1070 + }, + { + "epoch": 63.24, + "grad_norm": 0.3574976325035095, + "learning_rate": 0.0002621647058823529, + "loss": 0.2954, + "step": 1075 + }, + { + "epoch": 63.53, + "grad_norm": 0.43940040469169617, + "learning_rate": 0.00026198823529411763, + "loss": 0.2707, + "step": 1080 + }, + { + "epoch": 63.53, + "eval_loss": 0.27542659640312195, + "eval_runtime": 1.9664, + "eval_samples_per_second": 68.145, + "eval_steps_per_second": 8.645, + "step": 1080 + }, + { + "epoch": 63.82, + "grad_norm": 0.44467687606811523, + "learning_rate": 0.00026181176470588236, + "loss": 0.284, + "step": 1085 + }, + { + "epoch": 64.12, + "grad_norm": 0.3564075231552124, + "learning_rate": 0.00026163529411764704, + "loss": 0.295, + "step": 1090 + }, + { + "epoch": 64.41, + "grad_norm": 0.4368317425251007, + "learning_rate": 0.00026145882352941177, + "loss": 0.2954, + "step": 1095 + }, + { + "epoch": 64.71, + "grad_norm": 0.44492024183273315, + "learning_rate": 0.00026128235294117644, + "loss": 0.2716, + "step": 1100 + }, + { + "epoch": 64.71, + "eval_loss": 0.27507930994033813, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.632, + "eval_steps_per_second": 8.707, + "step": 1100 + }, + { + "epoch": 65.0, + "grad_norm": 0.5535992383956909, + "learning_rate": 0.0002611058823529411, + "loss": 0.3068, + "step": 1105 + }, + { + "epoch": 65.29, + "grad_norm": 0.42365413904190063, + "learning_rate": 0.00026092941176470585, + "loss": 0.2774, + "step": 1110 + }, + { + "epoch": 65.59, + "grad_norm": 0.5739246010780334, + "learning_rate": 0.0002607529411764706, + "loss": 0.269, + "step": 1115 + }, + { + "epoch": 65.88, + "grad_norm": 0.3806311786174774, + "learning_rate": 0.00026057647058823526, + "loss": 0.2863, + "step": 1120 + }, + { + "epoch": 65.88, + "eval_loss": 0.2721364498138428, + "eval_runtime": 1.9538, + "eval_samples_per_second": 68.586, + "eval_steps_per_second": 8.701, + "step": 1120 + }, + { + "epoch": 66.18, + "grad_norm": 0.43212905526161194, + "learning_rate": 0.0002604, + "loss": 0.2977, + "step": 1125 + }, + { + "epoch": 66.47, + "grad_norm": 0.5891084671020508, + "learning_rate": 0.00026022352941176466, + "loss": 0.2983, + "step": 1130 + }, + { + "epoch": 66.76, + "grad_norm": 0.4200720489025116, + "learning_rate": 0.0002600470588235294, + "loss": 0.2853, + "step": 1135 + }, + { + "epoch": 67.06, + "grad_norm": 0.44674333930015564, + "learning_rate": 0.0002598705882352941, + "loss": 0.2901, + "step": 1140 + }, + { + "epoch": 67.06, + "eval_loss": 0.2710944712162018, + "eval_runtime": 1.9529, + "eval_samples_per_second": 68.615, + "eval_steps_per_second": 8.705, + "step": 1140 + }, + { + "epoch": 67.35, + "grad_norm": 0.5090762376785278, + "learning_rate": 0.0002596941176470588, + "loss": 0.2895, + "step": 1145 + }, + { + "epoch": 67.65, + "grad_norm": 0.4323594570159912, + "learning_rate": 0.00025951764705882353, + "loss": 0.2794, + "step": 1150 + }, + { + "epoch": 67.94, + "grad_norm": 0.4847664535045624, + "learning_rate": 0.0002593411764705882, + "loss": 0.2814, + "step": 1155 + }, + { + "epoch": 68.24, + "grad_norm": 0.44665852189064026, + "learning_rate": 0.0002591647058823529, + "loss": 0.2803, + "step": 1160 + }, + { + "epoch": 68.24, + "eval_loss": 0.27070555090904236, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.683, + "eval_steps_per_second": 8.713, + "step": 1160 + }, + { + "epoch": 68.53, + "grad_norm": 0.36899805068969727, + "learning_rate": 0.00025898823529411767, + "loss": 0.2641, + "step": 1165 + }, + { + "epoch": 68.82, + "grad_norm": 0.4446316361427307, + "learning_rate": 0.00025881176470588234, + "loss": 0.3071, + "step": 1170 + }, + { + "epoch": 69.12, + "grad_norm": 0.4194595217704773, + "learning_rate": 0.000258635294117647, + "loss": 0.2758, + "step": 1175 + }, + { + "epoch": 69.41, + "grad_norm": 0.49983254075050354, + "learning_rate": 0.00025845882352941175, + "loss": 0.2608, + "step": 1180 + }, + { + "epoch": 69.41, + "eval_loss": 0.2714843451976776, + "eval_runtime": 1.9522, + "eval_samples_per_second": 68.642, + "eval_steps_per_second": 8.708, + "step": 1180 + }, + { + "epoch": 69.71, + "grad_norm": 0.4583798944950104, + "learning_rate": 0.0002582823529411764, + "loss": 0.2905, + "step": 1185 + }, + { + "epoch": 70.0, + "grad_norm": 0.7360429763793945, + "learning_rate": 0.00025810588235294116, + "loss": 0.3118, + "step": 1190 + }, + { + "epoch": 70.29, + "grad_norm": 0.3636053204536438, + "learning_rate": 0.0002579294117647059, + "loss": 0.2461, + "step": 1195 + }, + { + "epoch": 70.59, + "grad_norm": 0.3528292179107666, + "learning_rate": 0.00025775294117647056, + "loss": 0.2954, + "step": 1200 + }, + { + "epoch": 70.59, + "eval_loss": 0.2728196382522583, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.708, + "step": 1200 + }, + { + "epoch": 70.88, + "grad_norm": 0.6297979950904846, + "learning_rate": 0.0002575764705882353, + "loss": 0.3027, + "step": 1205 + }, + { + "epoch": 71.18, + "grad_norm": 0.3765307664871216, + "learning_rate": 0.00025739999999999997, + "loss": 0.2796, + "step": 1210 + }, + { + "epoch": 71.47, + "grad_norm": 0.40963253378868103, + "learning_rate": 0.0002572235294117647, + "loss": 0.2703, + "step": 1215 + }, + { + "epoch": 71.76, + "grad_norm": 0.34185245633125305, + "learning_rate": 0.0002570470588235294, + "loss": 0.2838, + "step": 1220 + }, + { + "epoch": 71.76, + "eval_loss": 0.27210521697998047, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.623, + "eval_steps_per_second": 8.706, + "step": 1220 + }, + { + "epoch": 72.06, + "grad_norm": 0.3344729542732239, + "learning_rate": 0.0002568705882352941, + "loss": 0.292, + "step": 1225 + }, + { + "epoch": 72.35, + "grad_norm": 0.388359010219574, + "learning_rate": 0.0002566941176470588, + "loss": 0.2749, + "step": 1230 + }, + { + "epoch": 72.65, + "grad_norm": 0.47340095043182373, + "learning_rate": 0.0002565176470588235, + "loss": 0.2818, + "step": 1235 + }, + { + "epoch": 72.94, + "grad_norm": 0.5147826075553894, + "learning_rate": 0.0002563411764705882, + "loss": 0.3041, + "step": 1240 + }, + { + "epoch": 72.94, + "eval_loss": 0.2707369327545166, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.67, + "eval_steps_per_second": 8.712, + "step": 1240 + }, + { + "epoch": 73.24, + "grad_norm": 0.44949597120285034, + "learning_rate": 0.0002561647058823529, + "loss": 0.2687, + "step": 1245 + }, + { + "epoch": 73.53, + "grad_norm": 0.35800760984420776, + "learning_rate": 0.00025598823529411765, + "loss": 0.268, + "step": 1250 + }, + { + "epoch": 73.82, + "grad_norm": 0.3977588713169098, + "learning_rate": 0.0002558117647058823, + "loss": 0.2863, + "step": 1255 + }, + { + "epoch": 74.12, + "grad_norm": 0.4873248338699341, + "learning_rate": 0.00025563529411764705, + "loss": 0.3077, + "step": 1260 + }, + { + "epoch": 74.12, + "eval_loss": 0.26996681094169617, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.656, + "eval_steps_per_second": 8.71, + "step": 1260 + }, + { + "epoch": 74.41, + "grad_norm": 0.3870217800140381, + "learning_rate": 0.00025545882352941173, + "loss": 0.2779, + "step": 1265 + }, + { + "epoch": 74.71, + "grad_norm": 0.4877471923828125, + "learning_rate": 0.00025528235294117646, + "loss": 0.2669, + "step": 1270 + }, + { + "epoch": 75.0, + "grad_norm": 0.3868286907672882, + "learning_rate": 0.00025510588235294114, + "loss": 0.2941, + "step": 1275 + }, + { + "epoch": 75.29, + "grad_norm": 0.44130975008010864, + "learning_rate": 0.00025492941176470587, + "loss": 0.2623, + "step": 1280 + }, + { + "epoch": 75.29, + "eval_loss": 0.27081385254859924, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.655, + "eval_steps_per_second": 8.71, + "step": 1280 + }, + { + "epoch": 75.59, + "grad_norm": 0.3881152868270874, + "learning_rate": 0.0002547529411764706, + "loss": 0.296, + "step": 1285 + }, + { + "epoch": 75.88, + "grad_norm": 0.469176322221756, + "learning_rate": 0.0002545764705882353, + "loss": 0.2955, + "step": 1290 + }, + { + "epoch": 76.18, + "grad_norm": 0.4273076355457306, + "learning_rate": 0.00025439999999999995, + "loss": 0.2648, + "step": 1295 + }, + { + "epoch": 76.47, + "grad_norm": 0.3832271099090576, + "learning_rate": 0.0002542235294117647, + "loss": 0.2613, + "step": 1300 + }, + { + "epoch": 76.47, + "eval_loss": 0.2735311686992645, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.658, + "eval_steps_per_second": 8.71, + "step": 1300 + }, + { + "epoch": 76.76, + "grad_norm": 0.39410656690597534, + "learning_rate": 0.0002540470588235294, + "loss": 0.2769, + "step": 1305 + }, + { + "epoch": 77.06, + "grad_norm": 0.3950161635875702, + "learning_rate": 0.0002538705882352941, + "loss": 0.3176, + "step": 1310 + }, + { + "epoch": 77.35, + "grad_norm": 0.42553427815437317, + "learning_rate": 0.0002536941176470588, + "loss": 0.2788, + "step": 1315 + }, + { + "epoch": 77.65, + "grad_norm": 0.35006698966026306, + "learning_rate": 0.0002535176470588235, + "loss": 0.2808, + "step": 1320 + }, + { + "epoch": 77.65, + "eval_loss": 0.27642661333084106, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.663, + "eval_steps_per_second": 8.711, + "step": 1320 + }, + { + "epoch": 77.94, + "grad_norm": 0.5055559277534485, + "learning_rate": 0.0002533411764705882, + "loss": 0.3024, + "step": 1325 + }, + { + "epoch": 78.24, + "grad_norm": 0.28948742151260376, + "learning_rate": 0.0002531647058823529, + "loss": 0.2621, + "step": 1330 + }, + { + "epoch": 78.53, + "grad_norm": 0.3765624761581421, + "learning_rate": 0.00025298823529411763, + "loss": 0.2893, + "step": 1335 + }, + { + "epoch": 78.82, + "grad_norm": 0.5073124170303345, + "learning_rate": 0.00025281176470588236, + "loss": 0.2765, + "step": 1340 + }, + { + "epoch": 78.82, + "eval_loss": 0.2720426619052887, + "eval_runtime": 1.9534, + "eval_samples_per_second": 68.6, + "eval_steps_per_second": 8.703, + "step": 1340 + }, + { + "epoch": 79.12, + "grad_norm": 0.4559653103351593, + "learning_rate": 0.00025263529411764704, + "loss": 0.311, + "step": 1345 + }, + { + "epoch": 79.41, + "grad_norm": 0.37519559264183044, + "learning_rate": 0.0002524588235294117, + "loss": 0.2909, + "step": 1350 + }, + { + "epoch": 79.71, + "grad_norm": 0.459194540977478, + "learning_rate": 0.00025228235294117644, + "loss": 0.2758, + "step": 1355 + }, + { + "epoch": 80.0, + "grad_norm": 0.5487371683120728, + "learning_rate": 0.00025210588235294117, + "loss": 0.286, + "step": 1360 + }, + { + "epoch": 80.0, + "eval_loss": 0.2706363797187805, + "eval_runtime": 1.9542, + "eval_samples_per_second": 68.571, + "eval_steps_per_second": 8.699, + "step": 1360 + }, + { + "epoch": 80.29, + "grad_norm": 0.37640294432640076, + "learning_rate": 0.00025192941176470585, + "loss": 0.2791, + "step": 1365 + }, + { + "epoch": 80.59, + "grad_norm": 0.4144321382045746, + "learning_rate": 0.0002517529411764706, + "loss": 0.2762, + "step": 1370 + }, + { + "epoch": 80.88, + "grad_norm": 0.41748863458633423, + "learning_rate": 0.00025157647058823525, + "loss": 0.2978, + "step": 1375 + }, + { + "epoch": 81.18, + "grad_norm": 0.5341853499412537, + "learning_rate": 0.0002514, + "loss": 0.2869, + "step": 1380 + }, + { + "epoch": 81.18, + "eval_loss": 0.27049124240875244, + "eval_runtime": 1.9602, + "eval_samples_per_second": 68.36, + "eval_steps_per_second": 8.673, + "step": 1380 + }, + { + "epoch": 81.47, + "grad_norm": 0.4712367057800293, + "learning_rate": 0.00025122352941176466, + "loss": 0.279, + "step": 1385 + }, + { + "epoch": 81.76, + "grad_norm": 0.41937902569770813, + "learning_rate": 0.0002510470588235294, + "loss": 0.2888, + "step": 1390 + }, + { + "epoch": 82.06, + "grad_norm": 0.46588072180747986, + "learning_rate": 0.0002508705882352941, + "loss": 0.2802, + "step": 1395 + }, + { + "epoch": 82.35, + "grad_norm": 0.43458470702171326, + "learning_rate": 0.0002506941176470588, + "loss": 0.2696, + "step": 1400 + }, + { + "epoch": 82.35, + "eval_loss": 0.27054351568222046, + "eval_runtime": 1.9531, + "eval_samples_per_second": 68.61, + "eval_steps_per_second": 8.704, + "step": 1400 + }, + { + "epoch": 82.65, + "grad_norm": 0.3337225914001465, + "learning_rate": 0.0002505176470588235, + "loss": 0.2485, + "step": 1405 + }, + { + "epoch": 82.94, + "grad_norm": 0.5289632678031921, + "learning_rate": 0.0002503411764705882, + "loss": 0.307, + "step": 1410 + }, + { + "epoch": 83.24, + "grad_norm": 0.40830889344215393, + "learning_rate": 0.00025016470588235293, + "loss": 0.2737, + "step": 1415 + }, + { + "epoch": 83.53, + "grad_norm": 0.5299372673034668, + "learning_rate": 0.0002499882352941176, + "loss": 0.2853, + "step": 1420 + }, + { + "epoch": 83.53, + "eval_loss": 0.27093368768692017, + "eval_runtime": 1.9531, + "eval_samples_per_second": 68.609, + "eval_steps_per_second": 8.704, + "step": 1420 + }, + { + "epoch": 83.82, + "grad_norm": 0.5337343215942383, + "learning_rate": 0.00024981176470588234, + "loss": 0.2954, + "step": 1425 + }, + { + "epoch": 84.12, + "grad_norm": 0.32468709349632263, + "learning_rate": 0.000249635294117647, + "loss": 0.2824, + "step": 1430 + }, + { + "epoch": 84.41, + "grad_norm": 0.41730302572250366, + "learning_rate": 0.00024945882352941175, + "loss": 0.2635, + "step": 1435 + }, + { + "epoch": 84.71, + "grad_norm": 0.5523222088813782, + "learning_rate": 0.0002492823529411764, + "loss": 0.3033, + "step": 1440 + }, + { + "epoch": 84.71, + "eval_loss": 0.27260833978652954, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.63, + "eval_steps_per_second": 8.707, + "step": 1440 + }, + { + "epoch": 85.0, + "grad_norm": 0.9240580201148987, + "learning_rate": 0.00024910588235294115, + "loss": 0.3099, + "step": 1445 + }, + { + "epoch": 85.29, + "grad_norm": 0.4818173050880432, + "learning_rate": 0.0002489294117647059, + "loss": 0.2802, + "step": 1450 + }, + { + "epoch": 85.59, + "grad_norm": 0.43186154961586, + "learning_rate": 0.00024875294117647056, + "loss": 0.2643, + "step": 1455 + }, + { + "epoch": 85.88, + "grad_norm": 0.5872223377227783, + "learning_rate": 0.0002485764705882353, + "loss": 0.2977, + "step": 1460 + }, + { + "epoch": 85.88, + "eval_loss": 0.2729244828224182, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.629, + "eval_steps_per_second": 8.707, + "step": 1460 + }, + { + "epoch": 86.18, + "grad_norm": 0.38999661803245544, + "learning_rate": 0.00024839999999999997, + "loss": 0.2733, + "step": 1465 + }, + { + "epoch": 86.47, + "grad_norm": 0.43925678730010986, + "learning_rate": 0.0002482235294117647, + "loss": 0.2864, + "step": 1470 + }, + { + "epoch": 86.76, + "grad_norm": 0.4166542589664459, + "learning_rate": 0.00024804705882352937, + "loss": 0.2842, + "step": 1475 + }, + { + "epoch": 87.06, + "grad_norm": 0.388384073972702, + "learning_rate": 0.0002478705882352941, + "loss": 0.297, + "step": 1480 + }, + { + "epoch": 87.06, + "eval_loss": 0.27032384276390076, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.652, + "eval_steps_per_second": 8.71, + "step": 1480 + }, + { + "epoch": 87.35, + "grad_norm": 0.39126166701316833, + "learning_rate": 0.0002476941176470588, + "loss": 0.2748, + "step": 1485 + }, + { + "epoch": 87.65, + "grad_norm": 0.3944039046764374, + "learning_rate": 0.0002475176470588235, + "loss": 0.2802, + "step": 1490 + }, + { + "epoch": 87.94, + "grad_norm": 0.48129957914352417, + "learning_rate": 0.0002473411764705882, + "loss": 0.2822, + "step": 1495 + }, + { + "epoch": 88.24, + "grad_norm": 0.3660375475883484, + "learning_rate": 0.0002471647058823529, + "loss": 0.2858, + "step": 1500 + }, + { + "epoch": 88.24, + "eval_loss": 0.27084389328956604, + "eval_runtime": 1.9535, + "eval_samples_per_second": 68.596, + "eval_steps_per_second": 8.703, + "step": 1500 + }, + { + "epoch": 88.53, + "grad_norm": 0.529747486114502, + "learning_rate": 0.00024698823529411765, + "loss": 0.2895, + "step": 1505 + }, + { + "epoch": 88.82, + "grad_norm": 0.40685632824897766, + "learning_rate": 0.0002468117647058823, + "loss": 0.287, + "step": 1510 + }, + { + "epoch": 89.12, + "grad_norm": 0.3383892774581909, + "learning_rate": 0.00024663529411764705, + "loss": 0.2771, + "step": 1515 + }, + { + "epoch": 89.41, + "grad_norm": 0.37899449467658997, + "learning_rate": 0.00024645882352941173, + "loss": 0.2677, + "step": 1520 + }, + { + "epoch": 89.41, + "eval_loss": 0.2708883583545685, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.614, + "eval_steps_per_second": 8.705, + "step": 1520 + }, + { + "epoch": 89.71, + "grad_norm": 0.35947713255882263, + "learning_rate": 0.00024628235294117646, + "loss": 0.276, + "step": 1525 + }, + { + "epoch": 90.0, + "grad_norm": 0.5110882520675659, + "learning_rate": 0.0002461058823529412, + "loss": 0.3072, + "step": 1530 + }, + { + "epoch": 90.29, + "grad_norm": 0.4167642891407013, + "learning_rate": 0.00024592941176470587, + "loss": 0.269, + "step": 1535 + }, + { + "epoch": 90.59, + "grad_norm": 0.4180889129638672, + "learning_rate": 0.00024575294117647054, + "loss": 0.2807, + "step": 1540 + }, + { + "epoch": 90.59, + "eval_loss": 0.27177131175994873, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.633, + "eval_steps_per_second": 8.707, + "step": 1540 + }, + { + "epoch": 90.88, + "grad_norm": 0.45757177472114563, + "learning_rate": 0.00024557647058823527, + "loss": 0.2879, + "step": 1545 + }, + { + "epoch": 91.18, + "grad_norm": 0.3319392204284668, + "learning_rate": 0.00024539999999999995, + "loss": 0.2816, + "step": 1550 + }, + { + "epoch": 91.47, + "grad_norm": 0.36250099539756775, + "learning_rate": 0.0002452235294117647, + "loss": 0.2653, + "step": 1555 + }, + { + "epoch": 91.76, + "grad_norm": 0.3545467257499695, + "learning_rate": 0.0002450470588235294, + "loss": 0.291, + "step": 1560 + }, + { + "epoch": 91.76, + "eval_loss": 0.2722358703613281, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.644, + "eval_steps_per_second": 8.709, + "step": 1560 + }, + { + "epoch": 92.06, + "grad_norm": 0.40520554780960083, + "learning_rate": 0.0002448705882352941, + "loss": 0.2927, + "step": 1565 + }, + { + "epoch": 92.35, + "grad_norm": 0.46311572194099426, + "learning_rate": 0.0002446941176470588, + "loss": 0.2742, + "step": 1570 + }, + { + "epoch": 92.65, + "grad_norm": 0.4097675085067749, + "learning_rate": 0.0002445176470588235, + "loss": 0.2743, + "step": 1575 + }, + { + "epoch": 92.94, + "grad_norm": 0.41074514389038086, + "learning_rate": 0.0002443411764705882, + "loss": 0.2995, + "step": 1580 + }, + { + "epoch": 92.94, + "eval_loss": 0.26951560378074646, + "eval_runtime": 1.9546, + "eval_samples_per_second": 68.555, + "eval_steps_per_second": 8.697, + "step": 1580 + }, + { + "epoch": 93.24, + "grad_norm": 0.4202812910079956, + "learning_rate": 0.00024416470588235295, + "loss": 0.2812, + "step": 1585 + }, + { + "epoch": 93.53, + "grad_norm": 0.5208330750465393, + "learning_rate": 0.00024398823529411763, + "loss": 0.2769, + "step": 1590 + }, + { + "epoch": 93.82, + "grad_norm": 0.3760271370410919, + "learning_rate": 0.00024381176470588233, + "loss": 0.2906, + "step": 1595 + }, + { + "epoch": 94.12, + "grad_norm": 0.40006768703460693, + "learning_rate": 0.00024363529411764703, + "loss": 0.279, + "step": 1600 + }, + { + "epoch": 94.12, + "eval_loss": 0.2711445391178131, + "eval_runtime": 1.9533, + "eval_samples_per_second": 68.601, + "eval_steps_per_second": 8.703, + "step": 1600 + }, + { + "epoch": 94.41, + "grad_norm": 0.6618837714195251, + "learning_rate": 0.00024345882352941174, + "loss": 0.2828, + "step": 1605 + }, + { + "epoch": 94.71, + "grad_norm": 0.37664109468460083, + "learning_rate": 0.00024328235294117647, + "loss": 0.2793, + "step": 1610 + }, + { + "epoch": 95.0, + "grad_norm": 0.4667171537876129, + "learning_rate": 0.00024310588235294117, + "loss": 0.2941, + "step": 1615 + }, + { + "epoch": 95.29, + "grad_norm": 0.4416274428367615, + "learning_rate": 0.00024292941176470587, + "loss": 0.2684, + "step": 1620 + }, + { + "epoch": 95.29, + "eval_loss": 0.270527184009552, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.622, + "eval_steps_per_second": 8.706, + "step": 1620 + }, + { + "epoch": 95.59, + "grad_norm": 0.3745880126953125, + "learning_rate": 0.00024275294117647055, + "loss": 0.2676, + "step": 1625 + }, + { + "epoch": 95.88, + "grad_norm": 0.4286365509033203, + "learning_rate": 0.00024257647058823525, + "loss": 0.3004, + "step": 1630 + }, + { + "epoch": 96.18, + "grad_norm": 0.4646395742893219, + "learning_rate": 0.00024239999999999998, + "loss": 0.2841, + "step": 1635 + }, + { + "epoch": 96.47, + "grad_norm": 0.41549715399742126, + "learning_rate": 0.00024222352941176469, + "loss": 0.2645, + "step": 1640 + }, + { + "epoch": 96.47, + "eval_loss": 0.2707136869430542, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.651, + "eval_steps_per_second": 8.709, + "step": 1640 + }, + { + "epoch": 96.76, + "grad_norm": 0.5121225714683533, + "learning_rate": 0.0002420470588235294, + "loss": 0.3055, + "step": 1645 + }, + { + "epoch": 97.06, + "grad_norm": 0.3128992021083832, + "learning_rate": 0.0002418705882352941, + "loss": 0.2724, + "step": 1650 + }, + { + "epoch": 97.35, + "grad_norm": 0.3368048071861267, + "learning_rate": 0.0002416941176470588, + "loss": 0.2943, + "step": 1655 + }, + { + "epoch": 97.65, + "grad_norm": 0.38309842348098755, + "learning_rate": 0.0002415176470588235, + "loss": 0.2732, + "step": 1660 + }, + { + "epoch": 97.65, + "eval_loss": 0.2713993191719055, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.62, + "eval_steps_per_second": 8.706, + "step": 1660 + }, + { + "epoch": 97.94, + "grad_norm": 0.425066202878952, + "learning_rate": 0.00024134117647058823, + "loss": 0.2928, + "step": 1665 + }, + { + "epoch": 98.24, + "grad_norm": 0.4347911775112152, + "learning_rate": 0.00024116470588235293, + "loss": 0.2739, + "step": 1670 + }, + { + "epoch": 98.53, + "grad_norm": 0.44052696228027344, + "learning_rate": 0.00024098823529411764, + "loss": 0.2749, + "step": 1675 + }, + { + "epoch": 98.82, + "grad_norm": 0.37960100173950195, + "learning_rate": 0.00024081176470588234, + "loss": 0.2723, + "step": 1680 + }, + { + "epoch": 98.82, + "eval_loss": 0.2713424563407898, + "eval_runtime": 1.9673, + "eval_samples_per_second": 68.115, + "eval_steps_per_second": 8.641, + "step": 1680 + }, + { + "epoch": 99.12, + "grad_norm": 0.43037664890289307, + "learning_rate": 0.00024063529411764701, + "loss": 0.2924, + "step": 1685 + }, + { + "epoch": 99.41, + "grad_norm": 0.480788916349411, + "learning_rate": 0.00024045882352941177, + "loss": 0.2871, + "step": 1690 + }, + { + "epoch": 99.71, + "grad_norm": 0.40092575550079346, + "learning_rate": 0.00024028235294117645, + "loss": 0.2731, + "step": 1695 + }, + { + "epoch": 100.0, + "grad_norm": 0.3944762647151947, + "learning_rate": 0.00024010588235294115, + "loss": 0.2754, + "step": 1700 + }, + { + "epoch": 100.0, + "eval_loss": 0.2695128917694092, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.621, + "eval_steps_per_second": 8.706, + "step": 1700 + }, + { + "epoch": 100.29, + "grad_norm": 0.3967213034629822, + "learning_rate": 0.00023992941176470585, + "loss": 0.2607, + "step": 1705 + }, + { + "epoch": 100.59, + "grad_norm": 0.3542157709598541, + "learning_rate": 0.00023975294117647056, + "loss": 0.2682, + "step": 1710 + }, + { + "epoch": 100.88, + "grad_norm": 0.549462616443634, + "learning_rate": 0.00023957647058823526, + "loss": 0.3053, + "step": 1715 + }, + { + "epoch": 101.18, + "grad_norm": 0.40259239077568054, + "learning_rate": 0.0002394, + "loss": 0.2813, + "step": 1720 + }, + { + "epoch": 101.18, + "eval_loss": 0.2688409388065338, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.683, + "eval_steps_per_second": 8.713, + "step": 1720 + }, + { + "epoch": 101.47, + "grad_norm": 0.4322134852409363, + "learning_rate": 0.0002392235294117647, + "loss": 0.2742, + "step": 1725 + }, + { + "epoch": 101.76, + "grad_norm": 0.37753593921661377, + "learning_rate": 0.0002390470588235294, + "loss": 0.2863, + "step": 1730 + }, + { + "epoch": 102.06, + "grad_norm": 0.39912375807762146, + "learning_rate": 0.0002388705882352941, + "loss": 0.2721, + "step": 1735 + }, + { + "epoch": 102.35, + "grad_norm": 0.4623911380767822, + "learning_rate": 0.0002386941176470588, + "loss": 0.2659, + "step": 1740 + }, + { + "epoch": 102.35, + "eval_loss": 0.27146780490875244, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.636, + "eval_steps_per_second": 8.708, + "step": 1740 + }, + { + "epoch": 102.65, + "grad_norm": 0.4268170893192291, + "learning_rate": 0.00023851764705882353, + "loss": 0.2794, + "step": 1745 + }, + { + "epoch": 102.94, + "grad_norm": 0.27750104665756226, + "learning_rate": 0.00023834117647058824, + "loss": 0.2881, + "step": 1750 + }, + { + "epoch": 103.24, + "grad_norm": 0.30894964933395386, + "learning_rate": 0.00023816470588235291, + "loss": 0.3009, + "step": 1755 + }, + { + "epoch": 103.53, + "grad_norm": 0.40685799717903137, + "learning_rate": 0.00023798823529411762, + "loss": 0.2796, + "step": 1760 + }, + { + "epoch": 103.53, + "eval_loss": 0.27360790967941284, + "eval_runtime": 1.964, + "eval_samples_per_second": 68.228, + "eval_steps_per_second": 8.656, + "step": 1760 + }, + { + "epoch": 103.82, + "grad_norm": 0.3822905421257019, + "learning_rate": 0.00023781176470588232, + "loss": 0.2789, + "step": 1765 + }, + { + "epoch": 104.12, + "grad_norm": 0.30802851915359497, + "learning_rate": 0.00023763529411764702, + "loss": 0.2913, + "step": 1770 + }, + { + "epoch": 104.41, + "grad_norm": 0.31410786509513855, + "learning_rate": 0.00023745882352941175, + "loss": 0.2692, + "step": 1775 + }, + { + "epoch": 104.71, + "grad_norm": 0.5326993465423584, + "learning_rate": 0.00023728235294117646, + "loss": 0.3052, + "step": 1780 + }, + { + "epoch": 104.71, + "eval_loss": 0.27087274193763733, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.637, + "eval_steps_per_second": 8.708, + "step": 1780 + }, + { + "epoch": 105.0, + "grad_norm": 0.3950839042663574, + "learning_rate": 0.00023710588235294116, + "loss": 0.2833, + "step": 1785 + }, + { + "epoch": 105.29, + "grad_norm": 0.3136695921421051, + "learning_rate": 0.00023692941176470586, + "loss": 0.2606, + "step": 1790 + }, + { + "epoch": 105.59, + "grad_norm": 0.34495916962623596, + "learning_rate": 0.00023675294117647057, + "loss": 0.3037, + "step": 1795 + }, + { + "epoch": 105.88, + "grad_norm": 0.3969443440437317, + "learning_rate": 0.0002365764705882353, + "loss": 0.2732, + "step": 1800 + }, + { + "epoch": 105.88, + "eval_loss": 0.2706298232078552, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.623, + "eval_steps_per_second": 8.706, + "step": 1800 + }, + { + "epoch": 106.18, + "grad_norm": 0.38792774081230164, + "learning_rate": 0.0002364, + "loss": 0.2885, + "step": 1805 + }, + { + "epoch": 106.47, + "grad_norm": 0.362547904253006, + "learning_rate": 0.00023622352941176468, + "loss": 0.2826, + "step": 1810 + }, + { + "epoch": 106.76, + "grad_norm": 0.4238462448120117, + "learning_rate": 0.00023604705882352938, + "loss": 0.2824, + "step": 1815 + }, + { + "epoch": 107.06, + "grad_norm": 0.5373244881629944, + "learning_rate": 0.00023587058823529408, + "loss": 0.2905, + "step": 1820 + }, + { + "epoch": 107.06, + "eval_loss": 0.2691216468811035, + "eval_runtime": 1.9541, + "eval_samples_per_second": 68.573, + "eval_steps_per_second": 8.7, + "step": 1820 + }, + { + "epoch": 107.35, + "grad_norm": 0.4316276013851166, + "learning_rate": 0.00023569411764705879, + "loss": 0.2692, + "step": 1825 + }, + { + "epoch": 107.65, + "grad_norm": 0.4820720851421356, + "learning_rate": 0.00023551764705882352, + "loss": 0.2795, + "step": 1830 + }, + { + "epoch": 107.94, + "grad_norm": 0.42356351017951965, + "learning_rate": 0.00023534117647058822, + "loss": 0.2851, + "step": 1835 + }, + { + "epoch": 108.24, + "grad_norm": 0.4265184998512268, + "learning_rate": 0.00023516470588235292, + "loss": 0.2791, + "step": 1840 + }, + { + "epoch": 108.24, + "eval_loss": 0.2702525556087494, + "eval_runtime": 1.966, + "eval_samples_per_second": 68.159, + "eval_steps_per_second": 8.647, + "step": 1840 + }, + { + "epoch": 108.53, + "grad_norm": 0.3817729949951172, + "learning_rate": 0.00023498823529411763, + "loss": 0.2877, + "step": 1845 + }, + { + "epoch": 108.82, + "grad_norm": 0.408011257648468, + "learning_rate": 0.00023481176470588233, + "loss": 0.2701, + "step": 1850 + }, + { + "epoch": 109.12, + "grad_norm": 0.3448140323162079, + "learning_rate": 0.00023463529411764706, + "loss": 0.299, + "step": 1855 + }, + { + "epoch": 109.41, + "grad_norm": 0.30877605080604553, + "learning_rate": 0.00023445882352941176, + "loss": 0.2609, + "step": 1860 + }, + { + "epoch": 109.41, + "eval_loss": 0.2709581255912781, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.661, + "eval_steps_per_second": 8.711, + "step": 1860 + }, + { + "epoch": 109.71, + "grad_norm": 0.3939575254917145, + "learning_rate": 0.00023428235294117647, + "loss": 0.2915, + "step": 1865 + }, + { + "epoch": 110.0, + "grad_norm": 0.5049751400947571, + "learning_rate": 0.00023410588235294114, + "loss": 0.2853, + "step": 1870 + }, + { + "epoch": 110.29, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.00023392941176470584, + "loss": 0.2701, + "step": 1875 + }, + { + "epoch": 110.59, + "grad_norm": 0.3701951503753662, + "learning_rate": 0.00023375294117647055, + "loss": 0.2792, + "step": 1880 + }, + { + "epoch": 110.59, + "eval_loss": 0.2697772681713104, + "eval_runtime": 1.9535, + "eval_samples_per_second": 68.596, + "eval_steps_per_second": 8.703, + "step": 1880 + }, + { + "epoch": 110.88, + "grad_norm": 0.41404488682746887, + "learning_rate": 0.00023357647058823528, + "loss": 0.2714, + "step": 1885 + }, + { + "epoch": 111.18, + "grad_norm": 0.399906188249588, + "learning_rate": 0.00023339999999999998, + "loss": 0.2901, + "step": 1890 + }, + { + "epoch": 111.47, + "grad_norm": 0.41661447286605835, + "learning_rate": 0.00023322352941176468, + "loss": 0.2628, + "step": 1895 + }, + { + "epoch": 111.76, + "grad_norm": 0.34493348002433777, + "learning_rate": 0.0002330470588235294, + "loss": 0.2734, + "step": 1900 + }, + { + "epoch": 111.76, + "eval_loss": 0.27055203914642334, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.708, + "step": 1900 + }, + { + "epoch": 112.06, + "grad_norm": 0.37161219120025635, + "learning_rate": 0.0002328705882352941, + "loss": 0.3029, + "step": 1905 + }, + { + "epoch": 112.35, + "grad_norm": 0.3913039267063141, + "learning_rate": 0.00023269411764705882, + "loss": 0.2809, + "step": 1910 + }, + { + "epoch": 112.65, + "grad_norm": 0.4279155135154724, + "learning_rate": 0.00023251764705882352, + "loss": 0.2721, + "step": 1915 + }, + { + "epoch": 112.94, + "grad_norm": 0.3168346881866455, + "learning_rate": 0.00023234117647058823, + "loss": 0.2743, + "step": 1920 + }, + { + "epoch": 112.94, + "eval_loss": 0.26932182908058167, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.685, + "eval_steps_per_second": 8.714, + "step": 1920 + }, + { + "epoch": 113.24, + "grad_norm": 0.386318564414978, + "learning_rate": 0.00023216470588235293, + "loss": 0.2788, + "step": 1925 + }, + { + "epoch": 113.53, + "grad_norm": 0.3696093261241913, + "learning_rate": 0.0002319882352941176, + "loss": 0.2705, + "step": 1930 + }, + { + "epoch": 113.82, + "grad_norm": 0.3300880491733551, + "learning_rate": 0.0002318117647058823, + "loss": 0.2763, + "step": 1935 + }, + { + "epoch": 114.12, + "grad_norm": 0.4725533425807953, + "learning_rate": 0.00023163529411764704, + "loss": 0.2982, + "step": 1940 + }, + { + "epoch": 114.12, + "eval_loss": 0.2685394883155823, + "eval_runtime": 1.9512, + "eval_samples_per_second": 68.677, + "eval_steps_per_second": 8.713, + "step": 1940 + }, + { + "epoch": 114.41, + "grad_norm": 0.516589343547821, + "learning_rate": 0.00023145882352941174, + "loss": 0.269, + "step": 1945 + }, + { + "epoch": 114.71, + "grad_norm": 0.4309942424297333, + "learning_rate": 0.00023128235294117645, + "loss": 0.2826, + "step": 1950 + }, + { + "epoch": 115.0, + "grad_norm": 0.49151453375816345, + "learning_rate": 0.00023110588235294115, + "loss": 0.2838, + "step": 1955 + }, + { + "epoch": 115.29, + "grad_norm": 0.35360270738601685, + "learning_rate": 0.00023092941176470585, + "loss": 0.268, + "step": 1960 + }, + { + "epoch": 115.29, + "eval_loss": 0.269430935382843, + "eval_runtime": 1.9502, + "eval_samples_per_second": 68.711, + "eval_steps_per_second": 8.717, + "step": 1960 + }, + { + "epoch": 115.59, + "grad_norm": 0.4440082013607025, + "learning_rate": 0.00023075294117647056, + "loss": 0.2838, + "step": 1965 + }, + { + "epoch": 115.88, + "grad_norm": 0.4656224548816681, + "learning_rate": 0.00023057647058823529, + "loss": 0.2857, + "step": 1970 + }, + { + "epoch": 116.18, + "grad_norm": 0.40396907925605774, + "learning_rate": 0.0002304, + "loss": 0.2667, + "step": 1975 + }, + { + "epoch": 116.47, + "grad_norm": 0.36848071217536926, + "learning_rate": 0.0002302235294117647, + "loss": 0.2797, + "step": 1980 + }, + { + "epoch": 116.47, + "eval_loss": 0.27135777473449707, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 8.708, + "step": 1980 + }, + { + "epoch": 116.76, + "grad_norm": 0.4206145107746124, + "learning_rate": 0.0002300470588235294, + "loss": 0.2828, + "step": 1985 + }, + { + "epoch": 117.06, + "grad_norm": 0.45262378454208374, + "learning_rate": 0.00022987058823529407, + "loss": 0.2993, + "step": 1990 + }, + { + "epoch": 117.35, + "grad_norm": 0.38523104786872864, + "learning_rate": 0.00022969411764705883, + "loss": 0.2588, + "step": 1995 + }, + { + "epoch": 117.65, + "grad_norm": 0.36803358793258667, + "learning_rate": 0.0002295176470588235, + "loss": 0.2814, + "step": 2000 + }, + { + "epoch": 117.65, + "eval_loss": 0.2702518701553345, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.653, + "eval_steps_per_second": 8.71, + "step": 2000 + }, + { + "epoch": 117.94, + "grad_norm": 0.37821269035339355, + "learning_rate": 0.0002293411764705882, + "loss": 0.2943, + "step": 2005 + }, + { + "epoch": 118.24, + "grad_norm": 0.28126540780067444, + "learning_rate": 0.0002291647058823529, + "loss": 0.2698, + "step": 2010 + }, + { + "epoch": 118.53, + "grad_norm": 0.37001892924308777, + "learning_rate": 0.00022898823529411762, + "loss": 0.2699, + "step": 2015 + }, + { + "epoch": 118.82, + "grad_norm": 0.41379040479660034, + "learning_rate": 0.00022881176470588232, + "loss": 0.3199, + "step": 2020 + }, + { + "epoch": 118.82, + "eval_loss": 0.27001115679740906, + "eval_runtime": 1.9532, + "eval_samples_per_second": 68.607, + "eval_steps_per_second": 8.704, + "step": 2020 + }, + { + "epoch": 119.12, + "grad_norm": 0.3919861316680908, + "learning_rate": 0.00022863529411764705, + "loss": 0.2578, + "step": 2025 + }, + { + "epoch": 119.41, + "grad_norm": 0.36724957823753357, + "learning_rate": 0.00022845882352941175, + "loss": 0.2719, + "step": 2030 + }, + { + "epoch": 119.71, + "grad_norm": 0.4745596945285797, + "learning_rate": 0.00022828235294117645, + "loss": 0.2867, + "step": 2035 + }, + { + "epoch": 120.0, + "grad_norm": 0.5758944153785706, + "learning_rate": 0.00022810588235294116, + "loss": 0.3055, + "step": 2040 + }, + { + "epoch": 120.0, + "eval_loss": 0.2696186602115631, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.619, + "eval_steps_per_second": 8.705, + "step": 2040 + }, + { + "epoch": 120.29, + "grad_norm": 0.39728063344955444, + "learning_rate": 0.00022792941176470586, + "loss": 0.279, + "step": 2045 + }, + { + "epoch": 120.59, + "grad_norm": 0.38617196679115295, + "learning_rate": 0.0002277529411764706, + "loss": 0.2718, + "step": 2050 + }, + { + "epoch": 120.88, + "grad_norm": 0.34619051218032837, + "learning_rate": 0.0002275764705882353, + "loss": 0.2858, + "step": 2055 + }, + { + "epoch": 121.18, + "grad_norm": 0.4280933141708374, + "learning_rate": 0.00022739999999999997, + "loss": 0.2895, + "step": 2060 + }, + { + "epoch": 121.18, + "eval_loss": 0.26897379755973816, + "eval_runtime": 1.9546, + "eval_samples_per_second": 68.557, + "eval_steps_per_second": 8.698, + "step": 2060 + }, + { + "epoch": 121.47, + "grad_norm": 0.340308278799057, + "learning_rate": 0.00022722352941176467, + "loss": 0.2692, + "step": 2065 + }, + { + "epoch": 121.76, + "grad_norm": 0.5012789368629456, + "learning_rate": 0.00022704705882352938, + "loss": 0.2955, + "step": 2070 + }, + { + "epoch": 122.06, + "grad_norm": 0.3827960789203644, + "learning_rate": 0.00022687058823529408, + "loss": 0.259, + "step": 2075 + }, + { + "epoch": 122.35, + "grad_norm": 0.48568883538246155, + "learning_rate": 0.0002266941176470588, + "loss": 0.2806, + "step": 2080 + }, + { + "epoch": 122.35, + "eval_loss": 0.2693471908569336, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.645, + "eval_steps_per_second": 8.709, + "step": 2080 + }, + { + "epoch": 122.65, + "grad_norm": 0.4443628191947937, + "learning_rate": 0.00022651764705882351, + "loss": 0.2939, + "step": 2085 + }, + { + "epoch": 122.94, + "grad_norm": 0.3295034170150757, + "learning_rate": 0.00022634117647058822, + "loss": 0.2716, + "step": 2090 + }, + { + "epoch": 123.24, + "grad_norm": 0.28784361481666565, + "learning_rate": 0.00022616470588235292, + "loss": 0.2647, + "step": 2095 + }, + { + "epoch": 123.53, + "grad_norm": 0.45908254384994507, + "learning_rate": 0.00022598823529411762, + "loss": 0.2649, + "step": 2100 + }, + { + "epoch": 123.53, + "eval_loss": 0.2702420651912689, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.619, + "eval_steps_per_second": 8.705, + "step": 2100 + }, + { + "epoch": 123.82, + "grad_norm": 0.47809335589408875, + "learning_rate": 0.00022581176470588235, + "loss": 0.2949, + "step": 2105 + }, + { + "epoch": 124.12, + "grad_norm": 0.4468798041343689, + "learning_rate": 0.00022563529411764706, + "loss": 0.2947, + "step": 2110 + }, + { + "epoch": 124.41, + "grad_norm": 0.3335013687610626, + "learning_rate": 0.00022545882352941176, + "loss": 0.273, + "step": 2115 + }, + { + "epoch": 124.71, + "grad_norm": 0.3635866343975067, + "learning_rate": 0.00022528235294117644, + "loss": 0.2712, + "step": 2120 + }, + { + "epoch": 124.71, + "eval_loss": 0.27148982882499695, + "eval_runtime": 1.956, + "eval_samples_per_second": 68.507, + "eval_steps_per_second": 8.691, + "step": 2120 + }, + { + "epoch": 125.0, + "grad_norm": 0.5919917225837708, + "learning_rate": 0.00022510588235294114, + "loss": 0.3035, + "step": 2125 + }, + { + "epoch": 125.29, + "grad_norm": 0.39383575320243835, + "learning_rate": 0.00022492941176470584, + "loss": 0.2678, + "step": 2130 + }, + { + "epoch": 125.59, + "grad_norm": 0.33777979016304016, + "learning_rate": 0.00022475294117647057, + "loss": 0.2583, + "step": 2135 + }, + { + "epoch": 125.88, + "grad_norm": 0.38143444061279297, + "learning_rate": 0.00022457647058823528, + "loss": 0.3101, + "step": 2140 + }, + { + "epoch": 125.88, + "eval_loss": 0.2697526216506958, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 8.708, + "step": 2140 + }, + { + "epoch": 126.18, + "grad_norm": 0.4453074038028717, + "learning_rate": 0.00022439999999999998, + "loss": 0.2883, + "step": 2145 + }, + { + "epoch": 126.47, + "grad_norm": 0.39076629281044006, + "learning_rate": 0.00022422352941176468, + "loss": 0.2722, + "step": 2150 + }, + { + "epoch": 126.76, + "grad_norm": 0.3728918135166168, + "learning_rate": 0.00022404705882352939, + "loss": 0.2575, + "step": 2155 + }, + { + "epoch": 127.06, + "grad_norm": 0.4023815393447876, + "learning_rate": 0.00022387058823529412, + "loss": 0.2933, + "step": 2160 + }, + { + "epoch": 127.06, + "eval_loss": 0.26871320605278015, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.624, + "eval_steps_per_second": 8.706, + "step": 2160 + }, + { + "epoch": 127.35, + "grad_norm": 0.5080409646034241, + "learning_rate": 0.00022369411764705882, + "loss": 0.2857, + "step": 2165 + }, + { + "epoch": 127.65, + "grad_norm": 0.5384616255760193, + "learning_rate": 0.00022351764705882352, + "loss": 0.2818, + "step": 2170 + }, + { + "epoch": 127.94, + "grad_norm": 0.34214597940444946, + "learning_rate": 0.00022334117647058823, + "loss": 0.2887, + "step": 2175 + }, + { + "epoch": 128.24, + "grad_norm": 0.34358489513397217, + "learning_rate": 0.0002231647058823529, + "loss": 0.2432, + "step": 2180 + }, + { + "epoch": 128.24, + "eval_loss": 0.27017608284950256, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.62, + "eval_steps_per_second": 8.705, + "step": 2180 + }, + { + "epoch": 128.53, + "grad_norm": 0.3332640826702118, + "learning_rate": 0.0002229882352941176, + "loss": 0.274, + "step": 2185 + }, + { + "epoch": 128.82, + "grad_norm": 0.3623349666595459, + "learning_rate": 0.00022281176470588234, + "loss": 0.3181, + "step": 2190 + }, + { + "epoch": 129.12, + "grad_norm": 0.46436458826065063, + "learning_rate": 0.00022263529411764704, + "loss": 0.2718, + "step": 2195 + }, + { + "epoch": 129.41, + "grad_norm": 0.29910969734191895, + "learning_rate": 0.00022245882352941174, + "loss": 0.254, + "step": 2200 + }, + { + "epoch": 129.41, + "eval_loss": 0.26953765749931335, + "eval_runtime": 1.9526, + "eval_samples_per_second": 68.627, + "eval_steps_per_second": 8.706, + "step": 2200 + }, + { + "epoch": 129.71, + "grad_norm": 0.4297507405281067, + "learning_rate": 0.00022228235294117644, + "loss": 0.3163, + "step": 2205 + }, + { + "epoch": 130.0, + "grad_norm": 0.42012396454811096, + "learning_rate": 0.00022210588235294115, + "loss": 0.2717, + "step": 2210 + }, + { + "epoch": 130.29, + "grad_norm": 0.35827213525772095, + "learning_rate": 0.00022192941176470588, + "loss": 0.2537, + "step": 2215 + }, + { + "epoch": 130.59, + "grad_norm": 0.4755214750766754, + "learning_rate": 0.00022175294117647058, + "loss": 0.2894, + "step": 2220 + }, + { + "epoch": 130.59, + "eval_loss": 0.26998716592788696, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.618, + "eval_steps_per_second": 8.705, + "step": 2220 + }, + { + "epoch": 130.88, + "grad_norm": 0.38798657059669495, + "learning_rate": 0.00022157647058823528, + "loss": 0.3001, + "step": 2225 + }, + { + "epoch": 131.18, + "grad_norm": 0.3155636787414551, + "learning_rate": 0.0002214, + "loss": 0.2552, + "step": 2230 + }, + { + "epoch": 131.47, + "grad_norm": 0.3551139831542969, + "learning_rate": 0.00022122352941176466, + "loss": 0.2811, + "step": 2235 + }, + { + "epoch": 131.76, + "grad_norm": 0.4360581934452057, + "learning_rate": 0.00022104705882352937, + "loss": 0.2917, + "step": 2240 + }, + { + "epoch": 131.76, + "eval_loss": 0.2703915238380432, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.68, + "eval_steps_per_second": 8.713, + "step": 2240 + }, + { + "epoch": 132.06, + "grad_norm": 0.4037160277366638, + "learning_rate": 0.0002208705882352941, + "loss": 0.2774, + "step": 2245 + }, + { + "epoch": 132.35, + "grad_norm": 0.37959492206573486, + "learning_rate": 0.0002206941176470588, + "loss": 0.2871, + "step": 2250 + }, + { + "epoch": 132.65, + "grad_norm": 0.32695892453193665, + "learning_rate": 0.0002205176470588235, + "loss": 0.258, + "step": 2255 + }, + { + "epoch": 132.94, + "grad_norm": 0.38112396001815796, + "learning_rate": 0.0002203411764705882, + "loss": 0.2876, + "step": 2260 + }, + { + "epoch": 132.94, + "eval_loss": 0.26909029483795166, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.681, + "eval_steps_per_second": 8.713, + "step": 2260 + }, + { + "epoch": 133.24, + "grad_norm": 0.404988557100296, + "learning_rate": 0.0002201647058823529, + "loss": 0.2638, + "step": 2265 + }, + { + "epoch": 133.53, + "grad_norm": 0.34814000129699707, + "learning_rate": 0.00021998823529411764, + "loss": 0.2765, + "step": 2270 + }, + { + "epoch": 133.82, + "grad_norm": 0.3351640999317169, + "learning_rate": 0.00021981176470588234, + "loss": 0.2758, + "step": 2275 + }, + { + "epoch": 134.12, + "grad_norm": 0.47089263796806335, + "learning_rate": 0.00021963529411764705, + "loss": 0.2956, + "step": 2280 + }, + { + "epoch": 134.12, + "eval_loss": 0.2686498761177063, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.666, + "eval_steps_per_second": 8.711, + "step": 2280 + }, + { + "epoch": 134.41, + "grad_norm": 0.42077234387397766, + "learning_rate": 0.00021945882352941175, + "loss": 0.2645, + "step": 2285 + }, + { + "epoch": 134.71, + "grad_norm": 0.38745513558387756, + "learning_rate": 0.00021928235294117645, + "loss": 0.27, + "step": 2290 + }, + { + "epoch": 135.0, + "grad_norm": 0.5919201970100403, + "learning_rate": 0.00021910588235294113, + "loss": 0.3024, + "step": 2295 + }, + { + "epoch": 135.29, + "grad_norm": 0.4451242685317993, + "learning_rate": 0.0002189294117647059, + "loss": 0.2829, + "step": 2300 + }, + { + "epoch": 135.29, + "eval_loss": 0.2689654529094696, + "eval_runtime": 1.9588, + "eval_samples_per_second": 68.409, + "eval_steps_per_second": 8.679, + "step": 2300 + }, + { + "epoch": 135.59, + "grad_norm": 0.3423871695995331, + "learning_rate": 0.00021875294117647056, + "loss": 0.2889, + "step": 2305 + }, + { + "epoch": 135.88, + "grad_norm": 0.3486490547657013, + "learning_rate": 0.00021857647058823527, + "loss": 0.2733, + "step": 2310 + }, + { + "epoch": 136.18, + "grad_norm": 0.3516980707645416, + "learning_rate": 0.00021839999999999997, + "loss": 0.273, + "step": 2315 + }, + { + "epoch": 136.47, + "grad_norm": 0.4885893166065216, + "learning_rate": 0.00021822352941176467, + "loss": 0.271, + "step": 2320 + }, + { + "epoch": 136.47, + "eval_loss": 0.2697407901287079, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.612, + "eval_steps_per_second": 8.704, + "step": 2320 + }, + { + "epoch": 136.76, + "grad_norm": 0.37122470140457153, + "learning_rate": 0.0002180470588235294, + "loss": 0.2881, + "step": 2325 + }, + { + "epoch": 137.06, + "grad_norm": 0.38461658358573914, + "learning_rate": 0.0002178705882352941, + "loss": 0.2744, + "step": 2330 + }, + { + "epoch": 137.35, + "grad_norm": 0.3464495539665222, + "learning_rate": 0.0002176941176470588, + "loss": 0.2748, + "step": 2335 + }, + { + "epoch": 137.65, + "grad_norm": 0.3527907729148865, + "learning_rate": 0.0002175176470588235, + "loss": 0.2779, + "step": 2340 + }, + { + "epoch": 137.65, + "eval_loss": 0.2701054513454437, + "eval_runtime": 1.9542, + "eval_samples_per_second": 68.571, + "eval_steps_per_second": 8.699, + "step": 2340 + }, + { + "epoch": 137.94, + "grad_norm": 0.4384912848472595, + "learning_rate": 0.00021734117647058822, + "loss": 0.2742, + "step": 2345 + }, + { + "epoch": 138.24, + "grad_norm": 0.3290651738643646, + "learning_rate": 0.00021716470588235292, + "loss": 0.2518, + "step": 2350 + }, + { + "epoch": 138.53, + "grad_norm": 0.3489058315753937, + "learning_rate": 0.00021698823529411765, + "loss": 0.2924, + "step": 2355 + }, + { + "epoch": 138.82, + "grad_norm": 0.3761179447174072, + "learning_rate": 0.00021681176470588235, + "loss": 0.2881, + "step": 2360 + }, + { + "epoch": 138.82, + "eval_loss": 0.2705743610858917, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.62, + "eval_steps_per_second": 8.706, + "step": 2360 + }, + { + "epoch": 139.12, + "grad_norm": 0.41285768151283264, + "learning_rate": 0.00021663529411764703, + "loss": 0.3079, + "step": 2365 + }, + { + "epoch": 139.41, + "grad_norm": 0.26681023836135864, + "learning_rate": 0.00021645882352941173, + "loss": 0.2741, + "step": 2370 + }, + { + "epoch": 139.71, + "grad_norm": 0.34571850299835205, + "learning_rate": 0.00021628235294117643, + "loss": 0.2784, + "step": 2375 + }, + { + "epoch": 140.0, + "grad_norm": 0.4329462945461273, + "learning_rate": 0.00021610588235294116, + "loss": 0.2759, + "step": 2380 + }, + { + "epoch": 140.0, + "eval_loss": 0.26940566301345825, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.667, + "eval_steps_per_second": 8.711, + "step": 2380 + }, + { + "epoch": 140.29, + "grad_norm": 0.3937593996524811, + "learning_rate": 0.00021592941176470587, + "loss": 0.2659, + "step": 2385 + }, + { + "epoch": 140.59, + "grad_norm": 0.3073898255825043, + "learning_rate": 0.00021575294117647057, + "loss": 0.2633, + "step": 2390 + }, + { + "epoch": 140.88, + "grad_norm": 0.4458857476711273, + "learning_rate": 0.00021557647058823527, + "loss": 0.2922, + "step": 2395 + }, + { + "epoch": 141.18, + "grad_norm": 0.3490404188632965, + "learning_rate": 0.00021539999999999998, + "loss": 0.3153, + "step": 2400 + }, + { + "epoch": 141.18, + "eval_loss": 0.26831305027008057, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.637, + "eval_steps_per_second": 8.708, + "step": 2400 + }, + { + "epoch": 141.47, + "grad_norm": 0.38704177737236023, + "learning_rate": 0.00021522352941176468, + "loss": 0.2678, + "step": 2405 + }, + { + "epoch": 141.76, + "grad_norm": 0.37403926253318787, + "learning_rate": 0.0002150470588235294, + "loss": 0.2741, + "step": 2410 + }, + { + "epoch": 142.06, + "grad_norm": 0.403532475233078, + "learning_rate": 0.00021487058823529411, + "loss": 0.2887, + "step": 2415 + }, + { + "epoch": 142.35, + "grad_norm": 0.44494929909706116, + "learning_rate": 0.00021469411764705882, + "loss": 0.2735, + "step": 2420 + }, + { + "epoch": 142.35, + "eval_loss": 0.2697713375091553, + "eval_runtime": 1.9546, + "eval_samples_per_second": 68.556, + "eval_steps_per_second": 8.697, + "step": 2420 + }, + { + "epoch": 142.65, + "grad_norm": 0.527877688407898, + "learning_rate": 0.0002145176470588235, + "loss": 0.2905, + "step": 2425 + }, + { + "epoch": 142.94, + "grad_norm": 0.39474526047706604, + "learning_rate": 0.0002143411764705882, + "loss": 0.2705, + "step": 2430 + }, + { + "epoch": 143.24, + "grad_norm": 0.4721439778804779, + "learning_rate": 0.00021416470588235293, + "loss": 0.288, + "step": 2435 + }, + { + "epoch": 143.53, + "grad_norm": 0.4839733839035034, + "learning_rate": 0.00021398823529411763, + "loss": 0.2772, + "step": 2440 + }, + { + "epoch": 143.53, + "eval_loss": 0.26988181471824646, + "eval_runtime": 1.955, + "eval_samples_per_second": 68.543, + "eval_steps_per_second": 8.696, + "step": 2440 + }, + { + "epoch": 143.82, + "grad_norm": 0.39183154702186584, + "learning_rate": 0.00021381176470588233, + "loss": 0.2796, + "step": 2445 + }, + { + "epoch": 144.12, + "grad_norm": 0.3333718776702881, + "learning_rate": 0.00021363529411764704, + "loss": 0.2681, + "step": 2450 + }, + { + "epoch": 144.41, + "grad_norm": 0.34942755103111267, + "learning_rate": 0.00021345882352941174, + "loss": 0.2726, + "step": 2455 + }, + { + "epoch": 144.71, + "grad_norm": 0.41581347584724426, + "learning_rate": 0.00021328235294117644, + "loss": 0.2701, + "step": 2460 + }, + { + "epoch": 144.71, + "eval_loss": 0.26986077427864075, + "eval_runtime": 1.9866, + "eval_samples_per_second": 67.453, + "eval_steps_per_second": 8.557, + "step": 2460 + }, + { + "epoch": 145.0, + "grad_norm": 0.5140141844749451, + "learning_rate": 0.00021310588235294117, + "loss": 0.3078, + "step": 2465 + }, + { + "epoch": 145.29, + "grad_norm": 0.2735491394996643, + "learning_rate": 0.00021292941176470588, + "loss": 0.2594, + "step": 2470 + }, + { + "epoch": 145.59, + "grad_norm": 0.3331889510154724, + "learning_rate": 0.00021275294117647058, + "loss": 0.2811, + "step": 2475 + }, + { + "epoch": 145.88, + "grad_norm": 0.3898256719112396, + "learning_rate": 0.00021257647058823528, + "loss": 0.298, + "step": 2480 + }, + { + "epoch": 145.88, + "eval_loss": 0.2701750099658966, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.659, + "eval_steps_per_second": 8.71, + "step": 2480 + }, + { + "epoch": 146.18, + "grad_norm": 0.3296191394329071, + "learning_rate": 0.00021239999999999996, + "loss": 0.268, + "step": 2485 + }, + { + "epoch": 146.47, + "grad_norm": 0.30124610662460327, + "learning_rate": 0.00021222352941176472, + "loss": 0.2625, + "step": 2490 + }, + { + "epoch": 146.76, + "grad_norm": 0.35350728034973145, + "learning_rate": 0.0002120470588235294, + "loss": 0.2735, + "step": 2495 + }, + { + "epoch": 147.06, + "grad_norm": 0.33914923667907715, + "learning_rate": 0.0002118705882352941, + "loss": 0.2964, + "step": 2500 + }, + { + "epoch": 147.06, + "eval_loss": 0.2691487669944763, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.69, + "eval_steps_per_second": 8.714, + "step": 2500 + }, + { + "epoch": 147.35, + "grad_norm": 0.38127443194389343, + "learning_rate": 0.0002116941176470588, + "loss": 0.2656, + "step": 2505 + }, + { + "epoch": 147.65, + "grad_norm": 0.365024596452713, + "learning_rate": 0.0002115176470588235, + "loss": 0.2858, + "step": 2510 + }, + { + "epoch": 147.94, + "grad_norm": 0.4416399598121643, + "learning_rate": 0.0002113411764705882, + "loss": 0.2863, + "step": 2515 + }, + { + "epoch": 148.24, + "grad_norm": 0.3810214698314667, + "learning_rate": 0.00021116470588235294, + "loss": 0.2767, + "step": 2520 + }, + { + "epoch": 148.24, + "eval_loss": 0.2688581049442291, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.645, + "eval_steps_per_second": 8.709, + "step": 2520 + }, + { + "epoch": 148.53, + "grad_norm": 0.4380330741405487, + "learning_rate": 0.00021098823529411764, + "loss": 0.2739, + "step": 2525 + }, + { + "epoch": 148.82, + "grad_norm": 0.36188778281211853, + "learning_rate": 0.00021081176470588234, + "loss": 0.2823, + "step": 2530 + }, + { + "epoch": 149.12, + "grad_norm": 0.3600512742996216, + "learning_rate": 0.00021063529411764704, + "loss": 0.2836, + "step": 2535 + }, + { + "epoch": 149.41, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.00021045882352941175, + "loss": 0.2735, + "step": 2540 + }, + { + "epoch": 149.41, + "eval_loss": 0.2692275941371918, + "eval_runtime": 1.9703, + "eval_samples_per_second": 68.01, + "eval_steps_per_second": 8.628, + "step": 2540 + }, + { + "epoch": 149.71, + "grad_norm": 0.40234825015068054, + "learning_rate": 0.00021028235294117648, + "loss": 0.267, + "step": 2545 + }, + { + "epoch": 150.0, + "grad_norm": 0.4509989023208618, + "learning_rate": 0.00021010588235294118, + "loss": 0.2922, + "step": 2550 + }, + { + "epoch": 150.29, + "grad_norm": 0.452462762594223, + "learning_rate": 0.00020992941176470586, + "loss": 0.2827, + "step": 2555 + }, + { + "epoch": 150.59, + "grad_norm": 0.33455464243888855, + "learning_rate": 0.00020975294117647056, + "loss": 0.2631, + "step": 2560 + }, + { + "epoch": 150.59, + "eval_loss": 0.2705185115337372, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.671, + "eval_steps_per_second": 8.712, + "step": 2560 + }, + { + "epoch": 150.88, + "grad_norm": 0.5142150521278381, + "learning_rate": 0.00020957647058823526, + "loss": 0.2801, + "step": 2565 + }, + { + "epoch": 151.18, + "grad_norm": 0.5128970146179199, + "learning_rate": 0.00020939999999999997, + "loss": 0.2786, + "step": 2570 + }, + { + "epoch": 151.47, + "grad_norm": 0.38353997468948364, + "learning_rate": 0.0002092235294117647, + "loss": 0.2861, + "step": 2575 + }, + { + "epoch": 151.76, + "grad_norm": 0.36522889137268066, + "learning_rate": 0.0002090470588235294, + "loss": 0.2841, + "step": 2580 + }, + { + "epoch": 151.76, + "eval_loss": 0.2704727351665497, + "eval_runtime": 1.9544, + "eval_samples_per_second": 68.563, + "eval_steps_per_second": 8.698, + "step": 2580 + }, + { + "epoch": 152.06, + "grad_norm": 0.31199711561203003, + "learning_rate": 0.0002088705882352941, + "loss": 0.2621, + "step": 2585 + }, + { + "epoch": 152.35, + "grad_norm": 0.34294259548187256, + "learning_rate": 0.0002086941176470588, + "loss": 0.2626, + "step": 2590 + }, + { + "epoch": 152.65, + "grad_norm": 0.4609375, + "learning_rate": 0.0002085176470588235, + "loss": 0.2828, + "step": 2595 + }, + { + "epoch": 152.94, + "grad_norm": 0.4297860860824585, + "learning_rate": 0.00020834117647058824, + "loss": 0.2956, + "step": 2600 + }, + { + "epoch": 152.94, + "eval_loss": 0.2691504657268524, + "eval_runtime": 1.9522, + "eval_samples_per_second": 68.641, + "eval_steps_per_second": 8.708, + "step": 2600 + }, + { + "epoch": 153.24, + "grad_norm": 0.5633710622787476, + "learning_rate": 0.00020816470588235294, + "loss": 0.3, + "step": 2605 + }, + { + "epoch": 153.53, + "grad_norm": 0.4655774235725403, + "learning_rate": 0.00020798823529411765, + "loss": 0.2708, + "step": 2610 + }, + { + "epoch": 153.82, + "grad_norm": 0.2590944468975067, + "learning_rate": 0.00020781176470588232, + "loss": 0.263, + "step": 2615 + }, + { + "epoch": 154.12, + "grad_norm": 0.28985559940338135, + "learning_rate": 0.00020763529411764703, + "loss": 0.2725, + "step": 2620 + }, + { + "epoch": 154.12, + "eval_loss": 0.2691247761249542, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.619, + "eval_steps_per_second": 8.705, + "step": 2620 + }, + { + "epoch": 154.41, + "grad_norm": 0.35489821434020996, + "learning_rate": 0.00020745882352941173, + "loss": 0.2863, + "step": 2625 + }, + { + "epoch": 154.71, + "grad_norm": 0.41455405950546265, + "learning_rate": 0.00020728235294117646, + "loss": 0.2904, + "step": 2630 + }, + { + "epoch": 155.0, + "grad_norm": 0.43387454748153687, + "learning_rate": 0.00020710588235294116, + "loss": 0.2707, + "step": 2635 + }, + { + "epoch": 155.29, + "grad_norm": 0.3901937007904053, + "learning_rate": 0.00020692941176470587, + "loss": 0.2707, + "step": 2640 + }, + { + "epoch": 155.29, + "eval_loss": 0.26900506019592285, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.635, + "eval_steps_per_second": 8.707, + "step": 2640 + }, + { + "epoch": 155.59, + "grad_norm": 0.3527050316333771, + "learning_rate": 0.00020675294117647057, + "loss": 0.2772, + "step": 2645 + }, + { + "epoch": 155.88, + "grad_norm": 0.38362547755241394, + "learning_rate": 0.00020657647058823527, + "loss": 0.2842, + "step": 2650 + }, + { + "epoch": 156.18, + "grad_norm": 0.29866862297058105, + "learning_rate": 0.00020639999999999998, + "loss": 0.2662, + "step": 2655 + }, + { + "epoch": 156.47, + "grad_norm": 0.3426850438117981, + "learning_rate": 0.0002062235294117647, + "loss": 0.2858, + "step": 2660 + }, + { + "epoch": 156.47, + "eval_loss": 0.2694990932941437, + "eval_runtime": 1.9542, + "eval_samples_per_second": 68.571, + "eval_steps_per_second": 8.699, + "step": 2660 + }, + { + "epoch": 156.76, + "grad_norm": 0.4102865159511566, + "learning_rate": 0.0002060470588235294, + "loss": 0.2855, + "step": 2665 + }, + { + "epoch": 157.06, + "grad_norm": 0.4020179212093353, + "learning_rate": 0.00020587058823529408, + "loss": 0.2747, + "step": 2670 + }, + { + "epoch": 157.35, + "grad_norm": 0.35271838307380676, + "learning_rate": 0.0002056941176470588, + "loss": 0.2729, + "step": 2675 + }, + { + "epoch": 157.65, + "grad_norm": 0.35912418365478516, + "learning_rate": 0.0002055176470588235, + "loss": 0.2707, + "step": 2680 + }, + { + "epoch": 157.65, + "eval_loss": 0.2697877585887909, + "eval_runtime": 1.9581, + "eval_samples_per_second": 68.435, + "eval_steps_per_second": 8.682, + "step": 2680 + }, + { + "epoch": 157.94, + "grad_norm": 0.34640374779701233, + "learning_rate": 0.00020534117647058822, + "loss": 0.289, + "step": 2685 + }, + { + "epoch": 158.24, + "grad_norm": 0.4719325602054596, + "learning_rate": 0.00020516470588235292, + "loss": 0.2831, + "step": 2690 + }, + { + "epoch": 158.53, + "grad_norm": 0.3103512227535248, + "learning_rate": 0.00020498823529411763, + "loss": 0.2537, + "step": 2695 + }, + { + "epoch": 158.82, + "grad_norm": 0.4930344521999359, + "learning_rate": 0.00020481176470588233, + "loss": 0.3008, + "step": 2700 + }, + { + "epoch": 158.82, + "eval_loss": 0.26933836936950684, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.664, + "eval_steps_per_second": 8.711, + "step": 2700 + }, + { + "epoch": 159.12, + "grad_norm": 0.3645342290401459, + "learning_rate": 0.00020463529411764703, + "loss": 0.2832, + "step": 2705 + }, + { + "epoch": 159.41, + "grad_norm": 0.3427056670188904, + "learning_rate": 0.00020445882352941174, + "loss": 0.2727, + "step": 2710 + }, + { + "epoch": 159.71, + "grad_norm": 0.3055691123008728, + "learning_rate": 0.00020428235294117647, + "loss": 0.2907, + "step": 2715 + }, + { + "epoch": 160.0, + "grad_norm": 0.4193490445613861, + "learning_rate": 0.00020410588235294117, + "loss": 0.261, + "step": 2720 + }, + { + "epoch": 160.0, + "eval_loss": 0.2685997188091278, + "eval_runtime": 1.9495, + "eval_samples_per_second": 68.736, + "eval_steps_per_second": 8.72, + "step": 2720 + }, + { + "epoch": 160.29, + "grad_norm": 0.2987631559371948, + "learning_rate": 0.00020392941176470587, + "loss": 0.2696, + "step": 2725 + }, + { + "epoch": 160.59, + "grad_norm": 0.3255651593208313, + "learning_rate": 0.00020375294117647055, + "loss": 0.2681, + "step": 2730 + }, + { + "epoch": 160.88, + "grad_norm": 0.3296229839324951, + "learning_rate": 0.00020357647058823525, + "loss": 0.2799, + "step": 2735 + }, + { + "epoch": 161.18, + "grad_norm": 0.3057269752025604, + "learning_rate": 0.00020339999999999998, + "loss": 0.2798, + "step": 2740 + }, + { + "epoch": 161.18, + "eval_loss": 0.2684324085712433, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.68, + "eval_steps_per_second": 8.713, + "step": 2740 + }, + { + "epoch": 161.47, + "grad_norm": 0.29660603404045105, + "learning_rate": 0.0002032235294117647, + "loss": 0.2688, + "step": 2745 + }, + { + "epoch": 161.76, + "grad_norm": 0.35086870193481445, + "learning_rate": 0.0002030470588235294, + "loss": 0.2877, + "step": 2750 + }, + { + "epoch": 162.06, + "grad_norm": 0.29996901750564575, + "learning_rate": 0.0002028705882352941, + "loss": 0.2739, + "step": 2755 + }, + { + "epoch": 162.35, + "grad_norm": 0.36324530839920044, + "learning_rate": 0.0002026941176470588, + "loss": 0.2861, + "step": 2760 + }, + { + "epoch": 162.35, + "eval_loss": 0.2688477039337158, + "eval_runtime": 1.9622, + "eval_samples_per_second": 68.29, + "eval_steps_per_second": 8.664, + "step": 2760 + }, + { + "epoch": 162.65, + "grad_norm": 0.4045323133468628, + "learning_rate": 0.0002025176470588235, + "loss": 0.2617, + "step": 2765 + }, + { + "epoch": 162.94, + "grad_norm": 0.2864224314689636, + "learning_rate": 0.00020234117647058823, + "loss": 0.2909, + "step": 2770 + }, + { + "epoch": 163.24, + "grad_norm": 0.3229076862335205, + "learning_rate": 0.00020216470588235293, + "loss": 0.2695, + "step": 2775 + }, + { + "epoch": 163.53, + "grad_norm": 0.267653226852417, + "learning_rate": 0.00020198823529411764, + "loss": 0.2669, + "step": 2780 + }, + { + "epoch": 163.53, + "eval_loss": 0.27076956629753113, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.653, + "eval_steps_per_second": 8.71, + "step": 2780 + }, + { + "epoch": 163.82, + "grad_norm": 0.43306273221969604, + "learning_rate": 0.00020181176470588234, + "loss": 0.3067, + "step": 2785 + }, + { + "epoch": 164.12, + "grad_norm": 0.4215552508831024, + "learning_rate": 0.00020163529411764702, + "loss": 0.2809, + "step": 2790 + }, + { + "epoch": 164.41, + "grad_norm": 0.3538247048854828, + "learning_rate": 0.00020145882352941177, + "loss": 0.2624, + "step": 2795 + }, + { + "epoch": 164.71, + "grad_norm": 0.4030228555202484, + "learning_rate": 0.00020128235294117645, + "loss": 0.2875, + "step": 2800 + }, + { + "epoch": 164.71, + "eval_loss": 0.2702454924583435, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.665, + "eval_steps_per_second": 8.711, + "step": 2800 + }, + { + "epoch": 165.0, + "grad_norm": 0.4923202693462372, + "learning_rate": 0.00020110588235294115, + "loss": 0.2859, + "step": 2805 + }, + { + "epoch": 165.29, + "grad_norm": 0.3518223762512207, + "learning_rate": 0.00020092941176470586, + "loss": 0.2763, + "step": 2810 + }, + { + "epoch": 165.59, + "grad_norm": 0.4788561463356018, + "learning_rate": 0.00020075294117647056, + "loss": 0.27, + "step": 2815 + }, + { + "epoch": 165.88, + "grad_norm": 0.3346441686153412, + "learning_rate": 0.00020057647058823526, + "loss": 0.2684, + "step": 2820 + }, + { + "epoch": 165.88, + "eval_loss": 0.26950228214263916, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.687, + "eval_steps_per_second": 8.714, + "step": 2820 + }, + { + "epoch": 166.18, + "grad_norm": 0.27431848645210266, + "learning_rate": 0.0002004, + "loss": 0.288, + "step": 2825 + }, + { + "epoch": 166.47, + "grad_norm": 0.37005746364593506, + "learning_rate": 0.0002002235294117647, + "loss": 0.27, + "step": 2830 + }, + { + "epoch": 166.76, + "grad_norm": 0.41117268800735474, + "learning_rate": 0.0002000470588235294, + "loss": 0.2885, + "step": 2835 + }, + { + "epoch": 167.06, + "grad_norm": 0.36931198835372925, + "learning_rate": 0.0001998705882352941, + "loss": 0.2914, + "step": 2840 + }, + { + "epoch": 167.06, + "eval_loss": 0.26874876022338867, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.644, + "eval_steps_per_second": 8.709, + "step": 2840 + }, + { + "epoch": 167.35, + "grad_norm": 0.3291415870189667, + "learning_rate": 0.0001996941176470588, + "loss": 0.261, + "step": 2845 + }, + { + "epoch": 167.65, + "grad_norm": 0.44939348101615906, + "learning_rate": 0.00019951764705882354, + "loss": 0.2896, + "step": 2850 + }, + { + "epoch": 167.94, + "grad_norm": 0.3321436643600464, + "learning_rate": 0.00019934117647058824, + "loss": 0.2881, + "step": 2855 + }, + { + "epoch": 168.24, + "grad_norm": 0.40025246143341064, + "learning_rate": 0.00019916470588235291, + "loss": 0.2647, + "step": 2860 + }, + { + "epoch": 168.24, + "eval_loss": 0.2683941423892975, + "eval_runtime": 1.9492, + "eval_samples_per_second": 68.748, + "eval_steps_per_second": 8.722, + "step": 2860 + }, + { + "epoch": 168.53, + "grad_norm": 0.3663346767425537, + "learning_rate": 0.00019898823529411762, + "loss": 0.2677, + "step": 2865 + }, + { + "epoch": 168.82, + "grad_norm": 0.3315924108028412, + "learning_rate": 0.00019881176470588232, + "loss": 0.2695, + "step": 2870 + }, + { + "epoch": 169.12, + "grad_norm": 0.3310810327529907, + "learning_rate": 0.00019863529411764702, + "loss": 0.2928, + "step": 2875 + }, + { + "epoch": 169.41, + "grad_norm": 0.4282606840133667, + "learning_rate": 0.00019845882352941175, + "loss": 0.2771, + "step": 2880 + }, + { + "epoch": 169.41, + "eval_loss": 0.26962751150131226, + "eval_runtime": 1.95, + "eval_samples_per_second": 68.719, + "eval_steps_per_second": 8.718, + "step": 2880 + }, + { + "epoch": 169.71, + "grad_norm": 0.34908995032310486, + "learning_rate": 0.00019828235294117646, + "loss": 0.2702, + "step": 2885 + }, + { + "epoch": 170.0, + "grad_norm": 0.5017049908638, + "learning_rate": 0.00019810588235294116, + "loss": 0.2854, + "step": 2890 + }, + { + "epoch": 170.29, + "grad_norm": 0.2808305621147156, + "learning_rate": 0.00019792941176470586, + "loss": 0.2661, + "step": 2895 + }, + { + "epoch": 170.59, + "grad_norm": 0.4168771207332611, + "learning_rate": 0.00019775294117647057, + "loss": 0.2998, + "step": 2900 + }, + { + "epoch": 170.59, + "eval_loss": 0.27029451727867126, + "eval_runtime": 1.9857, + "eval_samples_per_second": 67.482, + "eval_steps_per_second": 8.561, + "step": 2900 + }, + { + "epoch": 170.88, + "grad_norm": 0.33384671807289124, + "learning_rate": 0.0001975764705882353, + "loss": 0.2683, + "step": 2905 + }, + { + "epoch": 171.18, + "grad_norm": 0.3153441548347473, + "learning_rate": 0.0001974, + "loss": 0.2661, + "step": 2910 + }, + { + "epoch": 171.47, + "grad_norm": 0.33517536520957947, + "learning_rate": 0.0001972235294117647, + "loss": 0.2798, + "step": 2915 + }, + { + "epoch": 171.76, + "grad_norm": 0.2965943217277527, + "learning_rate": 0.00019704705882352938, + "loss": 0.2686, + "step": 2920 + }, + { + "epoch": 171.76, + "eval_loss": 0.26911482214927673, + "eval_runtime": 1.9506, + "eval_samples_per_second": 68.698, + "eval_steps_per_second": 8.715, + "step": 2920 + }, + { + "epoch": 172.06, + "grad_norm": 0.37422090768814087, + "learning_rate": 0.00019687058823529408, + "loss": 0.2962, + "step": 2925 + }, + { + "epoch": 172.35, + "grad_norm": 0.2916088104248047, + "learning_rate": 0.00019669411764705879, + "loss": 0.2635, + "step": 2930 + }, + { + "epoch": 172.65, + "grad_norm": 0.2642833888530731, + "learning_rate": 0.00019651764705882352, + "loss": 0.281, + "step": 2935 + }, + { + "epoch": 172.94, + "grad_norm": 0.45938625931739807, + "learning_rate": 0.00019634117647058822, + "loss": 0.2949, + "step": 2940 + }, + { + "epoch": 172.94, + "eval_loss": 0.26863551139831543, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.669, + "eval_steps_per_second": 8.712, + "step": 2940 + }, + { + "epoch": 173.24, + "grad_norm": 0.3415997326374054, + "learning_rate": 0.00019616470588235292, + "loss": 0.2718, + "step": 2945 + }, + { + "epoch": 173.53, + "grad_norm": 0.3150477111339569, + "learning_rate": 0.00019598823529411763, + "loss": 0.2665, + "step": 2950 + }, + { + "epoch": 173.82, + "grad_norm": 0.36632010340690613, + "learning_rate": 0.00019581176470588233, + "loss": 0.2846, + "step": 2955 + }, + { + "epoch": 174.12, + "grad_norm": 0.3656614124774933, + "learning_rate": 0.00019563529411764706, + "loss": 0.2834, + "step": 2960 + }, + { + "epoch": 174.12, + "eval_loss": 0.2683599591255188, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.635, + "eval_steps_per_second": 8.707, + "step": 2960 + }, + { + "epoch": 174.41, + "grad_norm": 0.36069437861442566, + "learning_rate": 0.00019545882352941176, + "loss": 0.2731, + "step": 2965 + }, + { + "epoch": 174.71, + "grad_norm": 0.3237954378128052, + "learning_rate": 0.00019528235294117647, + "loss": 0.2776, + "step": 2970 + }, + { + "epoch": 175.0, + "grad_norm": 0.3550858795642853, + "learning_rate": 0.00019510588235294117, + "loss": 0.2687, + "step": 2975 + }, + { + "epoch": 175.29, + "grad_norm": 0.32072868943214417, + "learning_rate": 0.00019492941176470585, + "loss": 0.2677, + "step": 2980 + }, + { + "epoch": 175.29, + "eval_loss": 0.26847606897354126, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.684, + "eval_steps_per_second": 8.714, + "step": 2980 + }, + { + "epoch": 175.59, + "grad_norm": 0.24555237591266632, + "learning_rate": 0.00019475294117647055, + "loss": 0.2497, + "step": 2985 + }, + { + "epoch": 175.88, + "grad_norm": 0.43830403685569763, + "learning_rate": 0.00019457647058823528, + "loss": 0.3057, + "step": 2990 + }, + { + "epoch": 176.18, + "grad_norm": 0.40582191944122314, + "learning_rate": 0.00019439999999999998, + "loss": 0.2747, + "step": 2995 + }, + { + "epoch": 176.47, + "grad_norm": 0.3364960253238678, + "learning_rate": 0.00019422352941176469, + "loss": 0.269, + "step": 3000 + }, + { + "epoch": 176.47, + "eval_loss": 0.26868540048599243, + "eval_runtime": 1.9541, + "eval_samples_per_second": 68.573, + "eval_steps_per_second": 8.7, + "step": 3000 + }, + { + "epoch": 176.76, + "grad_norm": 0.3777320683002472, + "learning_rate": 0.0001940470588235294, + "loss": 0.2647, + "step": 3005 + }, + { + "epoch": 177.06, + "grad_norm": 0.3995269536972046, + "learning_rate": 0.0001938705882352941, + "loss": 0.3028, + "step": 3010 + }, + { + "epoch": 177.35, + "grad_norm": 0.35065150260925293, + "learning_rate": 0.00019369411764705882, + "loss": 0.2734, + "step": 3015 + }, + { + "epoch": 177.65, + "grad_norm": 0.3392142951488495, + "learning_rate": 0.00019351764705882352, + "loss": 0.2703, + "step": 3020 + }, + { + "epoch": 177.65, + "eval_loss": 0.2690936028957367, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.656, + "eval_steps_per_second": 8.71, + "step": 3020 + }, + { + "epoch": 177.94, + "grad_norm": 0.367732435464859, + "learning_rate": 0.00019334117647058823, + "loss": 0.2774, + "step": 3025 + }, + { + "epoch": 178.24, + "grad_norm": 0.41104987263679504, + "learning_rate": 0.00019316470588235293, + "loss": 0.2716, + "step": 3030 + }, + { + "epoch": 178.53, + "grad_norm": 0.444865882396698, + "learning_rate": 0.00019298823529411763, + "loss": 0.2846, + "step": 3035 + }, + { + "epoch": 178.82, + "grad_norm": 0.3704971671104431, + "learning_rate": 0.0001928117647058823, + "loss": 0.2993, + "step": 3040 + }, + { + "epoch": 178.82, + "eval_loss": 0.2689005732536316, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.621, + "eval_steps_per_second": 8.706, + "step": 3040 + }, + { + "epoch": 179.12, + "grad_norm": 0.3935631811618805, + "learning_rate": 0.00019263529411764704, + "loss": 0.2536, + "step": 3045 + }, + { + "epoch": 179.41, + "grad_norm": 0.31444981694221497, + "learning_rate": 0.00019245882352941174, + "loss": 0.29, + "step": 3050 + }, + { + "epoch": 179.71, + "grad_norm": 0.3283923864364624, + "learning_rate": 0.00019228235294117645, + "loss": 0.2543, + "step": 3055 + }, + { + "epoch": 180.0, + "grad_norm": 0.4967205822467804, + "learning_rate": 0.00019210588235294115, + "loss": 0.3065, + "step": 3060 + }, + { + "epoch": 180.0, + "eval_loss": 0.268365740776062, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.671, + "eval_steps_per_second": 8.712, + "step": 3060 + }, + { + "epoch": 180.29, + "grad_norm": 0.28206074237823486, + "learning_rate": 0.00019192941176470585, + "loss": 0.2425, + "step": 3065 + }, + { + "epoch": 180.59, + "grad_norm": 0.4075360894203186, + "learning_rate": 0.00019175294117647058, + "loss": 0.2874, + "step": 3070 + }, + { + "epoch": 180.88, + "grad_norm": 0.3878535032272339, + "learning_rate": 0.0001915764705882353, + "loss": 0.3017, + "step": 3075 + }, + { + "epoch": 181.18, + "grad_norm": 0.33224448561668396, + "learning_rate": 0.0001914, + "loss": 0.2906, + "step": 3080 + }, + { + "epoch": 181.18, + "eval_loss": 0.2688673734664917, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.667, + "eval_steps_per_second": 8.712, + "step": 3080 + }, + { + "epoch": 181.47, + "grad_norm": 0.262993186712265, + "learning_rate": 0.0001912235294117647, + "loss": 0.2634, + "step": 3085 + }, + { + "epoch": 181.76, + "grad_norm": 0.42793840169906616, + "learning_rate": 0.0001910470588235294, + "loss": 0.2826, + "step": 3090 + }, + { + "epoch": 182.06, + "grad_norm": 0.29369890689849854, + "learning_rate": 0.00019087058823529407, + "loss": 0.2698, + "step": 3095 + }, + { + "epoch": 182.35, + "grad_norm": 0.3196079730987549, + "learning_rate": 0.00019069411764705883, + "loss": 0.2538, + "step": 3100 + }, + { + "epoch": 182.35, + "eval_loss": 0.2690832018852234, + "eval_runtime": 1.9494, + "eval_samples_per_second": 68.738, + "eval_steps_per_second": 8.72, + "step": 3100 + }, + { + "epoch": 182.65, + "grad_norm": 0.3042619228363037, + "learning_rate": 0.0001905176470588235, + "loss": 0.2857, + "step": 3105 + }, + { + "epoch": 182.94, + "grad_norm": 0.36815038323402405, + "learning_rate": 0.0001903411764705882, + "loss": 0.2936, + "step": 3110 + }, + { + "epoch": 183.24, + "grad_norm": 0.29674026370048523, + "learning_rate": 0.0001901647058823529, + "loss": 0.2799, + "step": 3115 + }, + { + "epoch": 183.53, + "grad_norm": 0.40005770325660706, + "learning_rate": 0.00018998823529411762, + "loss": 0.2792, + "step": 3120 + }, + { + "epoch": 183.53, + "eval_loss": 0.2692555785179138, + "eval_runtime": 1.9489, + "eval_samples_per_second": 68.758, + "eval_steps_per_second": 8.723, + "step": 3120 + }, + { + "epoch": 183.82, + "grad_norm": 0.44748806953430176, + "learning_rate": 0.00018981176470588235, + "loss": 0.2807, + "step": 3125 + }, + { + "epoch": 184.12, + "grad_norm": 0.330707311630249, + "learning_rate": 0.00018963529411764705, + "loss": 0.2751, + "step": 3130 + }, + { + "epoch": 184.41, + "grad_norm": 0.3634668290615082, + "learning_rate": 0.00018945882352941175, + "loss": 0.2689, + "step": 3135 + }, + { + "epoch": 184.71, + "grad_norm": 0.30390873551368713, + "learning_rate": 0.00018928235294117646, + "loss": 0.2876, + "step": 3140 + }, + { + "epoch": 184.71, + "eval_loss": 0.26960647106170654, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.647, + "eval_steps_per_second": 8.709, + "step": 3140 + }, + { + "epoch": 185.0, + "grad_norm": 0.5152026414871216, + "learning_rate": 0.00018910588235294116, + "loss": 0.2862, + "step": 3145 + }, + { + "epoch": 185.29, + "grad_norm": 0.27822214365005493, + "learning_rate": 0.00018892941176470586, + "loss": 0.2757, + "step": 3150 + }, + { + "epoch": 185.59, + "grad_norm": 0.287853866815567, + "learning_rate": 0.0001887529411764706, + "loss": 0.2607, + "step": 3155 + }, + { + "epoch": 185.88, + "grad_norm": 0.3760346472263336, + "learning_rate": 0.0001885764705882353, + "loss": 0.2826, + "step": 3160 + }, + { + "epoch": 185.88, + "eval_loss": 0.26953375339508057, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.652, + "eval_steps_per_second": 8.71, + "step": 3160 + }, + { + "epoch": 186.18, + "grad_norm": 0.30832841992378235, + "learning_rate": 0.00018839999999999997, + "loss": 0.2881, + "step": 3165 + }, + { + "epoch": 186.47, + "grad_norm": 0.33436018228530884, + "learning_rate": 0.00018822352941176467, + "loss": 0.2901, + "step": 3170 + }, + { + "epoch": 186.76, + "grad_norm": 0.38080456852912903, + "learning_rate": 0.00018804705882352938, + "loss": 0.279, + "step": 3175 + }, + { + "epoch": 187.06, + "grad_norm": 0.2727751135826111, + "learning_rate": 0.0001878705882352941, + "loss": 0.2713, + "step": 3180 + }, + { + "epoch": 187.06, + "eval_loss": 0.26871371269226074, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.678, + "eval_steps_per_second": 8.713, + "step": 3180 + }, + { + "epoch": 187.35, + "grad_norm": 0.35906288027763367, + "learning_rate": 0.0001876941176470588, + "loss": 0.2824, + "step": 3185 + }, + { + "epoch": 187.65, + "grad_norm": 0.3231754004955292, + "learning_rate": 0.00018751764705882351, + "loss": 0.2832, + "step": 3190 + }, + { + "epoch": 187.94, + "grad_norm": 0.2840198874473572, + "learning_rate": 0.00018734117647058822, + "loss": 0.2647, + "step": 3195 + }, + { + "epoch": 188.24, + "grad_norm": 0.3872297406196594, + "learning_rate": 0.00018716470588235292, + "loss": 0.294, + "step": 3200 + }, + { + "epoch": 188.24, + "eval_loss": 0.2681155502796173, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.659, + "eval_steps_per_second": 8.71, + "step": 3200 + }, + { + "epoch": 188.53, + "grad_norm": 0.3335537612438202, + "learning_rate": 0.00018698823529411762, + "loss": 0.2664, + "step": 3205 + }, + { + "epoch": 188.82, + "grad_norm": 0.35204872488975525, + "learning_rate": 0.00018681176470588235, + "loss": 0.2743, + "step": 3210 + }, + { + "epoch": 189.12, + "grad_norm": 0.3697923421859741, + "learning_rate": 0.00018663529411764706, + "loss": 0.2826, + "step": 3215 + }, + { + "epoch": 189.41, + "grad_norm": 0.27298083901405334, + "learning_rate": 0.00018645882352941176, + "loss": 0.2577, + "step": 3220 + }, + { + "epoch": 189.41, + "eval_loss": 0.2690584063529968, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.692, + "eval_steps_per_second": 8.715, + "step": 3220 + }, + { + "epoch": 189.71, + "grad_norm": 0.3299672603607178, + "learning_rate": 0.00018628235294117644, + "loss": 0.2818, + "step": 3225 + }, + { + "epoch": 190.0, + "grad_norm": 0.40124574303627014, + "learning_rate": 0.00018610588235294114, + "loss": 0.2893, + "step": 3230 + }, + { + "epoch": 190.29, + "grad_norm": 0.34692269563674927, + "learning_rate": 0.00018592941176470587, + "loss": 0.2716, + "step": 3235 + }, + { + "epoch": 190.59, + "grad_norm": 0.2971137762069702, + "learning_rate": 0.00018575294117647057, + "loss": 0.2719, + "step": 3240 + }, + { + "epoch": 190.59, + "eval_loss": 0.26943662762641907, + "eval_runtime": 1.9541, + "eval_samples_per_second": 68.575, + "eval_steps_per_second": 8.7, + "step": 3240 + }, + { + "epoch": 190.88, + "grad_norm": 0.3698229193687439, + "learning_rate": 0.00018557647058823528, + "loss": 0.2834, + "step": 3245 + }, + { + "epoch": 191.18, + "grad_norm": 0.32882949709892273, + "learning_rate": 0.00018539999999999998, + "loss": 0.2686, + "step": 3250 + }, + { + "epoch": 191.47, + "grad_norm": 0.4325163960456848, + "learning_rate": 0.00018522352941176468, + "loss": 0.2684, + "step": 3255 + }, + { + "epoch": 191.76, + "grad_norm": 0.39144089818000793, + "learning_rate": 0.00018504705882352939, + "loss": 0.2787, + "step": 3260 + }, + { + "epoch": 191.76, + "eval_loss": 0.2686680257320404, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.621, + "eval_steps_per_second": 8.706, + "step": 3260 + }, + { + "epoch": 192.06, + "grad_norm": 0.3248187303543091, + "learning_rate": 0.00018487058823529412, + "loss": 0.2815, + "step": 3265 + }, + { + "epoch": 192.35, + "grad_norm": 0.3604396879673004, + "learning_rate": 0.00018469411764705882, + "loss": 0.2702, + "step": 3270 + }, + { + "epoch": 192.65, + "grad_norm": 0.5014637112617493, + "learning_rate": 0.00018451764705882352, + "loss": 0.2799, + "step": 3275 + }, + { + "epoch": 192.94, + "grad_norm": 0.26431596279144287, + "learning_rate": 0.00018434117647058823, + "loss": 0.2761, + "step": 3280 + }, + { + "epoch": 192.94, + "eval_loss": 0.26852673292160034, + "eval_runtime": 1.9526, + "eval_samples_per_second": 68.625, + "eval_steps_per_second": 8.706, + "step": 3280 + }, + { + "epoch": 193.24, + "grad_norm": 0.3636572062969208, + "learning_rate": 0.0001841647058823529, + "loss": 0.3067, + "step": 3285 + }, + { + "epoch": 193.53, + "grad_norm": 0.2593145966529846, + "learning_rate": 0.00018398823529411766, + "loss": 0.2464, + "step": 3290 + }, + { + "epoch": 193.82, + "grad_norm": 0.4175991714000702, + "learning_rate": 0.00018381176470588234, + "loss": 0.2923, + "step": 3295 + }, + { + "epoch": 194.12, + "grad_norm": 0.2949036657810211, + "learning_rate": 0.00018363529411764704, + "loss": 0.2629, + "step": 3300 + }, + { + "epoch": 194.12, + "eval_loss": 0.2683328688144684, + "eval_runtime": 1.9501, + "eval_samples_per_second": 68.714, + "eval_steps_per_second": 8.717, + "step": 3300 + }, + { + "epoch": 194.41, + "grad_norm": 0.3554116189479828, + "learning_rate": 0.00018345882352941174, + "loss": 0.2733, + "step": 3305 + }, + { + "epoch": 194.71, + "grad_norm": 0.4547765254974365, + "learning_rate": 0.00018328235294117645, + "loss": 0.287, + "step": 3310 + }, + { + "epoch": 195.0, + "grad_norm": 0.556479811668396, + "learning_rate": 0.00018310588235294115, + "loss": 0.2792, + "step": 3315 + }, + { + "epoch": 195.29, + "grad_norm": 0.3767550587654114, + "learning_rate": 0.00018292941176470588, + "loss": 0.2616, + "step": 3320 + }, + { + "epoch": 195.29, + "eval_loss": 0.26889336109161377, + "eval_runtime": 1.9529, + "eval_samples_per_second": 68.614, + "eval_steps_per_second": 8.705, + "step": 3320 + }, + { + "epoch": 195.59, + "grad_norm": 0.3662166893482208, + "learning_rate": 0.00018275294117647058, + "loss": 0.2743, + "step": 3325 + }, + { + "epoch": 195.88, + "grad_norm": 0.26912805438041687, + "learning_rate": 0.00018257647058823529, + "loss": 0.2945, + "step": 3330 + }, + { + "epoch": 196.18, + "grad_norm": 0.33123278617858887, + "learning_rate": 0.0001824, + "loss": 0.2698, + "step": 3335 + }, + { + "epoch": 196.47, + "grad_norm": 0.2607968747615814, + "learning_rate": 0.0001822235294117647, + "loss": 0.2476, + "step": 3340 + }, + { + "epoch": 196.47, + "eval_loss": 0.268909215927124, + "eval_runtime": 1.9571, + "eval_samples_per_second": 68.47, + "eval_steps_per_second": 8.686, + "step": 3340 + }, + { + "epoch": 196.76, + "grad_norm": 0.31745481491088867, + "learning_rate": 0.00018204705882352937, + "loss": 0.2961, + "step": 3345 + }, + { + "epoch": 197.06, + "grad_norm": 0.3695240318775177, + "learning_rate": 0.00018187058823529413, + "loss": 0.3059, + "step": 3350 + }, + { + "epoch": 197.35, + "grad_norm": 0.2966747581958771, + "learning_rate": 0.0001816941176470588, + "loss": 0.2687, + "step": 3355 + }, + { + "epoch": 197.65, + "grad_norm": 0.32601168751716614, + "learning_rate": 0.0001815176470588235, + "loss": 0.2556, + "step": 3360 + }, + { + "epoch": 197.65, + "eval_loss": 0.26882460713386536, + "eval_runtime": 1.9555, + "eval_samples_per_second": 68.525, + "eval_steps_per_second": 8.693, + "step": 3360 + }, + { + "epoch": 197.94, + "grad_norm": 0.34764987230300903, + "learning_rate": 0.0001813411764705882, + "loss": 0.2999, + "step": 3365 + }, + { + "epoch": 198.24, + "grad_norm": 0.3237784802913666, + "learning_rate": 0.0001811647058823529, + "loss": 0.2615, + "step": 3370 + }, + { + "epoch": 198.53, + "grad_norm": 0.423705518245697, + "learning_rate": 0.00018098823529411764, + "loss": 0.275, + "step": 3375 + }, + { + "epoch": 198.82, + "grad_norm": 0.3280170261859894, + "learning_rate": 0.00018081176470588234, + "loss": 0.2819, + "step": 3380 + }, + { + "epoch": 198.82, + "eval_loss": 0.2689860165119171, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.612, + "eval_steps_per_second": 8.705, + "step": 3380 + }, + { + "epoch": 199.12, + "grad_norm": 0.48247596621513367, + "learning_rate": 0.00018063529411764705, + "loss": 0.3022, + "step": 3385 + }, + { + "epoch": 199.41, + "grad_norm": 0.39836814999580383, + "learning_rate": 0.00018045882352941175, + "loss": 0.274, + "step": 3390 + }, + { + "epoch": 199.71, + "grad_norm": 0.3473406732082367, + "learning_rate": 0.00018028235294117645, + "loss": 0.2788, + "step": 3395 + }, + { + "epoch": 200.0, + "grad_norm": 0.44988372921943665, + "learning_rate": 0.00018010588235294116, + "loss": 0.2729, + "step": 3400 + }, + { + "epoch": 200.0, + "eval_loss": 0.2691396176815033, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.612, + "eval_steps_per_second": 8.705, + "step": 3400 + }, + { + "epoch": 200.29, + "grad_norm": 0.26888713240623474, + "learning_rate": 0.0001799294117647059, + "loss": 0.2561, + "step": 3405 + }, + { + "epoch": 200.59, + "grad_norm": 0.3346821367740631, + "learning_rate": 0.0001797529411764706, + "loss": 0.2732, + "step": 3410 + }, + { + "epoch": 200.88, + "grad_norm": 0.4123658835887909, + "learning_rate": 0.00017957647058823527, + "loss": 0.2837, + "step": 3415 + }, + { + "epoch": 201.18, + "grad_norm": 0.38498610258102417, + "learning_rate": 0.00017939999999999997, + "loss": 0.283, + "step": 3420 + }, + { + "epoch": 201.18, + "eval_loss": 0.2682644724845886, + "eval_runtime": 1.9538, + "eval_samples_per_second": 68.586, + "eval_steps_per_second": 8.701, + "step": 3420 + }, + { + "epoch": 201.47, + "grad_norm": 0.3732117712497711, + "learning_rate": 0.00017922352941176467, + "loss": 0.2881, + "step": 3425 + }, + { + "epoch": 201.76, + "grad_norm": 0.3142256438732147, + "learning_rate": 0.0001790470588235294, + "loss": 0.2702, + "step": 3430 + }, + { + "epoch": 202.06, + "grad_norm": 0.342428058385849, + "learning_rate": 0.0001788705882352941, + "loss": 0.2866, + "step": 3435 + }, + { + "epoch": 202.35, + "grad_norm": 0.35131046175956726, + "learning_rate": 0.0001786941176470588, + "loss": 0.269, + "step": 3440 + }, + { + "epoch": 202.35, + "eval_loss": 0.26826438307762146, + "eval_runtime": 1.9922, + "eval_samples_per_second": 67.264, + "eval_steps_per_second": 8.533, + "step": 3440 + }, + { + "epoch": 202.65, + "grad_norm": 0.3504379093647003, + "learning_rate": 0.0001785176470588235, + "loss": 0.2875, + "step": 3445 + }, + { + "epoch": 202.94, + "grad_norm": 0.3603782057762146, + "learning_rate": 0.00017834117647058822, + "loss": 0.2713, + "step": 3450 + }, + { + "epoch": 203.24, + "grad_norm": 0.29872608184814453, + "learning_rate": 0.00017816470588235292, + "loss": 0.265, + "step": 3455 + }, + { + "epoch": 203.53, + "grad_norm": 0.28828245401382446, + "learning_rate": 0.00017798823529411765, + "loss": 0.277, + "step": 3460 + }, + { + "epoch": 203.53, + "eval_loss": 0.2687843143939972, + "eval_runtime": 1.9535, + "eval_samples_per_second": 68.596, + "eval_steps_per_second": 8.702, + "step": 3460 + }, + { + "epoch": 203.82, + "grad_norm": 0.33642733097076416, + "learning_rate": 0.00017781176470588235, + "loss": 0.3118, + "step": 3465 + }, + { + "epoch": 204.12, + "grad_norm": 0.28684553503990173, + "learning_rate": 0.00017763529411764706, + "loss": 0.2622, + "step": 3470 + }, + { + "epoch": 204.41, + "grad_norm": 0.5022943615913391, + "learning_rate": 0.00017745882352941173, + "loss": 0.2549, + "step": 3475 + }, + { + "epoch": 204.71, + "grad_norm": 0.35458943247795105, + "learning_rate": 0.00017728235294117643, + "loss": 0.2806, + "step": 3480 + }, + { + "epoch": 204.71, + "eval_loss": 0.2689105272293091, + "eval_runtime": 1.9565, + "eval_samples_per_second": 68.488, + "eval_steps_per_second": 8.689, + "step": 3480 + }, + { + "epoch": 205.0, + "grad_norm": 0.7953335642814636, + "learning_rate": 0.00017710588235294117, + "loss": 0.3247, + "step": 3485 + }, + { + "epoch": 205.29, + "grad_norm": 0.4112538993358612, + "learning_rate": 0.00017692941176470587, + "loss": 0.2755, + "step": 3490 + }, + { + "epoch": 205.59, + "grad_norm": 0.36768198013305664, + "learning_rate": 0.00017675294117647057, + "loss": 0.2538, + "step": 3495 + }, + { + "epoch": 205.88, + "grad_norm": 0.3491232991218567, + "learning_rate": 0.00017657647058823527, + "loss": 0.2936, + "step": 3500 + }, + { + "epoch": 205.88, + "eval_loss": 0.2692234516143799, + "eval_runtime": 1.9543, + "eval_samples_per_second": 68.568, + "eval_steps_per_second": 8.699, + "step": 3500 + }, + { + "epoch": 206.18, + "grad_norm": 0.37624162435531616, + "learning_rate": 0.00017639999999999998, + "loss": 0.2925, + "step": 3505 + }, + { + "epoch": 206.47, + "grad_norm": 0.35388290882110596, + "learning_rate": 0.00017622352941176468, + "loss": 0.2619, + "step": 3510 + }, + { + "epoch": 206.76, + "grad_norm": 0.4297889471054077, + "learning_rate": 0.0001760470588235294, + "loss": 0.2774, + "step": 3515 + }, + { + "epoch": 207.06, + "grad_norm": 0.2784897983074188, + "learning_rate": 0.00017587058823529411, + "loss": 0.2862, + "step": 3520 + }, + { + "epoch": 207.06, + "eval_loss": 0.2685418725013733, + "eval_runtime": 1.9817, + "eval_samples_per_second": 67.62, + "eval_steps_per_second": 8.579, + "step": 3520 + }, + { + "epoch": 207.35, + "grad_norm": 0.39746060967445374, + "learning_rate": 0.00017569411764705882, + "loss": 0.2644, + "step": 3525 + }, + { + "epoch": 207.65, + "grad_norm": 0.3192261755466461, + "learning_rate": 0.0001755176470588235, + "loss": 0.2751, + "step": 3530 + }, + { + "epoch": 207.94, + "grad_norm": 0.30512139201164246, + "learning_rate": 0.0001753411764705882, + "loss": 0.2966, + "step": 3535 + }, + { + "epoch": 208.24, + "grad_norm": 0.45709657669067383, + "learning_rate": 0.00017516470588235293, + "loss": 0.2797, + "step": 3540 + }, + { + "epoch": 208.24, + "eval_loss": 0.26858973503112793, + "eval_runtime": 1.9554, + "eval_samples_per_second": 68.526, + "eval_steps_per_second": 8.694, + "step": 3540 + }, + { + "epoch": 208.53, + "grad_norm": 0.3059239983558655, + "learning_rate": 0.00017498823529411763, + "loss": 0.2716, + "step": 3545 + }, + { + "epoch": 208.82, + "grad_norm": 0.44816944003105164, + "learning_rate": 0.00017481176470588233, + "loss": 0.2738, + "step": 3550 + }, + { + "epoch": 209.12, + "grad_norm": 0.3316328823566437, + "learning_rate": 0.00017463529411764704, + "loss": 0.3025, + "step": 3555 + }, + { + "epoch": 209.41, + "grad_norm": 0.38213279843330383, + "learning_rate": 0.00017445882352941174, + "loss": 0.2606, + "step": 3560 + }, + { + "epoch": 209.41, + "eval_loss": 0.2686242163181305, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.666, + "eval_steps_per_second": 8.711, + "step": 3560 + }, + { + "epoch": 209.71, + "grad_norm": 0.35470470786094666, + "learning_rate": 0.00017428235294117644, + "loss": 0.2795, + "step": 3565 + }, + { + "epoch": 210.0, + "grad_norm": 0.4351425766944885, + "learning_rate": 0.00017410588235294117, + "loss": 0.2821, + "step": 3570 + }, + { + "epoch": 210.29, + "grad_norm": 0.34277746081352234, + "learning_rate": 0.00017392941176470588, + "loss": 0.2815, + "step": 3575 + }, + { + "epoch": 210.59, + "grad_norm": 0.2821712791919708, + "learning_rate": 0.00017375294117647058, + "loss": 0.2661, + "step": 3580 + }, + { + "epoch": 210.59, + "eval_loss": 0.2685214579105377, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.672, + "eval_steps_per_second": 8.712, + "step": 3580 + }, + { + "epoch": 210.88, + "grad_norm": 0.27887848019599915, + "learning_rate": 0.00017357647058823528, + "loss": 0.2741, + "step": 3585 + }, + { + "epoch": 211.18, + "grad_norm": 0.29557564854621887, + "learning_rate": 0.00017339999999999996, + "loss": 0.2869, + "step": 3590 + }, + { + "epoch": 211.47, + "grad_norm": 0.3319908678531647, + "learning_rate": 0.00017322352941176472, + "loss": 0.2692, + "step": 3595 + }, + { + "epoch": 211.76, + "grad_norm": 0.3847210109233856, + "learning_rate": 0.0001730470588235294, + "loss": 0.2979, + "step": 3600 + }, + { + "epoch": 211.76, + "eval_loss": 0.26853594183921814, + "eval_runtime": 1.9555, + "eval_samples_per_second": 68.526, + "eval_steps_per_second": 8.694, + "step": 3600 + }, + { + "epoch": 212.06, + "grad_norm": 0.36667969822883606, + "learning_rate": 0.0001728705882352941, + "loss": 0.2477, + "step": 3605 + }, + { + "epoch": 212.35, + "grad_norm": 0.36953258514404297, + "learning_rate": 0.0001726941176470588, + "loss": 0.2706, + "step": 3610 + }, + { + "epoch": 212.65, + "grad_norm": 0.2699277997016907, + "learning_rate": 0.0001725176470588235, + "loss": 0.272, + "step": 3615 + }, + { + "epoch": 212.94, + "grad_norm": 0.2878658175468445, + "learning_rate": 0.0001723411764705882, + "loss": 0.288, + "step": 3620 + }, + { + "epoch": 212.94, + "eval_loss": 0.2682044208049774, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.656, + "eval_steps_per_second": 8.71, + "step": 3620 + }, + { + "epoch": 213.24, + "grad_norm": 0.42965853214263916, + "learning_rate": 0.00017216470588235294, + "loss": 0.2767, + "step": 3625 + }, + { + "epoch": 213.53, + "grad_norm": 0.34661805629730225, + "learning_rate": 0.00017198823529411764, + "loss": 0.2661, + "step": 3630 + }, + { + "epoch": 213.82, + "grad_norm": 0.36816754937171936, + "learning_rate": 0.00017181176470588234, + "loss": 0.2792, + "step": 3635 + }, + { + "epoch": 214.12, + "grad_norm": 0.29683583974838257, + "learning_rate": 0.00017163529411764705, + "loss": 0.2801, + "step": 3640 + }, + { + "epoch": 214.12, + "eval_loss": 0.26800212264060974, + "eval_runtime": 1.9544, + "eval_samples_per_second": 68.563, + "eval_steps_per_second": 8.698, + "step": 3640 + }, + { + "epoch": 214.41, + "grad_norm": 0.40612170100212097, + "learning_rate": 0.00017145882352941175, + "loss": 0.2731, + "step": 3645 + }, + { + "epoch": 214.71, + "grad_norm": 0.32734382152557373, + "learning_rate": 0.00017128235294117648, + "loss": 0.2724, + "step": 3650 + }, + { + "epoch": 215.0, + "grad_norm": 0.46737852692604065, + "learning_rate": 0.00017110588235294118, + "loss": 0.2929, + "step": 3655 + }, + { + "epoch": 215.29, + "grad_norm": 0.36513301730155945, + "learning_rate": 0.00017092941176470586, + "loss": 0.2785, + "step": 3660 + }, + { + "epoch": 215.29, + "eval_loss": 0.26821431517601013, + "eval_runtime": 1.9535, + "eval_samples_per_second": 68.595, + "eval_steps_per_second": 8.702, + "step": 3660 + }, + { + "epoch": 215.59, + "grad_norm": 0.41340088844299316, + "learning_rate": 0.00017075294117647056, + "loss": 0.2733, + "step": 3665 + }, + { + "epoch": 215.88, + "grad_norm": 0.3719697892665863, + "learning_rate": 0.00017057647058823526, + "loss": 0.2669, + "step": 3670 + }, + { + "epoch": 216.18, + "grad_norm": 0.36545512080192566, + "learning_rate": 0.00017039999999999997, + "loss": 0.2717, + "step": 3675 + }, + { + "epoch": 216.47, + "grad_norm": 0.37880271673202515, + "learning_rate": 0.0001702235294117647, + "loss": 0.2644, + "step": 3680 + }, + { + "epoch": 216.47, + "eval_loss": 0.2686866223812103, + "eval_runtime": 1.9529, + "eval_samples_per_second": 68.617, + "eval_steps_per_second": 8.705, + "step": 3680 + }, + { + "epoch": 216.76, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0001700470588235294, + "loss": 0.3004, + "step": 3685 + }, + { + "epoch": 217.06, + "grad_norm": 0.4460364282131195, + "learning_rate": 0.0001698705882352941, + "loss": 0.2844, + "step": 3690 + }, + { + "epoch": 217.35, + "grad_norm": 0.32115277647972107, + "learning_rate": 0.0001696941176470588, + "loss": 0.2757, + "step": 3695 + }, + { + "epoch": 217.65, + "grad_norm": 0.2927345335483551, + "learning_rate": 0.0001695176470588235, + "loss": 0.2742, + "step": 3700 + }, + { + "epoch": 217.65, + "eval_loss": 0.2689415216445923, + "eval_runtime": 1.9568, + "eval_samples_per_second": 68.48, + "eval_steps_per_second": 8.688, + "step": 3700 + }, + { + "epoch": 217.94, + "grad_norm": 0.39260584115982056, + "learning_rate": 0.00016934117647058824, + "loss": 0.2775, + "step": 3705 + }, + { + "epoch": 218.24, + "grad_norm": 0.32040220499038696, + "learning_rate": 0.00016916470588235294, + "loss": 0.2673, + "step": 3710 + }, + { + "epoch": 218.53, + "grad_norm": 0.37890148162841797, + "learning_rate": 0.00016898823529411765, + "loss": 0.2683, + "step": 3715 + }, + { + "epoch": 218.82, + "grad_norm": 0.5056920647621155, + "learning_rate": 0.00016881176470588232, + "loss": 0.2881, + "step": 3720 + }, + { + "epoch": 218.82, + "eval_loss": 0.2683950364589691, + "eval_runtime": 1.9564, + "eval_samples_per_second": 68.493, + "eval_steps_per_second": 8.689, + "step": 3720 + }, + { + "epoch": 219.12, + "grad_norm": 0.36240556836128235, + "learning_rate": 0.00016863529411764703, + "loss": 0.3038, + "step": 3725 + }, + { + "epoch": 219.41, + "grad_norm": 0.3958339989185333, + "learning_rate": 0.00016845882352941173, + "loss": 0.2678, + "step": 3730 + }, + { + "epoch": 219.71, + "grad_norm": 0.34904569387435913, + "learning_rate": 0.00016828235294117646, + "loss": 0.2653, + "step": 3735 + }, + { + "epoch": 220.0, + "grad_norm": 0.5768289566040039, + "learning_rate": 0.00016810588235294116, + "loss": 0.277, + "step": 3740 + }, + { + "epoch": 220.0, + "eval_loss": 0.2686716914176941, + "eval_runtime": 1.9557, + "eval_samples_per_second": 68.517, + "eval_steps_per_second": 8.692, + "step": 3740 + }, + { + "epoch": 220.29, + "grad_norm": 0.3382350504398346, + "learning_rate": 0.00016792941176470587, + "loss": 0.258, + "step": 3745 + }, + { + "epoch": 220.59, + "grad_norm": 0.3434382677078247, + "learning_rate": 0.00016775294117647057, + "loss": 0.2855, + "step": 3750 + }, + { + "epoch": 220.88, + "grad_norm": 0.41246363520622253, + "learning_rate": 0.00016757647058823527, + "loss": 0.279, + "step": 3755 + }, + { + "epoch": 221.18, + "grad_norm": 0.36319679021835327, + "learning_rate": 0.0001674, + "loss": 0.2667, + "step": 3760 + }, + { + "epoch": 221.18, + "eval_loss": 0.26780644059181213, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.619, + "eval_steps_per_second": 8.705, + "step": 3760 + }, + { + "epoch": 221.47, + "grad_norm": 0.3442753851413727, + "learning_rate": 0.0001672235294117647, + "loss": 0.2834, + "step": 3765 + }, + { + "epoch": 221.76, + "grad_norm": 0.3269500732421875, + "learning_rate": 0.0001670470588235294, + "loss": 0.2643, + "step": 3770 + }, + { + "epoch": 222.06, + "grad_norm": 0.3817020058631897, + "learning_rate": 0.0001668705882352941, + "loss": 0.2915, + "step": 3775 + }, + { + "epoch": 222.35, + "grad_norm": 0.3752339780330658, + "learning_rate": 0.0001666941176470588, + "loss": 0.273, + "step": 3780 + }, + { + "epoch": 222.35, + "eval_loss": 0.2685173451900482, + "eval_runtime": 1.9529, + "eval_samples_per_second": 68.616, + "eval_steps_per_second": 8.705, + "step": 3780 + }, + { + "epoch": 222.65, + "grad_norm": 0.3506356477737427, + "learning_rate": 0.0001665176470588235, + "loss": 0.264, + "step": 3785 + }, + { + "epoch": 222.94, + "grad_norm": 0.35637035965919495, + "learning_rate": 0.00016634117647058822, + "loss": 0.2802, + "step": 3790 + }, + { + "epoch": 223.24, + "grad_norm": 0.2785859704017639, + "learning_rate": 0.00016616470588235293, + "loss": 0.3024, + "step": 3795 + }, + { + "epoch": 223.53, + "grad_norm": 0.2919442653656006, + "learning_rate": 0.00016598823529411763, + "loss": 0.2599, + "step": 3800 + }, + { + "epoch": 223.53, + "eval_loss": 0.26857081055641174, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.622, + "eval_steps_per_second": 8.706, + "step": 3800 + }, + { + "epoch": 223.82, + "grad_norm": 0.3735847473144531, + "learning_rate": 0.00016581176470588233, + "loss": 0.2784, + "step": 3805 + }, + { + "epoch": 224.12, + "grad_norm": 0.415211945772171, + "learning_rate": 0.00016563529411764704, + "loss": 0.28, + "step": 3810 + }, + { + "epoch": 224.41, + "grad_norm": 0.2927432656288147, + "learning_rate": 0.00016545882352941177, + "loss": 0.2666, + "step": 3815 + }, + { + "epoch": 224.71, + "grad_norm": 0.37965652346611023, + "learning_rate": 0.00016528235294117647, + "loss": 0.2803, + "step": 3820 + }, + { + "epoch": 224.71, + "eval_loss": 0.2682299315929413, + "eval_runtime": 1.9538, + "eval_samples_per_second": 68.584, + "eval_steps_per_second": 8.701, + "step": 3820 + }, + { + "epoch": 225.0, + "grad_norm": 0.6623125076293945, + "learning_rate": 0.00016510588235294117, + "loss": 0.2767, + "step": 3825 + }, + { + "epoch": 225.29, + "grad_norm": 0.3642768859863281, + "learning_rate": 0.00016492941176470587, + "loss": 0.2655, + "step": 3830 + }, + { + "epoch": 225.59, + "grad_norm": 0.4541524052619934, + "learning_rate": 0.00016475294117647058, + "loss": 0.2711, + "step": 3835 + }, + { + "epoch": 225.88, + "grad_norm": 0.5072983503341675, + "learning_rate": 0.00016457647058823525, + "loss": 0.2854, + "step": 3840 + }, + { + "epoch": 225.88, + "eval_loss": 0.2686764597892761, + "eval_runtime": 1.9496, + "eval_samples_per_second": 68.734, + "eval_steps_per_second": 8.72, + "step": 3840 + }, + { + "epoch": 226.18, + "grad_norm": 0.4078729748725891, + "learning_rate": 0.0001644, + "loss": 0.2899, + "step": 3845 + }, + { + "epoch": 226.47, + "grad_norm": 0.40589746832847595, + "learning_rate": 0.0001642235294117647, + "loss": 0.2902, + "step": 3850 + }, + { + "epoch": 226.76, + "grad_norm": 0.31240221858024597, + "learning_rate": 0.0001640470588235294, + "loss": 0.2518, + "step": 3855 + }, + { + "epoch": 227.06, + "grad_norm": 0.3382233679294586, + "learning_rate": 0.0001638705882352941, + "loss": 0.2852, + "step": 3860 + }, + { + "epoch": 227.06, + "eval_loss": 0.27595362067222595, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.647, + "eval_steps_per_second": 8.709, + "step": 3860 + }, + { + "epoch": 227.35, + "grad_norm": 0.3547080457210541, + "learning_rate": 0.0001636941176470588, + "loss": 0.2985, + "step": 3865 + }, + { + "epoch": 227.65, + "grad_norm": 2.6822640895843506, + "learning_rate": 0.00016351764705882353, + "loss": 0.3033, + "step": 3870 + }, + { + "epoch": 227.94, + "grad_norm": 1.6330982446670532, + "learning_rate": 0.00016334117647058823, + "loss": 0.3005, + "step": 3875 + }, + { + "epoch": 228.24, + "grad_norm": 3.304569959640503, + "learning_rate": 0.00016316470588235293, + "loss": 0.3426, + "step": 3880 + }, + { + "epoch": 228.24, + "eval_loss": 0.3442763090133667, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.679, + "eval_steps_per_second": 8.713, + "step": 3880 + }, + { + "epoch": 228.53, + "grad_norm": 1.643598198890686, + "learning_rate": 0.00016298823529411764, + "loss": 0.3721, + "step": 3885 + }, + { + "epoch": 228.82, + "grad_norm": 1.8132648468017578, + "learning_rate": 0.00016281176470588234, + "loss": 0.3559, + "step": 3890 + }, + { + "epoch": 229.12, + "grad_norm": 3.622833013534546, + "learning_rate": 0.00016263529411764704, + "loss": 0.363, + "step": 3895 + }, + { + "epoch": 229.41, + "grad_norm": 5.5960564613342285, + "learning_rate": 0.00016245882352941177, + "loss": 0.342, + "step": 3900 + }, + { + "epoch": 229.41, + "eval_loss": 0.33262258768081665, + "eval_runtime": 1.9751, + "eval_samples_per_second": 67.844, + "eval_steps_per_second": 8.607, + "step": 3900 + }, + { + "epoch": 229.71, + "grad_norm": 1.150291085243225, + "learning_rate": 0.00016228235294117645, + "loss": 0.3253, + "step": 3905 + }, + { + "epoch": 230.0, + "grad_norm": 3.6979823112487793, + "learning_rate": 0.00016210588235294115, + "loss": 0.3884, + "step": 3910 + }, + { + "epoch": 230.29, + "grad_norm": 0.41416049003601074, + "learning_rate": 0.00016192941176470586, + "loss": 0.3236, + "step": 3915 + }, + { + "epoch": 230.59, + "grad_norm": 4.2704386711120605, + "learning_rate": 0.00016175294117647056, + "loss": 0.3379, + "step": 3920 + }, + { + "epoch": 230.59, + "eval_loss": 0.2907845675945282, + "eval_runtime": 1.9526, + "eval_samples_per_second": 68.627, + "eval_steps_per_second": 8.706, + "step": 3920 + }, + { + "epoch": 230.88, + "grad_norm": 1.291405200958252, + "learning_rate": 0.0001615764705882353, + "loss": 0.2977, + "step": 3925 + }, + { + "epoch": 231.18, + "grad_norm": 0.3919481337070465, + "learning_rate": 0.0001614, + "loss": 0.3143, + "step": 3930 + }, + { + "epoch": 231.47, + "grad_norm": 1.307499885559082, + "learning_rate": 0.0001612235294117647, + "loss": 0.2855, + "step": 3935 + }, + { + "epoch": 231.76, + "grad_norm": 0.5394136309623718, + "learning_rate": 0.0001610470588235294, + "loss": 0.3207, + "step": 3940 + }, + { + "epoch": 231.76, + "eval_loss": 0.279965877532959, + "eval_runtime": 1.9505, + "eval_samples_per_second": 68.701, + "eval_steps_per_second": 8.716, + "step": 3940 + }, + { + "epoch": 232.06, + "grad_norm": 0.7163982391357422, + "learning_rate": 0.0001608705882352941, + "loss": 0.2823, + "step": 3945 + }, + { + "epoch": 232.35, + "grad_norm": 3.876972198486328, + "learning_rate": 0.0001606941176470588, + "loss": 0.3007, + "step": 3950 + }, + { + "epoch": 232.65, + "grad_norm": 0.5939856171607971, + "learning_rate": 0.00016051764705882354, + "loss": 0.3048, + "step": 3955 + }, + { + "epoch": 232.94, + "grad_norm": 0.44169408082962036, + "learning_rate": 0.00016034117647058824, + "loss": 0.2839, + "step": 3960 + }, + { + "epoch": 232.94, + "eval_loss": 0.2800130546092987, + "eval_runtime": 1.9501, + "eval_samples_per_second": 68.714, + "eval_steps_per_second": 8.717, + "step": 3960 + }, + { + "epoch": 233.24, + "grad_norm": 0.6114098429679871, + "learning_rate": 0.00016016470588235292, + "loss": 0.2999, + "step": 3965 + }, + { + "epoch": 233.53, + "grad_norm": 1.1349961757659912, + "learning_rate": 0.00015998823529411762, + "loss": 0.3082, + "step": 3970 + }, + { + "epoch": 233.82, + "grad_norm": 0.5436539053916931, + "learning_rate": 0.00015981176470588232, + "loss": 0.2815, + "step": 3975 + }, + { + "epoch": 234.12, + "grad_norm": 1.0354223251342773, + "learning_rate": 0.00015963529411764705, + "loss": 0.3037, + "step": 3980 + }, + { + "epoch": 234.12, + "eval_loss": 0.27129727602005005, + "eval_runtime": 1.9587, + "eval_samples_per_second": 68.411, + "eval_steps_per_second": 8.679, + "step": 3980 + }, + { + "epoch": 234.41, + "grad_norm": 0.38999831676483154, + "learning_rate": 0.00015945882352941176, + "loss": 0.2739, + "step": 3985 + }, + { + "epoch": 234.71, + "grad_norm": 0.38337576389312744, + "learning_rate": 0.00015928235294117646, + "loss": 0.2765, + "step": 3990 + }, + { + "epoch": 235.0, + "grad_norm": 0.46007809042930603, + "learning_rate": 0.00015910588235294116, + "loss": 0.2835, + "step": 3995 + }, + { + "epoch": 235.29, + "grad_norm": 0.5706712007522583, + "learning_rate": 0.00015892941176470586, + "loss": 0.2788, + "step": 4000 + }, + { + "epoch": 235.29, + "eval_loss": 0.2861948609352112, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.655, + "eval_steps_per_second": 8.71, + "step": 4000 + }, + { + "epoch": 235.59, + "grad_norm": 1.2941205501556396, + "learning_rate": 0.00015875294117647057, + "loss": 0.3055, + "step": 4005 + }, + { + "epoch": 235.88, + "grad_norm": 0.5581958889961243, + "learning_rate": 0.0001585764705882353, + "loss": 0.2791, + "step": 4010 + }, + { + "epoch": 236.18, + "grad_norm": 0.3634546399116516, + "learning_rate": 0.0001584, + "loss": 0.271, + "step": 4015 + }, + { + "epoch": 236.47, + "grad_norm": 3.101717948913574, + "learning_rate": 0.0001582235294117647, + "loss": 0.2788, + "step": 4020 + }, + { + "epoch": 236.47, + "eval_loss": 0.2718346118927002, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.693, + "eval_steps_per_second": 8.715, + "step": 4020 + }, + { + "epoch": 236.76, + "grad_norm": 0.44929930567741394, + "learning_rate": 0.00015804705882352938, + "loss": 0.2868, + "step": 4025 + }, + { + "epoch": 237.06, + "grad_norm": 0.435112327337265, + "learning_rate": 0.00015787058823529408, + "loss": 0.2892, + "step": 4030 + }, + { + "epoch": 237.35, + "grad_norm": 0.37171122431755066, + "learning_rate": 0.00015769411764705881, + "loss": 0.2775, + "step": 4035 + }, + { + "epoch": 237.65, + "grad_norm": 0.5312927961349487, + "learning_rate": 0.00015751764705882352, + "loss": 0.2754, + "step": 4040 + }, + { + "epoch": 237.65, + "eval_loss": 0.27094125747680664, + "eval_runtime": 1.9495, + "eval_samples_per_second": 68.735, + "eval_steps_per_second": 8.72, + "step": 4040 + }, + { + "epoch": 237.94, + "grad_norm": 0.3910554349422455, + "learning_rate": 0.00015734117647058822, + "loss": 0.282, + "step": 4045 + }, + { + "epoch": 238.24, + "grad_norm": 0.3710722327232361, + "learning_rate": 0.00015716470588235292, + "loss": 0.2715, + "step": 4050 + }, + { + "epoch": 238.53, + "grad_norm": 0.4959905445575714, + "learning_rate": 0.00015698823529411763, + "loss": 0.289, + "step": 4055 + }, + { + "epoch": 238.82, + "grad_norm": 0.4599302113056183, + "learning_rate": 0.00015681176470588233, + "loss": 0.2734, + "step": 4060 + }, + { + "epoch": 238.82, + "eval_loss": 0.2697753310203552, + "eval_runtime": 1.9505, + "eval_samples_per_second": 68.7, + "eval_steps_per_second": 8.716, + "step": 4060 + }, + { + "epoch": 239.12, + "grad_norm": 0.34432175755500793, + "learning_rate": 0.00015663529411764706, + "loss": 0.2722, + "step": 4065 + }, + { + "epoch": 239.41, + "grad_norm": 0.8361882567405701, + "learning_rate": 0.00015645882352941176, + "loss": 0.2647, + "step": 4070 + }, + { + "epoch": 239.71, + "grad_norm": 0.37892064452171326, + "learning_rate": 0.00015628235294117647, + "loss": 0.2821, + "step": 4075 + }, + { + "epoch": 240.0, + "grad_norm": 0.608864963054657, + "learning_rate": 0.00015610588235294117, + "loss": 0.3078, + "step": 4080 + }, + { + "epoch": 240.0, + "eval_loss": 0.2691473662853241, + "eval_runtime": 1.9499, + "eval_samples_per_second": 68.721, + "eval_steps_per_second": 8.718, + "step": 4080 + }, + { + "epoch": 240.29, + "grad_norm": 0.4258846342563629, + "learning_rate": 0.00015592941176470585, + "loss": 0.2823, + "step": 4085 + }, + { + "epoch": 240.59, + "grad_norm": 0.27903735637664795, + "learning_rate": 0.00015575294117647055, + "loss": 0.2555, + "step": 4090 + }, + { + "epoch": 240.88, + "grad_norm": 0.4321545362472534, + "learning_rate": 0.00015557647058823528, + "loss": 0.2834, + "step": 4095 + }, + { + "epoch": 241.18, + "grad_norm": 0.35063254833221436, + "learning_rate": 0.00015539999999999998, + "loss": 0.2721, + "step": 4100 + }, + { + "epoch": 241.18, + "eval_loss": 0.2684215009212494, + "eval_runtime": 1.9538, + "eval_samples_per_second": 68.586, + "eval_steps_per_second": 8.701, + "step": 4100 + }, + { + "epoch": 241.47, + "grad_norm": 0.3060368001461029, + "learning_rate": 0.00015522352941176469, + "loss": 0.2728, + "step": 4105 + }, + { + "epoch": 241.76, + "grad_norm": 0.46749743819236755, + "learning_rate": 0.0001550470588235294, + "loss": 0.2795, + "step": 4110 + }, + { + "epoch": 242.06, + "grad_norm": 0.4130377471446991, + "learning_rate": 0.0001548705882352941, + "loss": 0.2887, + "step": 4115 + }, + { + "epoch": 242.35, + "grad_norm": 0.3679191768169403, + "learning_rate": 0.00015469411764705882, + "loss": 0.2685, + "step": 4120 + }, + { + "epoch": 242.35, + "eval_loss": 0.26844125986099243, + "eval_runtime": 1.9522, + "eval_samples_per_second": 68.64, + "eval_steps_per_second": 8.708, + "step": 4120 + }, + { + "epoch": 242.65, + "grad_norm": 0.32408028841018677, + "learning_rate": 0.00015451764705882353, + "loss": 0.2868, + "step": 4125 + }, + { + "epoch": 242.94, + "grad_norm": 0.3026672303676605, + "learning_rate": 0.00015434117647058823, + "loss": 0.271, + "step": 4130 + }, + { + "epoch": 243.24, + "grad_norm": 0.3329572081565857, + "learning_rate": 0.00015416470588235293, + "loss": 0.2601, + "step": 4135 + }, + { + "epoch": 243.53, + "grad_norm": 0.3301178812980652, + "learning_rate": 0.00015398823529411764, + "loss": 0.2621, + "step": 4140 + }, + { + "epoch": 243.53, + "eval_loss": 0.26796454191207886, + "eval_runtime": 1.9505, + "eval_samples_per_second": 68.701, + "eval_steps_per_second": 8.716, + "step": 4140 + }, + { + "epoch": 243.82, + "grad_norm": 0.4053894281387329, + "learning_rate": 0.0001538117647058823, + "loss": 0.2885, + "step": 4145 + }, + { + "epoch": 244.12, + "grad_norm": 0.3042493462562561, + "learning_rate": 0.00015363529411764707, + "loss": 0.2833, + "step": 4150 + }, + { + "epoch": 244.41, + "grad_norm": 0.3672405481338501, + "learning_rate": 0.00015345882352941174, + "loss": 0.2586, + "step": 4155 + }, + { + "epoch": 244.71, + "grad_norm": 0.41404080390930176, + "learning_rate": 0.00015328235294117645, + "loss": 0.2842, + "step": 4160 + }, + { + "epoch": 244.71, + "eval_loss": 0.2685539424419403, + "eval_runtime": 1.9498, + "eval_samples_per_second": 68.727, + "eval_steps_per_second": 8.719, + "step": 4160 + }, + { + "epoch": 245.0, + "grad_norm": 0.5190454125404358, + "learning_rate": 0.00015310588235294115, + "loss": 0.2853, + "step": 4165 + }, + { + "epoch": 245.29, + "grad_norm": 0.3695441782474518, + "learning_rate": 0.00015292941176470585, + "loss": 0.2843, + "step": 4170 + }, + { + "epoch": 245.59, + "grad_norm": 0.458651602268219, + "learning_rate": 0.00015275294117647058, + "loss": 0.2732, + "step": 4175 + }, + { + "epoch": 245.88, + "grad_norm": 0.3108579218387604, + "learning_rate": 0.0001525764705882353, + "loss": 0.2713, + "step": 4180 + }, + { + "epoch": 245.88, + "eval_loss": 0.2684222459793091, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.668, + "eval_steps_per_second": 8.712, + "step": 4180 + }, + { + "epoch": 246.18, + "grad_norm": 0.32316526770591736, + "learning_rate": 0.0001524, + "loss": 0.2593, + "step": 4185 + }, + { + "epoch": 246.47, + "grad_norm": 0.3795959949493408, + "learning_rate": 0.0001522235294117647, + "loss": 0.2648, + "step": 4190 + }, + { + "epoch": 246.76, + "grad_norm": 0.34454479813575745, + "learning_rate": 0.0001520470588235294, + "loss": 0.2948, + "step": 4195 + }, + { + "epoch": 247.06, + "grad_norm": 0.37235042452812195, + "learning_rate": 0.0001518705882352941, + "loss": 0.2734, + "step": 4200 + }, + { + "epoch": 247.06, + "eval_loss": 0.26809829473495483, + "eval_runtime": 1.9493, + "eval_samples_per_second": 68.741, + "eval_steps_per_second": 8.721, + "step": 4200 + }, + { + "epoch": 247.35, + "grad_norm": 0.45896175503730774, + "learning_rate": 0.00015169411764705883, + "loss": 0.2972, + "step": 4205 + }, + { + "epoch": 247.65, + "grad_norm": 0.37833425402641296, + "learning_rate": 0.00015151764705882353, + "loss": 0.2575, + "step": 4210 + }, + { + "epoch": 247.94, + "grad_norm": 0.3599943518638611, + "learning_rate": 0.0001513411764705882, + "loss": 0.2639, + "step": 4215 + }, + { + "epoch": 248.24, + "grad_norm": 0.35436397790908813, + "learning_rate": 0.0001511647058823529, + "loss": 0.2574, + "step": 4220 + }, + { + "epoch": 248.24, + "eval_loss": 0.2679005265235901, + "eval_runtime": 1.9531, + "eval_samples_per_second": 68.611, + "eval_steps_per_second": 8.704, + "step": 4220 + }, + { + "epoch": 248.53, + "grad_norm": 0.3590777516365051, + "learning_rate": 0.00015098823529411762, + "loss": 0.274, + "step": 4225 + }, + { + "epoch": 248.82, + "grad_norm": 0.36425745487213135, + "learning_rate": 0.00015081176470588235, + "loss": 0.2832, + "step": 4230 + }, + { + "epoch": 249.12, + "grad_norm": 0.3374358117580414, + "learning_rate": 0.00015063529411764705, + "loss": 0.2803, + "step": 4235 + }, + { + "epoch": 249.41, + "grad_norm": 0.33885693550109863, + "learning_rate": 0.00015045882352941175, + "loss": 0.2609, + "step": 4240 + }, + { + "epoch": 249.41, + "eval_loss": 0.2679958641529083, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.65, + "eval_steps_per_second": 8.709, + "step": 4240 + }, + { + "epoch": 249.71, + "grad_norm": 0.3161369860172272, + "learning_rate": 0.00015028235294117646, + "loss": 0.3055, + "step": 4245 + }, + { + "epoch": 250.0, + "grad_norm": 0.37477099895477295, + "learning_rate": 0.00015010588235294116, + "loss": 0.2597, + "step": 4250 + }, + { + "epoch": 250.29, + "grad_norm": 0.40914347767829895, + "learning_rate": 0.00014992941176470586, + "loss": 0.261, + "step": 4255 + }, + { + "epoch": 250.59, + "grad_norm": 0.3606330156326294, + "learning_rate": 0.00014975294117647057, + "loss": 0.2749, + "step": 4260 + }, + { + "epoch": 250.59, + "eval_loss": 0.26829561591148376, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.657, + "eval_steps_per_second": 8.71, + "step": 4260 + }, + { + "epoch": 250.88, + "grad_norm": 0.3115130066871643, + "learning_rate": 0.0001495764705882353, + "loss": 0.2853, + "step": 4265 + }, + { + "epoch": 251.18, + "grad_norm": 0.3454064130783081, + "learning_rate": 0.0001494, + "loss": 0.2753, + "step": 4270 + }, + { + "epoch": 251.47, + "grad_norm": 0.41166913509368896, + "learning_rate": 0.00014922352941176468, + "loss": 0.2779, + "step": 4275 + }, + { + "epoch": 251.76, + "grad_norm": 0.34331214427948, + "learning_rate": 0.0001490470588235294, + "loss": 0.2679, + "step": 4280 + }, + { + "epoch": 251.76, + "eval_loss": 0.26898276805877686, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.691, + "eval_steps_per_second": 8.715, + "step": 4280 + }, + { + "epoch": 252.06, + "grad_norm": 0.3516402840614319, + "learning_rate": 0.0001488705882352941, + "loss": 0.2818, + "step": 4285 + }, + { + "epoch": 252.35, + "grad_norm": 0.4438072741031647, + "learning_rate": 0.0001486941176470588, + "loss": 0.2718, + "step": 4290 + }, + { + "epoch": 252.65, + "grad_norm": 0.2938256561756134, + "learning_rate": 0.00014851764705882352, + "loss": 0.2578, + "step": 4295 + }, + { + "epoch": 252.94, + "grad_norm": 0.3824418783187866, + "learning_rate": 0.00014834117647058822, + "loss": 0.2879, + "step": 4300 + }, + { + "epoch": 252.94, + "eval_loss": 0.2682475745677948, + "eval_runtime": 1.9547, + "eval_samples_per_second": 68.554, + "eval_steps_per_second": 8.697, + "step": 4300 + }, + { + "epoch": 253.24, + "grad_norm": 0.3058859705924988, + "learning_rate": 0.00014816470588235295, + "loss": 0.2682, + "step": 4305 + }, + { + "epoch": 253.53, + "grad_norm": 0.3370761573314667, + "learning_rate": 0.00014798823529411762, + "loss": 0.269, + "step": 4310 + }, + { + "epoch": 253.82, + "grad_norm": 0.3609977960586548, + "learning_rate": 0.00014781176470588233, + "loss": 0.2879, + "step": 4315 + }, + { + "epoch": 254.12, + "grad_norm": 0.26350536942481995, + "learning_rate": 0.00014763529411764706, + "loss": 0.2696, + "step": 4320 + }, + { + "epoch": 254.12, + "eval_loss": 0.26790598034858704, + "eval_runtime": 1.9514, + "eval_samples_per_second": 68.667, + "eval_steps_per_second": 8.712, + "step": 4320 + }, + { + "epoch": 254.41, + "grad_norm": 0.43606036901474, + "learning_rate": 0.00014745882352941176, + "loss": 0.2745, + "step": 4325 + }, + { + "epoch": 254.71, + "grad_norm": 0.3826625645160675, + "learning_rate": 0.00014728235294117646, + "loss": 0.2848, + "step": 4330 + }, + { + "epoch": 255.0, + "grad_norm": 0.5310951471328735, + "learning_rate": 0.00014710588235294117, + "loss": 0.283, + "step": 4335 + }, + { + "epoch": 255.29, + "grad_norm": 0.3799887001514435, + "learning_rate": 0.00014692941176470587, + "loss": 0.2679, + "step": 4340 + }, + { + "epoch": 255.29, + "eval_loss": 0.2680792212486267, + "eval_runtime": 1.9534, + "eval_samples_per_second": 68.597, + "eval_steps_per_second": 8.703, + "step": 4340 + }, + { + "epoch": 255.59, + "grad_norm": 0.34268835186958313, + "learning_rate": 0.00014675294117647057, + "loss": 0.269, + "step": 4345 + }, + { + "epoch": 255.88, + "grad_norm": 0.3309810161590576, + "learning_rate": 0.00014657647058823528, + "loss": 0.2793, + "step": 4350 + }, + { + "epoch": 256.18, + "grad_norm": 0.36746713519096375, + "learning_rate": 0.00014639999999999998, + "loss": 0.2853, + "step": 4355 + }, + { + "epoch": 256.47, + "grad_norm": 0.34218254685401917, + "learning_rate": 0.0001462235294117647, + "loss": 0.2748, + "step": 4360 + }, + { + "epoch": 256.47, + "eval_loss": 0.26810911297798157, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.671, + "eval_steps_per_second": 8.712, + "step": 4360 + }, + { + "epoch": 256.76, + "grad_norm": 0.41486817598342896, + "learning_rate": 0.0001460470588235294, + "loss": 0.2803, + "step": 4365 + }, + { + "epoch": 257.06, + "grad_norm": 0.42362910509109497, + "learning_rate": 0.0001458705882352941, + "loss": 0.2823, + "step": 4370 + }, + { + "epoch": 257.35, + "grad_norm": 0.4304226338863373, + "learning_rate": 0.00014569411764705882, + "loss": 0.278, + "step": 4375 + }, + { + "epoch": 257.65, + "grad_norm": 0.3646482229232788, + "learning_rate": 0.00014551764705882352, + "loss": 0.2813, + "step": 4380 + }, + { + "epoch": 257.65, + "eval_loss": 0.2688765525817871, + "eval_runtime": 1.95, + "eval_samples_per_second": 68.716, + "eval_steps_per_second": 8.718, + "step": 4380 + }, + { + "epoch": 257.94, + "grad_norm": 0.4169907569885254, + "learning_rate": 0.00014534117647058823, + "loss": 0.2665, + "step": 4385 + }, + { + "epoch": 258.24, + "grad_norm": 0.3664630651473999, + "learning_rate": 0.00014516470588235293, + "loss": 0.2752, + "step": 4390 + }, + { + "epoch": 258.53, + "grad_norm": 0.35448622703552246, + "learning_rate": 0.00014498823529411763, + "loss": 0.2669, + "step": 4395 + }, + { + "epoch": 258.82, + "grad_norm": 0.3023925721645355, + "learning_rate": 0.00014481176470588234, + "loss": 0.2727, + "step": 4400 + }, + { + "epoch": 258.82, + "eval_loss": 0.2685912847518921, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.682, + "eval_steps_per_second": 8.713, + "step": 4400 + }, + { + "epoch": 259.12, + "grad_norm": 0.3286932706832886, + "learning_rate": 0.00014463529411764704, + "loss": 0.2714, + "step": 4405 + }, + { + "epoch": 259.41, + "grad_norm": 0.3822152018547058, + "learning_rate": 0.00014445882352941174, + "loss": 0.2797, + "step": 4410 + }, + { + "epoch": 259.71, + "grad_norm": 0.3247034549713135, + "learning_rate": 0.00014428235294117647, + "loss": 0.2698, + "step": 4415 + }, + { + "epoch": 260.0, + "grad_norm": 0.3092283606529236, + "learning_rate": 0.00014410588235294118, + "loss": 0.2695, + "step": 4420 + }, + { + "epoch": 260.0, + "eval_loss": 0.268071711063385, + "eval_runtime": 1.9553, + "eval_samples_per_second": 68.532, + "eval_steps_per_second": 8.694, + "step": 4420 + }, + { + "epoch": 260.29, + "grad_norm": 0.31391987204551697, + "learning_rate": 0.00014392941176470585, + "loss": 0.2524, + "step": 4425 + }, + { + "epoch": 260.59, + "grad_norm": 0.4478277564048767, + "learning_rate": 0.00014375294117647058, + "loss": 0.2932, + "step": 4430 + }, + { + "epoch": 260.88, + "grad_norm": 0.3896423280239105, + "learning_rate": 0.00014357647058823529, + "loss": 0.2721, + "step": 4435 + }, + { + "epoch": 261.18, + "grad_norm": 0.38531139492988586, + "learning_rate": 0.0001434, + "loss": 0.2783, + "step": 4440 + }, + { + "epoch": 261.18, + "eval_loss": 0.26778537034988403, + "eval_runtime": 1.9729, + "eval_samples_per_second": 67.921, + "eval_steps_per_second": 8.617, + "step": 4440 + }, + { + "epoch": 261.47, + "grad_norm": 0.3232022225856781, + "learning_rate": 0.0001432235294117647, + "loss": 0.2672, + "step": 4445 + }, + { + "epoch": 261.76, + "grad_norm": 0.30753687024116516, + "learning_rate": 0.0001430470588235294, + "loss": 0.2707, + "step": 4450 + }, + { + "epoch": 262.06, + "grad_norm": 0.5235870480537415, + "learning_rate": 0.0001428705882352941, + "loss": 0.2897, + "step": 4455 + }, + { + "epoch": 262.35, + "grad_norm": 0.3308814764022827, + "learning_rate": 0.0001426941176470588, + "loss": 0.2691, + "step": 4460 + }, + { + "epoch": 262.35, + "eval_loss": 0.2677620053291321, + "eval_runtime": 1.953, + "eval_samples_per_second": 68.614, + "eval_steps_per_second": 8.705, + "step": 4460 + }, + { + "epoch": 262.65, + "grad_norm": 0.34953877329826355, + "learning_rate": 0.0001425176470588235, + "loss": 0.2647, + "step": 4465 + }, + { + "epoch": 262.94, + "grad_norm": 0.4638715982437134, + "learning_rate": 0.00014234117647058824, + "loss": 0.2797, + "step": 4470 + }, + { + "epoch": 263.24, + "grad_norm": 0.3672123849391937, + "learning_rate": 0.00014216470588235294, + "loss": 0.2732, + "step": 4475 + }, + { + "epoch": 263.53, + "grad_norm": 0.3995963931083679, + "learning_rate": 0.00014198823529411764, + "loss": 0.2915, + "step": 4480 + }, + { + "epoch": 263.53, + "eval_loss": 0.2678990066051483, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.656, + "eval_steps_per_second": 8.71, + "step": 4480 + }, + { + "epoch": 263.82, + "grad_norm": 0.3602835237979889, + "learning_rate": 0.00014181176470588234, + "loss": 0.2686, + "step": 4485 + }, + { + "epoch": 264.12, + "grad_norm": 0.29979801177978516, + "learning_rate": 0.00014163529411764705, + "loss": 0.2556, + "step": 4490 + }, + { + "epoch": 264.41, + "grad_norm": 0.3114030659198761, + "learning_rate": 0.00014145882352941175, + "loss": 0.2826, + "step": 4495 + }, + { + "epoch": 264.71, + "grad_norm": 0.39181116223335266, + "learning_rate": 0.00014128235294117645, + "loss": 0.2767, + "step": 4500 + }, + { + "epoch": 264.71, + "eval_loss": 0.26801636815071106, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.655, + "eval_steps_per_second": 8.71, + "step": 4500 + }, + { + "epoch": 265.0, + "grad_norm": 0.7562241554260254, + "learning_rate": 0.00014110588235294116, + "loss": 0.2861, + "step": 4505 + }, + { + "epoch": 265.29, + "grad_norm": 0.31726887822151184, + "learning_rate": 0.00014092941176470586, + "loss": 0.2577, + "step": 4510 + }, + { + "epoch": 265.59, + "grad_norm": 0.41455981135368347, + "learning_rate": 0.0001407529411764706, + "loss": 0.2781, + "step": 4515 + }, + { + "epoch": 265.88, + "grad_norm": 0.3152996897697449, + "learning_rate": 0.00014057647058823527, + "loss": 0.2738, + "step": 4520 + }, + { + "epoch": 265.88, + "eval_loss": 0.2681591212749481, + "eval_runtime": 1.9641, + "eval_samples_per_second": 68.226, + "eval_steps_per_second": 8.656, + "step": 4520 + }, + { + "epoch": 266.18, + "grad_norm": 0.3235112130641937, + "learning_rate": 0.0001404, + "loss": 0.2785, + "step": 4525 + }, + { + "epoch": 266.47, + "grad_norm": 0.3388126790523529, + "learning_rate": 0.0001402235294117647, + "loss": 0.2656, + "step": 4530 + }, + { + "epoch": 266.76, + "grad_norm": 0.3489181399345398, + "learning_rate": 0.0001400470588235294, + "loss": 0.2852, + "step": 4535 + }, + { + "epoch": 267.06, + "grad_norm": 0.3196962773799896, + "learning_rate": 0.0001398705882352941, + "loss": 0.2662, + "step": 4540 + }, + { + "epoch": 267.06, + "eval_loss": 0.2676832973957062, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.695, + "eval_steps_per_second": 8.715, + "step": 4540 + }, + { + "epoch": 267.35, + "grad_norm": 0.30039843916893005, + "learning_rate": 0.0001396941176470588, + "loss": 0.2543, + "step": 4545 + }, + { + "epoch": 267.65, + "grad_norm": 0.3503200113773346, + "learning_rate": 0.0001395176470588235, + "loss": 0.2706, + "step": 4550 + }, + { + "epoch": 267.94, + "grad_norm": 0.39216938614845276, + "learning_rate": 0.00013934117647058822, + "loss": 0.3006, + "step": 4555 + }, + { + "epoch": 268.24, + "grad_norm": 0.40269961953163147, + "learning_rate": 0.00013916470588235292, + "loss": 0.2503, + "step": 4560 + }, + { + "epoch": 268.24, + "eval_loss": 0.2675241529941559, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.654, + "eval_steps_per_second": 8.71, + "step": 4560 + }, + { + "epoch": 268.53, + "grad_norm": 0.36722421646118164, + "learning_rate": 0.00013898823529411762, + "loss": 0.2868, + "step": 4565 + }, + { + "epoch": 268.82, + "grad_norm": 0.32469937205314636, + "learning_rate": 0.00013881176470588235, + "loss": 0.2709, + "step": 4570 + }, + { + "epoch": 269.12, + "grad_norm": 0.2924494445323944, + "learning_rate": 0.00013863529411764706, + "loss": 0.2787, + "step": 4575 + }, + { + "epoch": 269.41, + "grad_norm": 0.37839242815971375, + "learning_rate": 0.00013845882352941173, + "loss": 0.2699, + "step": 4580 + }, + { + "epoch": 269.41, + "eval_loss": 0.2681180536746979, + "eval_runtime": 1.9497, + "eval_samples_per_second": 68.729, + "eval_steps_per_second": 8.719, + "step": 4580 + }, + { + "epoch": 269.71, + "grad_norm": 0.46704792976379395, + "learning_rate": 0.00013828235294117646, + "loss": 0.288, + "step": 4585 + }, + { + "epoch": 270.0, + "grad_norm": 0.3285718560218811, + "learning_rate": 0.00013810588235294117, + "loss": 0.2644, + "step": 4590 + }, + { + "epoch": 270.29, + "grad_norm": 0.5653596520423889, + "learning_rate": 0.00013792941176470587, + "loss": 0.2782, + "step": 4595 + }, + { + "epoch": 270.59, + "grad_norm": 0.4037015736103058, + "learning_rate": 0.00013775294117647057, + "loss": 0.2752, + "step": 4600 + }, + { + "epoch": 270.59, + "eval_loss": 0.26798370480537415, + "eval_runtime": 1.9598, + "eval_samples_per_second": 68.373, + "eval_steps_per_second": 8.674, + "step": 4600 + }, + { + "epoch": 270.88, + "grad_norm": 0.4096309244632721, + "learning_rate": 0.00013757647058823528, + "loss": 0.2745, + "step": 4605 + }, + { + "epoch": 271.18, + "grad_norm": 0.3656843900680542, + "learning_rate": 0.0001374, + "loss": 0.2818, + "step": 4610 + }, + { + "epoch": 271.47, + "grad_norm": 0.4000866413116455, + "learning_rate": 0.00013722352941176468, + "loss": 0.2744, + "step": 4615 + }, + { + "epoch": 271.76, + "grad_norm": 0.36871519684791565, + "learning_rate": 0.00013704705882352939, + "loss": 0.2687, + "step": 4620 + }, + { + "epoch": 271.76, + "eval_loss": 0.2678588330745697, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.672, + "eval_steps_per_second": 8.712, + "step": 4620 + }, + { + "epoch": 272.06, + "grad_norm": 0.3991299271583557, + "learning_rate": 0.00013687058823529412, + "loss": 0.2688, + "step": 4625 + }, + { + "epoch": 272.35, + "grad_norm": 0.29168155789375305, + "learning_rate": 0.00013669411764705882, + "loss": 0.2552, + "step": 4630 + }, + { + "epoch": 272.65, + "grad_norm": 0.3696051239967346, + "learning_rate": 0.00013651764705882352, + "loss": 0.2935, + "step": 4635 + }, + { + "epoch": 272.94, + "grad_norm": 0.34117817878723145, + "learning_rate": 0.00013634117647058822, + "loss": 0.2652, + "step": 4640 + }, + { + "epoch": 272.94, + "eval_loss": 0.26783978939056396, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.66, + "eval_steps_per_second": 8.711, + "step": 4640 + }, + { + "epoch": 273.24, + "grad_norm": 0.317380428314209, + "learning_rate": 0.00013616470588235293, + "loss": 0.2498, + "step": 4645 + }, + { + "epoch": 273.53, + "grad_norm": 0.4965627193450928, + "learning_rate": 0.00013598823529411763, + "loss": 0.2764, + "step": 4650 + }, + { + "epoch": 273.82, + "grad_norm": 0.4523441791534424, + "learning_rate": 0.00013581176470588233, + "loss": 0.2794, + "step": 4655 + }, + { + "epoch": 274.12, + "grad_norm": 0.21841204166412354, + "learning_rate": 0.00013563529411764704, + "loss": 0.2743, + "step": 4660 + }, + { + "epoch": 274.12, + "eval_loss": 0.2674843370914459, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.634, + "eval_steps_per_second": 8.707, + "step": 4660 + }, + { + "epoch": 274.41, + "grad_norm": 0.3210611641407013, + "learning_rate": 0.00013545882352941177, + "loss": 0.2718, + "step": 4665 + }, + { + "epoch": 274.71, + "grad_norm": 0.4059251844882965, + "learning_rate": 0.00013528235294117647, + "loss": 0.275, + "step": 4670 + }, + { + "epoch": 275.0, + "grad_norm": 0.43324097990989685, + "learning_rate": 0.00013510588235294115, + "loss": 0.2968, + "step": 4675 + }, + { + "epoch": 275.29, + "grad_norm": 0.3630353510379791, + "learning_rate": 0.00013492941176470588, + "loss": 0.2816, + "step": 4680 + }, + { + "epoch": 275.29, + "eval_loss": 0.2679999768733978, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.685, + "eval_steps_per_second": 8.714, + "step": 4680 + }, + { + "epoch": 275.59, + "grad_norm": 0.3136293888092041, + "learning_rate": 0.00013475294117647058, + "loss": 0.2566, + "step": 4685 + }, + { + "epoch": 275.88, + "grad_norm": 0.4704384505748749, + "learning_rate": 0.00013457647058823528, + "loss": 0.2894, + "step": 4690 + }, + { + "epoch": 276.18, + "grad_norm": 0.3781212568283081, + "learning_rate": 0.0001344, + "loss": 0.2732, + "step": 4695 + }, + { + "epoch": 276.47, + "grad_norm": 0.302179753780365, + "learning_rate": 0.0001342235294117647, + "loss": 0.2688, + "step": 4700 + }, + { + "epoch": 276.47, + "eval_loss": 0.2683798670768738, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.685, + "eval_steps_per_second": 8.714, + "step": 4700 + }, + { + "epoch": 276.76, + "grad_norm": 0.3125908076763153, + "learning_rate": 0.00013404705882352942, + "loss": 0.2617, + "step": 4705 + }, + { + "epoch": 277.06, + "grad_norm": 0.33059990406036377, + "learning_rate": 0.0001338705882352941, + "loss": 0.2846, + "step": 4710 + }, + { + "epoch": 277.35, + "grad_norm": 0.34243327379226685, + "learning_rate": 0.0001336941176470588, + "loss": 0.2565, + "step": 4715 + }, + { + "epoch": 277.65, + "grad_norm": 0.36869239807128906, + "learning_rate": 0.00013351764705882353, + "loss": 0.2724, + "step": 4720 + }, + { + "epoch": 277.65, + "eval_loss": 0.2678760290145874, + "eval_runtime": 1.9531, + "eval_samples_per_second": 68.61, + "eval_steps_per_second": 8.704, + "step": 4720 + }, + { + "epoch": 277.94, + "grad_norm": 0.40165045857429504, + "learning_rate": 0.00013334117647058823, + "loss": 0.3, + "step": 4725 + }, + { + "epoch": 278.24, + "grad_norm": 0.315395325422287, + "learning_rate": 0.00013316470588235294, + "loss": 0.2562, + "step": 4730 + }, + { + "epoch": 278.53, + "grad_norm": 0.38870301842689514, + "learning_rate": 0.00013298823529411764, + "loss": 0.2744, + "step": 4735 + }, + { + "epoch": 278.82, + "grad_norm": 0.3201696574687958, + "learning_rate": 0.00013281176470588234, + "loss": 0.2695, + "step": 4740 + }, + { + "epoch": 278.82, + "eval_loss": 0.2679029405117035, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.649, + "eval_steps_per_second": 8.709, + "step": 4740 + }, + { + "epoch": 279.12, + "grad_norm": 0.47319474816322327, + "learning_rate": 0.00013263529411764705, + "loss": 0.2999, + "step": 4745 + }, + { + "epoch": 279.41, + "grad_norm": 0.36222875118255615, + "learning_rate": 0.00013245882352941175, + "loss": 0.2672, + "step": 4750 + }, + { + "epoch": 279.71, + "grad_norm": 0.362989604473114, + "learning_rate": 0.00013228235294117645, + "loss": 0.2488, + "step": 4755 + }, + { + "epoch": 280.0, + "grad_norm": 0.4017425775527954, + "learning_rate": 0.00013210588235294118, + "loss": 0.3045, + "step": 4760 + }, + { + "epoch": 280.0, + "eval_loss": 0.26791420578956604, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.647, + "eval_steps_per_second": 8.709, + "step": 4760 + }, + { + "epoch": 280.29, + "grad_norm": 0.3575941324234009, + "learning_rate": 0.00013192941176470586, + "loss": 0.2716, + "step": 4765 + }, + { + "epoch": 280.59, + "grad_norm": 0.3435975909233093, + "learning_rate": 0.00013175294117647056, + "loss": 0.2578, + "step": 4770 + }, + { + "epoch": 280.88, + "grad_norm": 0.32188889384269714, + "learning_rate": 0.0001315764705882353, + "loss": 0.2792, + "step": 4775 + }, + { + "epoch": 281.18, + "grad_norm": 0.37657076120376587, + "learning_rate": 0.0001314, + "loss": 0.2752, + "step": 4780 + }, + { + "epoch": 281.18, + "eval_loss": 0.2675827741622925, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.655, + "eval_steps_per_second": 8.71, + "step": 4780 + }, + { + "epoch": 281.47, + "grad_norm": 0.36095955967903137, + "learning_rate": 0.0001312235294117647, + "loss": 0.2641, + "step": 4785 + }, + { + "epoch": 281.76, + "grad_norm": 0.4052543044090271, + "learning_rate": 0.0001310470588235294, + "loss": 0.3018, + "step": 4790 + }, + { + "epoch": 282.06, + "grad_norm": 0.3855169117450714, + "learning_rate": 0.0001308705882352941, + "loss": 0.2744, + "step": 4795 + }, + { + "epoch": 282.35, + "grad_norm": 0.31001076102256775, + "learning_rate": 0.0001306941176470588, + "loss": 0.2665, + "step": 4800 + }, + { + "epoch": 282.35, + "eval_loss": 0.2676566541194916, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 8.708, + "step": 4800 + }, + { + "epoch": 282.65, + "grad_norm": 0.3395485579967499, + "learning_rate": 0.0001305176470588235, + "loss": 0.2719, + "step": 4805 + }, + { + "epoch": 282.94, + "grad_norm": 0.33626648783683777, + "learning_rate": 0.00013034117647058821, + "loss": 0.2735, + "step": 4810 + }, + { + "epoch": 283.24, + "grad_norm": 0.4000275731086731, + "learning_rate": 0.00013016470588235294, + "loss": 0.2772, + "step": 4815 + }, + { + "epoch": 283.53, + "grad_norm": 0.33824244141578674, + "learning_rate": 0.00012998823529411765, + "loss": 0.2519, + "step": 4820 + }, + { + "epoch": 283.53, + "eval_loss": 0.267904132604599, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.637, + "eval_steps_per_second": 8.708, + "step": 4820 + }, + { + "epoch": 283.82, + "grad_norm": 0.32466191053390503, + "learning_rate": 0.00012981176470588232, + "loss": 0.3011, + "step": 4825 + }, + { + "epoch": 284.12, + "grad_norm": 0.4421817660331726, + "learning_rate": 0.00012963529411764705, + "loss": 0.2875, + "step": 4830 + }, + { + "epoch": 284.41, + "grad_norm": 0.33043330907821655, + "learning_rate": 0.00012945882352941176, + "loss": 0.2585, + "step": 4835 + }, + { + "epoch": 284.71, + "grad_norm": 0.2833538353443146, + "learning_rate": 0.00012928235294117646, + "loss": 0.2633, + "step": 4840 + }, + { + "epoch": 284.71, + "eval_loss": 0.2680717408657074, + "eval_runtime": 1.9551, + "eval_samples_per_second": 68.538, + "eval_steps_per_second": 8.695, + "step": 4840 + }, + { + "epoch": 285.0, + "grad_norm": 0.4212743937969208, + "learning_rate": 0.00012910588235294116, + "loss": 0.2818, + "step": 4845 + }, + { + "epoch": 285.29, + "grad_norm": 0.32360944151878357, + "learning_rate": 0.00012892941176470587, + "loss": 0.2828, + "step": 4850 + }, + { + "epoch": 285.59, + "grad_norm": 0.34716305136680603, + "learning_rate": 0.00012875294117647057, + "loss": 0.2582, + "step": 4855 + }, + { + "epoch": 285.88, + "grad_norm": 0.3057546019554138, + "learning_rate": 0.00012857647058823527, + "loss": 0.2782, + "step": 4860 + }, + { + "epoch": 285.88, + "eval_loss": 0.26780936121940613, + "eval_runtime": 1.9496, + "eval_samples_per_second": 68.733, + "eval_steps_per_second": 8.72, + "step": 4860 + }, + { + "epoch": 286.18, + "grad_norm": 0.27177900075912476, + "learning_rate": 0.00012839999999999998, + "loss": 0.2709, + "step": 4865 + }, + { + "epoch": 286.47, + "grad_norm": 0.38816970586776733, + "learning_rate": 0.0001282235294117647, + "loss": 0.2747, + "step": 4870 + }, + { + "epoch": 286.76, + "grad_norm": 0.45978912711143494, + "learning_rate": 0.0001280470588235294, + "loss": 0.2768, + "step": 4875 + }, + { + "epoch": 287.06, + "grad_norm": 0.40856796503067017, + "learning_rate": 0.0001278705882352941, + "loss": 0.2737, + "step": 4880 + }, + { + "epoch": 287.06, + "eval_loss": 0.26749733090400696, + "eval_runtime": 1.9492, + "eval_samples_per_second": 68.746, + "eval_steps_per_second": 8.722, + "step": 4880 + }, + { + "epoch": 287.35, + "grad_norm": 0.35362082719802856, + "learning_rate": 0.00012769411764705882, + "loss": 0.257, + "step": 4885 + }, + { + "epoch": 287.65, + "grad_norm": 0.5068021416664124, + "learning_rate": 0.00012751764705882352, + "loss": 0.2894, + "step": 4890 + }, + { + "epoch": 287.94, + "grad_norm": 0.299964964389801, + "learning_rate": 0.00012734117647058822, + "loss": 0.2688, + "step": 4895 + }, + { + "epoch": 288.24, + "grad_norm": 0.37156447768211365, + "learning_rate": 0.00012716470588235293, + "loss": 0.2774, + "step": 4900 + }, + { + "epoch": 288.24, + "eval_loss": 0.26747068762779236, + "eval_runtime": 1.9493, + "eval_samples_per_second": 68.743, + "eval_steps_per_second": 8.721, + "step": 4900 + }, + { + "epoch": 288.53, + "grad_norm": 0.29150059819221497, + "learning_rate": 0.00012698823529411763, + "loss": 0.285, + "step": 4905 + }, + { + "epoch": 288.82, + "grad_norm": 0.4122222065925598, + "learning_rate": 0.00012681176470588233, + "loss": 0.2759, + "step": 4910 + }, + { + "epoch": 289.12, + "grad_norm": 0.3781174123287201, + "learning_rate": 0.00012663529411764706, + "loss": 0.2476, + "step": 4915 + }, + { + "epoch": 289.41, + "grad_norm": 0.4797203838825226, + "learning_rate": 0.00012645882352941174, + "loss": 0.2826, + "step": 4920 + }, + { + "epoch": 289.41, + "eval_loss": 0.26735639572143555, + "eval_runtime": 1.9528, + "eval_samples_per_second": 68.618, + "eval_steps_per_second": 8.705, + "step": 4920 + }, + { + "epoch": 289.71, + "grad_norm": 0.34512466192245483, + "learning_rate": 0.00012628235294117647, + "loss": 0.2736, + "step": 4925 + }, + { + "epoch": 290.0, + "grad_norm": 0.45723918080329895, + "learning_rate": 0.00012610588235294117, + "loss": 0.2735, + "step": 4930 + }, + { + "epoch": 290.29, + "grad_norm": 0.350223183631897, + "learning_rate": 0.00012592941176470588, + "loss": 0.2738, + "step": 4935 + }, + { + "epoch": 290.59, + "grad_norm": 0.3203045427799225, + "learning_rate": 0.00012575294117647058, + "loss": 0.2598, + "step": 4940 + }, + { + "epoch": 290.59, + "eval_loss": 0.26764583587646484, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.658, + "eval_steps_per_second": 8.71, + "step": 4940 + }, + { + "epoch": 290.88, + "grad_norm": 0.3497321903705597, + "learning_rate": 0.00012557647058823528, + "loss": 0.278, + "step": 4945 + }, + { + "epoch": 291.18, + "grad_norm": 0.36741694808006287, + "learning_rate": 0.00012539999999999999, + "loss": 0.2949, + "step": 4950 + }, + { + "epoch": 291.47, + "grad_norm": 0.3636045455932617, + "learning_rate": 0.0001252235294117647, + "loss": 0.2723, + "step": 4955 + }, + { + "epoch": 291.76, + "grad_norm": 0.39553460478782654, + "learning_rate": 0.0001250470588235294, + "loss": 0.277, + "step": 4960 + }, + { + "epoch": 291.76, + "eval_loss": 0.26768288016319275, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.647, + "eval_steps_per_second": 8.709, + "step": 4960 + }, + { + "epoch": 292.06, + "grad_norm": 0.37442415952682495, + "learning_rate": 0.0001248705882352941, + "loss": 0.2619, + "step": 4965 + }, + { + "epoch": 292.35, + "grad_norm": 0.31354594230651855, + "learning_rate": 0.00012469411764705883, + "loss": 0.2701, + "step": 4970 + }, + { + "epoch": 292.65, + "grad_norm": 0.4127384126186371, + "learning_rate": 0.00012451764705882353, + "loss": 0.2798, + "step": 4975 + }, + { + "epoch": 292.94, + "grad_norm": 0.4686755836009979, + "learning_rate": 0.0001243411764705882, + "loss": 0.2776, + "step": 4980 + }, + { + "epoch": 292.94, + "eval_loss": 0.2676917612552643, + "eval_runtime": 1.949, + "eval_samples_per_second": 68.751, + "eval_steps_per_second": 8.722, + "step": 4980 + }, + { + "epoch": 293.24, + "grad_norm": 0.33352458477020264, + "learning_rate": 0.00012416470588235293, + "loss": 0.2673, + "step": 4985 + }, + { + "epoch": 293.53, + "grad_norm": 0.29412564635276794, + "learning_rate": 0.00012398823529411764, + "loss": 0.2787, + "step": 4990 + }, + { + "epoch": 293.82, + "grad_norm": 0.44755318760871887, + "learning_rate": 0.00012381176470588234, + "loss": 0.2642, + "step": 4995 + }, + { + "epoch": 294.12, + "grad_norm": 0.3944944441318512, + "learning_rate": 0.00012363529411764704, + "loss": 0.2773, + "step": 5000 + }, + { + "epoch": 294.12, + "eval_loss": 0.2675646245479584, + "eval_runtime": 1.9512, + "eval_samples_per_second": 68.676, + "eval_steps_per_second": 8.713, + "step": 5000 + }, + { + "epoch": 294.41, + "grad_norm": 0.38821926712989807, + "learning_rate": 0.00012345882352941175, + "loss": 0.2745, + "step": 5005 + }, + { + "epoch": 294.71, + "grad_norm": 0.4502621293067932, + "learning_rate": 0.00012328235294117648, + "loss": 0.2702, + "step": 5010 + }, + { + "epoch": 295.0, + "grad_norm": 0.44437360763549805, + "learning_rate": 0.00012310588235294115, + "loss": 0.2781, + "step": 5015 + }, + { + "epoch": 295.29, + "grad_norm": 0.3315514624118805, + "learning_rate": 0.00012292941176470586, + "loss": 0.2766, + "step": 5020 + }, + { + "epoch": 295.29, + "eval_loss": 0.26779085397720337, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.708, + "step": 5020 + }, + { + "epoch": 295.59, + "grad_norm": 0.3349094092845917, + "learning_rate": 0.0001227529411764706, + "loss": 0.278, + "step": 5025 + }, + { + "epoch": 295.88, + "grad_norm": 0.3044911026954651, + "learning_rate": 0.0001225764705882353, + "loss": 0.2635, + "step": 5030 + }, + { + "epoch": 296.18, + "grad_norm": 0.3015868365764618, + "learning_rate": 0.0001224, + "loss": 0.2726, + "step": 5035 + }, + { + "epoch": 296.47, + "grad_norm": 0.3300056755542755, + "learning_rate": 0.0001222235294117647, + "loss": 0.2735, + "step": 5040 + }, + { + "epoch": 296.47, + "eval_loss": 0.26769718527793884, + "eval_runtime": 1.9498, + "eval_samples_per_second": 68.725, + "eval_steps_per_second": 8.719, + "step": 5040 + }, + { + "epoch": 296.76, + "grad_norm": 0.2934749126434326, + "learning_rate": 0.0001220470588235294, + "loss": 0.2844, + "step": 5045 + }, + { + "epoch": 297.06, + "grad_norm": 0.47713515162467957, + "learning_rate": 0.00012187058823529412, + "loss": 0.2831, + "step": 5050 + }, + { + "epoch": 297.35, + "grad_norm": 0.4163975715637207, + "learning_rate": 0.00012169411764705882, + "loss": 0.2763, + "step": 5055 + }, + { + "epoch": 297.65, + "grad_norm": 0.3687848448753357, + "learning_rate": 0.00012151764705882351, + "loss": 0.2657, + "step": 5060 + }, + { + "epoch": 297.65, + "eval_loss": 0.2679336667060852, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.629, + "eval_steps_per_second": 8.707, + "step": 5060 + }, + { + "epoch": 297.94, + "grad_norm": 0.3908083438873291, + "learning_rate": 0.00012134117647058823, + "loss": 0.2778, + "step": 5065 + }, + { + "epoch": 298.24, + "grad_norm": 0.4374167025089264, + "learning_rate": 0.00012116470588235293, + "loss": 0.2794, + "step": 5070 + }, + { + "epoch": 298.53, + "grad_norm": 0.4555642306804657, + "learning_rate": 0.00012098823529411763, + "loss": 0.2762, + "step": 5075 + }, + { + "epoch": 298.82, + "grad_norm": 0.3667364716529846, + "learning_rate": 0.00012081176470588235, + "loss": 0.2703, + "step": 5080 + }, + { + "epoch": 298.82, + "eval_loss": 0.26758870482444763, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.673, + "eval_steps_per_second": 8.712, + "step": 5080 + }, + { + "epoch": 299.12, + "grad_norm": 0.34506818652153015, + "learning_rate": 0.00012063529411764705, + "loss": 0.2725, + "step": 5085 + }, + { + "epoch": 299.41, + "grad_norm": 0.3395228385925293, + "learning_rate": 0.00012045882352941174, + "loss": 0.2653, + "step": 5090 + }, + { + "epoch": 299.71, + "grad_norm": 0.312497615814209, + "learning_rate": 0.00012028235294117646, + "loss": 0.2864, + "step": 5095 + }, + { + "epoch": 300.0, + "grad_norm": 0.3668994903564453, + "learning_rate": 0.00012010588235294116, + "loss": 0.2604, + "step": 5100 + }, + { + "epoch": 300.0, + "eval_loss": 0.2675992548465729, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.694, + "eval_steps_per_second": 8.715, + "step": 5100 + }, + { + "epoch": 300.29, + "grad_norm": 0.2964438498020172, + "learning_rate": 0.00011992941176470588, + "loss": 0.2711, + "step": 5105 + }, + { + "epoch": 300.59, + "grad_norm": 0.2991167902946472, + "learning_rate": 0.00011975294117647058, + "loss": 0.265, + "step": 5110 + }, + { + "epoch": 300.88, + "grad_norm": 0.3649525046348572, + "learning_rate": 0.00011957647058823529, + "loss": 0.2745, + "step": 5115 + }, + { + "epoch": 301.18, + "grad_norm": 0.3166786730289459, + "learning_rate": 0.0001194, + "loss": 0.2924, + "step": 5120 + }, + { + "epoch": 301.18, + "eval_loss": 0.2671992778778076, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.653, + "eval_steps_per_second": 8.71, + "step": 5120 + }, + { + "epoch": 301.47, + "grad_norm": 0.29124000668525696, + "learning_rate": 0.00011922352941176469, + "loss": 0.2735, + "step": 5125 + }, + { + "epoch": 301.76, + "grad_norm": 0.3728093206882477, + "learning_rate": 0.0001190470588235294, + "loss": 0.2576, + "step": 5130 + }, + { + "epoch": 302.06, + "grad_norm": 0.33258339762687683, + "learning_rate": 0.00011887058823529411, + "loss": 0.2793, + "step": 5135 + }, + { + "epoch": 302.35, + "grad_norm": 0.4311297833919525, + "learning_rate": 0.00011869411764705881, + "loss": 0.2714, + "step": 5140 + }, + { + "epoch": 302.35, + "eval_loss": 0.2673178017139435, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.662, + "eval_steps_per_second": 8.711, + "step": 5140 + }, + { + "epoch": 302.65, + "grad_norm": 0.39538463950157166, + "learning_rate": 0.00011851764705882352, + "loss": 0.2841, + "step": 5145 + }, + { + "epoch": 302.94, + "grad_norm": 0.4409559667110443, + "learning_rate": 0.00011834117647058823, + "loss": 0.2738, + "step": 5150 + }, + { + "epoch": 303.24, + "grad_norm": 0.3207736909389496, + "learning_rate": 0.00011816470588235292, + "loss": 0.2695, + "step": 5155 + }, + { + "epoch": 303.53, + "grad_norm": 0.4177800118923187, + "learning_rate": 0.00011798823529411764, + "loss": 0.2807, + "step": 5160 + }, + { + "epoch": 303.53, + "eval_loss": 0.2675679326057434, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.682, + "eval_steps_per_second": 8.713, + "step": 5160 + }, + { + "epoch": 303.82, + "grad_norm": 0.3397594690322876, + "learning_rate": 0.00011781176470588234, + "loss": 0.2606, + "step": 5165 + }, + { + "epoch": 304.12, + "grad_norm": 0.3563714325428009, + "learning_rate": 0.00011763529411764705, + "loss": 0.2942, + "step": 5170 + }, + { + "epoch": 304.41, + "grad_norm": 0.35460689663887024, + "learning_rate": 0.00011745882352941176, + "loss": 0.2633, + "step": 5175 + }, + { + "epoch": 304.71, + "grad_norm": 0.2972242832183838, + "learning_rate": 0.00011728235294117647, + "loss": 0.2574, + "step": 5180 + }, + { + "epoch": 304.71, + "eval_loss": 0.26792898774147034, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.659, + "eval_steps_per_second": 8.71, + "step": 5180 + }, + { + "epoch": 305.0, + "grad_norm": 0.5814924240112305, + "learning_rate": 0.00011710588235294116, + "loss": 0.2932, + "step": 5185 + }, + { + "epoch": 305.29, + "grad_norm": 0.3438197076320648, + "learning_rate": 0.00011692941176470587, + "loss": 0.2485, + "step": 5190 + }, + { + "epoch": 305.59, + "grad_norm": 0.3513415455818176, + "learning_rate": 0.00011675294117647058, + "loss": 0.2901, + "step": 5195 + }, + { + "epoch": 305.88, + "grad_norm": 0.2991219460964203, + "learning_rate": 0.00011657647058823528, + "loss": 0.2828, + "step": 5200 + }, + { + "epoch": 305.88, + "eval_loss": 0.2679195702075958, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.68, + "eval_steps_per_second": 8.713, + "step": 5200 + }, + { + "epoch": 306.18, + "grad_norm": 0.32192981243133545, + "learning_rate": 0.0001164, + "loss": 0.2842, + "step": 5205 + }, + { + "epoch": 306.47, + "grad_norm": 0.352367639541626, + "learning_rate": 0.00011622352941176469, + "loss": 0.2703, + "step": 5210 + }, + { + "epoch": 306.76, + "grad_norm": 0.39249107241630554, + "learning_rate": 0.0001160470588235294, + "loss": 0.2671, + "step": 5215 + }, + { + "epoch": 307.06, + "grad_norm": 0.32310575246810913, + "learning_rate": 0.0001158705882352941, + "loss": 0.2811, + "step": 5220 + }, + { + "epoch": 307.06, + "eval_loss": 0.2676811218261719, + "eval_runtime": 1.9921, + "eval_samples_per_second": 67.265, + "eval_steps_per_second": 8.534, + "step": 5220 + }, + { + "epoch": 307.35, + "grad_norm": 0.39959678053855896, + "learning_rate": 0.00011569411764705881, + "loss": 0.2675, + "step": 5225 + }, + { + "epoch": 307.65, + "grad_norm": 0.4226132929325104, + "learning_rate": 0.00011551764705882353, + "loss": 0.2583, + "step": 5230 + }, + { + "epoch": 307.94, + "grad_norm": 0.4457205832004547, + "learning_rate": 0.00011534117647058823, + "loss": 0.3008, + "step": 5235 + }, + { + "epoch": 308.24, + "grad_norm": 0.33544936776161194, + "learning_rate": 0.00011516470588235292, + "loss": 0.2744, + "step": 5240 + }, + { + "epoch": 308.24, + "eval_loss": 0.2674923241138458, + "eval_runtime": 1.9506, + "eval_samples_per_second": 68.697, + "eval_steps_per_second": 8.715, + "step": 5240 + }, + { + "epoch": 308.53, + "grad_norm": 0.35008352994918823, + "learning_rate": 0.00011498823529411764, + "loss": 0.2758, + "step": 5245 + }, + { + "epoch": 308.82, + "grad_norm": 0.3677573502063751, + "learning_rate": 0.00011481176470588234, + "loss": 0.2669, + "step": 5250 + }, + { + "epoch": 309.12, + "grad_norm": 0.486016184091568, + "learning_rate": 0.00011463529411764704, + "loss": 0.2867, + "step": 5255 + }, + { + "epoch": 309.41, + "grad_norm": 0.4204300343990326, + "learning_rate": 0.00011445882352941176, + "loss": 0.2492, + "step": 5260 + }, + { + "epoch": 309.41, + "eval_loss": 0.2679157555103302, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.663, + "eval_steps_per_second": 8.711, + "step": 5260 + }, + { + "epoch": 309.71, + "grad_norm": 0.300603985786438, + "learning_rate": 0.00011428235294117646, + "loss": 0.2687, + "step": 5265 + }, + { + "epoch": 310.0, + "grad_norm": 0.5466859936714172, + "learning_rate": 0.00011410588235294118, + "loss": 0.3053, + "step": 5270 + }, + { + "epoch": 310.29, + "grad_norm": 0.41164669394493103, + "learning_rate": 0.00011392941176470587, + "loss": 0.2737, + "step": 5275 + }, + { + "epoch": 310.59, + "grad_norm": 0.32829609513282776, + "learning_rate": 0.00011375294117647057, + "loss": 0.2656, + "step": 5280 + }, + { + "epoch": 310.59, + "eval_loss": 0.26765042543411255, + "eval_runtime": 1.9559, + "eval_samples_per_second": 68.51, + "eval_steps_per_second": 8.692, + "step": 5280 + }, + { + "epoch": 310.88, + "grad_norm": 0.348434716463089, + "learning_rate": 0.00011357647058823529, + "loss": 0.2824, + "step": 5285 + }, + { + "epoch": 311.18, + "grad_norm": 0.3653252124786377, + "learning_rate": 0.00011339999999999999, + "loss": 0.2607, + "step": 5290 + }, + { + "epoch": 311.47, + "grad_norm": 0.3474944233894348, + "learning_rate": 0.0001132235294117647, + "loss": 0.2647, + "step": 5295 + }, + { + "epoch": 311.76, + "grad_norm": 0.33017656207084656, + "learning_rate": 0.00011304705882352941, + "loss": 0.2872, + "step": 5300 + }, + { + "epoch": 311.76, + "eval_loss": 0.2681010067462921, + "eval_runtime": 1.9922, + "eval_samples_per_second": 67.264, + "eval_steps_per_second": 8.533, + "step": 5300 + }, + { + "epoch": 312.06, + "grad_norm": 0.3059663474559784, + "learning_rate": 0.0001128705882352941, + "loss": 0.2647, + "step": 5305 + }, + { + "epoch": 312.35, + "grad_norm": 0.4996471107006073, + "learning_rate": 0.0001126941176470588, + "loss": 0.2662, + "step": 5310 + }, + { + "epoch": 312.65, + "grad_norm": 0.4012407660484314, + "learning_rate": 0.00011251764705882352, + "loss": 0.271, + "step": 5315 + }, + { + "epoch": 312.94, + "grad_norm": 0.31381988525390625, + "learning_rate": 0.00011234117647058822, + "loss": 0.2822, + "step": 5320 + }, + { + "epoch": 312.94, + "eval_loss": 0.2678367793560028, + "eval_runtime": 2.1359, + "eval_samples_per_second": 62.737, + "eval_steps_per_second": 7.959, + "step": 5320 + }, + { + "epoch": 313.24, + "grad_norm": 0.3857933580875397, + "learning_rate": 0.00011216470588235293, + "loss": 0.2808, + "step": 5325 + }, + { + "epoch": 313.53, + "grad_norm": 0.3678051829338074, + "learning_rate": 0.00011198823529411764, + "loss": 0.2622, + "step": 5330 + }, + { + "epoch": 313.82, + "grad_norm": 0.37850359082221985, + "learning_rate": 0.00011181176470588233, + "loss": 0.2849, + "step": 5335 + }, + { + "epoch": 314.12, + "grad_norm": 0.33382633328437805, + "learning_rate": 0.00011163529411764705, + "loss": 0.2798, + "step": 5340 + }, + { + "epoch": 314.12, + "eval_loss": 0.2676757276058197, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.679, + "eval_steps_per_second": 8.713, + "step": 5340 + }, + { + "epoch": 314.41, + "grad_norm": 0.35435083508491516, + "learning_rate": 0.00011145882352941175, + "loss": 0.2626, + "step": 5345 + }, + { + "epoch": 314.71, + "grad_norm": 0.3210301399230957, + "learning_rate": 0.00011128235294117646, + "loss": 0.2638, + "step": 5350 + }, + { + "epoch": 315.0, + "grad_norm": 0.631574273109436, + "learning_rate": 0.00011110588235294117, + "loss": 0.2966, + "step": 5355 + }, + { + "epoch": 315.29, + "grad_norm": 0.3390182852745056, + "learning_rate": 0.00011092941176470588, + "loss": 0.2705, + "step": 5360 + }, + { + "epoch": 315.29, + "eval_loss": 0.26768192648887634, + "eval_runtime": 1.9496, + "eval_samples_per_second": 68.732, + "eval_steps_per_second": 8.72, + "step": 5360 + }, + { + "epoch": 315.59, + "grad_norm": 0.34884804487228394, + "learning_rate": 0.00011075294117647057, + "loss": 0.2647, + "step": 5365 + }, + { + "epoch": 315.88, + "grad_norm": 0.4360363483428955, + "learning_rate": 0.00011057647058823528, + "loss": 0.2818, + "step": 5370 + }, + { + "epoch": 316.18, + "grad_norm": 0.3483468294143677, + "learning_rate": 0.00011039999999999999, + "loss": 0.2662, + "step": 5375 + }, + { + "epoch": 316.47, + "grad_norm": 0.33021044731140137, + "learning_rate": 0.00011022352941176469, + "loss": 0.2665, + "step": 5380 + }, + { + "epoch": 316.47, + "eval_loss": 0.2681596875190735, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.68, + "eval_steps_per_second": 8.713, + "step": 5380 + }, + { + "epoch": 316.76, + "grad_norm": 0.284152090549469, + "learning_rate": 0.0001100470588235294, + "loss": 0.2747, + "step": 5385 + }, + { + "epoch": 317.06, + "grad_norm": 0.31344881653785706, + "learning_rate": 0.00010987058823529411, + "loss": 0.2891, + "step": 5390 + }, + { + "epoch": 317.35, + "grad_norm": 0.379783570766449, + "learning_rate": 0.00010969411764705883, + "loss": 0.2541, + "step": 5395 + }, + { + "epoch": 317.65, + "grad_norm": 0.3765096068382263, + "learning_rate": 0.00010951764705882352, + "loss": 0.2881, + "step": 5400 + }, + { + "epoch": 317.65, + "eval_loss": 0.26803770661354065, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.624, + "eval_steps_per_second": 8.706, + "step": 5400 + }, + { + "epoch": 317.94, + "grad_norm": 0.33734330534935, + "learning_rate": 0.00010934117647058822, + "loss": 0.2789, + "step": 5405 + }, + { + "epoch": 318.24, + "grad_norm": 0.3086510896682739, + "learning_rate": 0.00010916470588235294, + "loss": 0.2678, + "step": 5410 + }, + { + "epoch": 318.53, + "grad_norm": 0.3951496481895447, + "learning_rate": 0.00010898823529411764, + "loss": 0.2656, + "step": 5415 + }, + { + "epoch": 318.82, + "grad_norm": 0.5031821727752686, + "learning_rate": 0.00010881176470588234, + "loss": 0.2911, + "step": 5420 + }, + { + "epoch": 318.82, + "eval_loss": 0.26788726449012756, + "eval_runtime": 1.9524, + "eval_samples_per_second": 68.632, + "eval_steps_per_second": 8.707, + "step": 5420 + }, + { + "epoch": 319.12, + "grad_norm": 0.2810389995574951, + "learning_rate": 0.00010863529411764706, + "loss": 0.2745, + "step": 5425 + }, + { + "epoch": 319.41, + "grad_norm": 0.35122090578079224, + "learning_rate": 0.00010845882352941175, + "loss": 0.2776, + "step": 5430 + }, + { + "epoch": 319.71, + "grad_norm": 0.3709968030452728, + "learning_rate": 0.00010828235294117645, + "loss": 0.2758, + "step": 5435 + }, + { + "epoch": 320.0, + "grad_norm": 0.33672964572906494, + "learning_rate": 0.00010810588235294117, + "loss": 0.2737, + "step": 5440 + }, + { + "epoch": 320.0, + "eval_loss": 0.2675476372241974, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.692, + "eval_steps_per_second": 8.715, + "step": 5440 + }, + { + "epoch": 320.29, + "grad_norm": 0.42993614077568054, + "learning_rate": 0.00010792941176470587, + "loss": 0.2617, + "step": 5445 + }, + { + "epoch": 320.59, + "grad_norm": 0.37771227955818176, + "learning_rate": 0.00010775294117647059, + "loss": 0.2837, + "step": 5450 + }, + { + "epoch": 320.88, + "grad_norm": 0.34151655435562134, + "learning_rate": 0.00010757647058823529, + "loss": 0.2628, + "step": 5455 + }, + { + "epoch": 321.18, + "grad_norm": 0.37274566292762756, + "learning_rate": 0.00010739999999999998, + "loss": 0.2998, + "step": 5460 + }, + { + "epoch": 321.18, + "eval_loss": 0.26716136932373047, + "eval_runtime": 1.9506, + "eval_samples_per_second": 68.696, + "eval_steps_per_second": 8.715, + "step": 5460 + }, + { + "epoch": 321.47, + "grad_norm": 0.46079370379447937, + "learning_rate": 0.0001072235294117647, + "loss": 0.2768, + "step": 5465 + }, + { + "epoch": 321.76, + "grad_norm": 0.40266698598861694, + "learning_rate": 0.0001070470588235294, + "loss": 0.2713, + "step": 5470 + }, + { + "epoch": 322.06, + "grad_norm": 0.3301185369491577, + "learning_rate": 0.0001068705882352941, + "loss": 0.2496, + "step": 5475 + }, + { + "epoch": 322.35, + "grad_norm": 0.30476114153862, + "learning_rate": 0.00010669411764705882, + "loss": 0.2706, + "step": 5480 + }, + { + "epoch": 322.35, + "eval_loss": 0.26763421297073364, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.631, + "eval_steps_per_second": 8.707, + "step": 5480 + }, + { + "epoch": 322.65, + "grad_norm": 0.4151112139225006, + "learning_rate": 0.00010651764705882352, + "loss": 0.2727, + "step": 5485 + }, + { + "epoch": 322.94, + "grad_norm": 0.5137280821800232, + "learning_rate": 0.00010634117647058821, + "loss": 0.2839, + "step": 5490 + }, + { + "epoch": 323.24, + "grad_norm": 0.3088178038597107, + "learning_rate": 0.00010616470588235293, + "loss": 0.2741, + "step": 5495 + }, + { + "epoch": 323.53, + "grad_norm": 0.37711068987846375, + "learning_rate": 0.00010598823529411763, + "loss": 0.2874, + "step": 5500 + }, + { + "epoch": 323.53, + "eval_loss": 0.2674301862716675, + "eval_runtime": 1.95, + "eval_samples_per_second": 68.719, + "eval_steps_per_second": 8.718, + "step": 5500 + }, + { + "epoch": 323.82, + "grad_norm": 0.36296695470809937, + "learning_rate": 0.00010581176470588235, + "loss": 0.2673, + "step": 5505 + }, + { + "epoch": 324.12, + "grad_norm": 0.3567427694797516, + "learning_rate": 0.00010563529411764705, + "loss": 0.2586, + "step": 5510 + }, + { + "epoch": 324.41, + "grad_norm": 0.31744328141212463, + "learning_rate": 0.00010545882352941176, + "loss": 0.2572, + "step": 5515 + }, + { + "epoch": 324.71, + "grad_norm": 0.44774046540260315, + "learning_rate": 0.00010528235294117647, + "loss": 0.2818, + "step": 5520 + }, + { + "epoch": 324.71, + "eval_loss": 0.26762863993644714, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.682, + "eval_steps_per_second": 8.713, + "step": 5520 + }, + { + "epoch": 325.0, + "grad_norm": 0.35441726446151733, + "learning_rate": 0.00010510588235294116, + "loss": 0.2788, + "step": 5525 + }, + { + "epoch": 325.29, + "grad_norm": 0.35020384192466736, + "learning_rate": 0.00010492941176470587, + "loss": 0.2711, + "step": 5530 + }, + { + "epoch": 325.59, + "grad_norm": 0.32111018896102905, + "learning_rate": 0.00010475294117647058, + "loss": 0.2871, + "step": 5535 + }, + { + "epoch": 325.88, + "grad_norm": 0.29281875491142273, + "learning_rate": 0.00010457647058823529, + "loss": 0.2582, + "step": 5540 + }, + { + "epoch": 325.88, + "eval_loss": 0.2675672471523285, + "eval_runtime": 1.9504, + "eval_samples_per_second": 68.702, + "eval_steps_per_second": 8.716, + "step": 5540 + }, + { + "epoch": 326.18, + "grad_norm": 0.3279877305030823, + "learning_rate": 0.00010439999999999999, + "loss": 0.2859, + "step": 5545 + }, + { + "epoch": 326.47, + "grad_norm": 0.31693965196609497, + "learning_rate": 0.0001042235294117647, + "loss": 0.2589, + "step": 5550 + }, + { + "epoch": 326.76, + "grad_norm": 0.4101545810699463, + "learning_rate": 0.0001040470588235294, + "loss": 0.2779, + "step": 5555 + }, + { + "epoch": 327.06, + "grad_norm": 0.3412899374961853, + "learning_rate": 0.00010387058823529411, + "loss": 0.2743, + "step": 5560 + }, + { + "epoch": 327.06, + "eval_loss": 0.26723936200141907, + "eval_runtime": 1.9503, + "eval_samples_per_second": 68.706, + "eval_steps_per_second": 8.716, + "step": 5560 + }, + { + "epoch": 327.35, + "grad_norm": 0.30297550559043884, + "learning_rate": 0.00010369411764705882, + "loss": 0.279, + "step": 5565 + }, + { + "epoch": 327.65, + "grad_norm": 0.3419901132583618, + "learning_rate": 0.00010351764705882352, + "loss": 0.2866, + "step": 5570 + }, + { + "epoch": 327.94, + "grad_norm": 0.3979418873786926, + "learning_rate": 0.00010334117647058824, + "loss": 0.2574, + "step": 5575 + }, + { + "epoch": 328.24, + "grad_norm": 0.4262833595275879, + "learning_rate": 0.00010316470588235294, + "loss": 0.2814, + "step": 5580 + }, + { + "epoch": 328.24, + "eval_loss": 0.2673919200897217, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.678, + "eval_steps_per_second": 8.713, + "step": 5580 + }, + { + "epoch": 328.53, + "grad_norm": 0.33117881417274475, + "learning_rate": 0.00010298823529411763, + "loss": 0.2928, + "step": 5585 + }, + { + "epoch": 328.82, + "grad_norm": 0.3354574143886566, + "learning_rate": 0.00010281176470588235, + "loss": 0.2474, + "step": 5590 + }, + { + "epoch": 329.12, + "grad_norm": 0.378356009721756, + "learning_rate": 0.00010263529411764705, + "loss": 0.2774, + "step": 5595 + }, + { + "epoch": 329.41, + "grad_norm": 0.39948397874832153, + "learning_rate": 0.00010245882352941175, + "loss": 0.2742, + "step": 5600 + }, + { + "epoch": 329.41, + "eval_loss": 0.2673865854740143, + "eval_runtime": 1.9504, + "eval_samples_per_second": 68.705, + "eval_steps_per_second": 8.716, + "step": 5600 + }, + { + "epoch": 329.71, + "grad_norm": 0.34866583347320557, + "learning_rate": 0.00010228235294117647, + "loss": 0.2676, + "step": 5605 + }, + { + "epoch": 330.0, + "grad_norm": 0.4224211275577545, + "learning_rate": 0.00010210588235294117, + "loss": 0.2817, + "step": 5610 + }, + { + "epoch": 330.29, + "grad_norm": 0.3401482403278351, + "learning_rate": 0.00010192941176470588, + "loss": 0.2657, + "step": 5615 + }, + { + "epoch": 330.59, + "grad_norm": 0.34803012013435364, + "learning_rate": 0.00010175294117647058, + "loss": 0.2703, + "step": 5620 + }, + { + "epoch": 330.59, + "eval_loss": 0.26749762892723083, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.636, + "eval_steps_per_second": 8.708, + "step": 5620 + }, + { + "epoch": 330.88, + "grad_norm": 0.4031808078289032, + "learning_rate": 0.00010157647058823528, + "loss": 0.2653, + "step": 5625 + }, + { + "epoch": 331.18, + "grad_norm": 0.3816465139389038, + "learning_rate": 0.0001014, + "loss": 0.286, + "step": 5630 + }, + { + "epoch": 331.47, + "grad_norm": 0.3817666172981262, + "learning_rate": 0.0001012235294117647, + "loss": 0.2706, + "step": 5635 + }, + { + "epoch": 331.76, + "grad_norm": 0.35487768054008484, + "learning_rate": 0.00010104705882352939, + "loss": 0.2801, + "step": 5640 + }, + { + "epoch": 331.76, + "eval_loss": 0.26746058464050293, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.622, + "eval_steps_per_second": 8.706, + "step": 5640 + }, + { + "epoch": 332.06, + "grad_norm": 0.3794578015804291, + "learning_rate": 0.00010087058823529411, + "loss": 0.2616, + "step": 5645 + }, + { + "epoch": 332.35, + "grad_norm": 0.43280404806137085, + "learning_rate": 0.00010069411764705881, + "loss": 0.2846, + "step": 5650 + }, + { + "epoch": 332.65, + "grad_norm": 0.38624128699302673, + "learning_rate": 0.00010051764705882351, + "loss": 0.2735, + "step": 5655 + }, + { + "epoch": 332.94, + "grad_norm": 0.3096133768558502, + "learning_rate": 0.00010034117647058823, + "loss": 0.257, + "step": 5660 + }, + { + "epoch": 332.94, + "eval_loss": 0.26737287640571594, + "eval_runtime": 1.9537, + "eval_samples_per_second": 68.588, + "eval_steps_per_second": 8.702, + "step": 5660 + }, + { + "epoch": 333.24, + "grad_norm": 0.2694839537143707, + "learning_rate": 0.00010016470588235293, + "loss": 0.2485, + "step": 5665 + }, + { + "epoch": 333.53, + "grad_norm": 0.387744277715683, + "learning_rate": 9.998823529411762e-05, + "loss": 0.2843, + "step": 5670 + }, + { + "epoch": 333.82, + "grad_norm": 0.2859618067741394, + "learning_rate": 9.981176470588234e-05, + "loss": 0.2715, + "step": 5675 + }, + { + "epoch": 334.12, + "grad_norm": 0.27788206934928894, + "learning_rate": 9.963529411764704e-05, + "loss": 0.2734, + "step": 5680 + }, + { + "epoch": 334.12, + "eval_loss": 0.26720064878463745, + "eval_runtime": 1.9545, + "eval_samples_per_second": 68.56, + "eval_steps_per_second": 8.698, + "step": 5680 + }, + { + "epoch": 334.41, + "grad_norm": 0.4651300311088562, + "learning_rate": 9.945882352941176e-05, + "loss": 0.2932, + "step": 5685 + }, + { + "epoch": 334.71, + "grad_norm": 0.3416568338871002, + "learning_rate": 9.928235294117646e-05, + "loss": 0.2571, + "step": 5690 + }, + { + "epoch": 335.0, + "grad_norm": 0.4788595139980316, + "learning_rate": 9.910588235294117e-05, + "loss": 0.2744, + "step": 5695 + }, + { + "epoch": 335.29, + "grad_norm": 0.40339604020118713, + "learning_rate": 9.892941176470588e-05, + "loss": 0.2768, + "step": 5700 + }, + { + "epoch": 335.29, + "eval_loss": 0.26745712757110596, + "eval_runtime": 1.9526, + "eval_samples_per_second": 68.627, + "eval_steps_per_second": 8.706, + "step": 5700 + }, + { + "epoch": 335.59, + "grad_norm": 0.3372475206851959, + "learning_rate": 9.875294117647057e-05, + "loss": 0.2459, + "step": 5705 + }, + { + "epoch": 335.88, + "grad_norm": 0.37835636734962463, + "learning_rate": 9.857647058823528e-05, + "loss": 0.2949, + "step": 5710 + }, + { + "epoch": 336.18, + "grad_norm": 0.3410570025444031, + "learning_rate": 9.839999999999999e-05, + "loss": 0.2843, + "step": 5715 + }, + { + "epoch": 336.47, + "grad_norm": 0.2878612279891968, + "learning_rate": 9.82235294117647e-05, + "loss": 0.2535, + "step": 5720 + }, + { + "epoch": 336.47, + "eval_loss": 0.26730409264564514, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.689, + "eval_steps_per_second": 8.714, + "step": 5720 + }, + { + "epoch": 336.76, + "grad_norm": 0.49723029136657715, + "learning_rate": 9.80470588235294e-05, + "loss": 0.2927, + "step": 5725 + }, + { + "epoch": 337.06, + "grad_norm": 0.32686054706573486, + "learning_rate": 9.787058823529412e-05, + "loss": 0.267, + "step": 5730 + }, + { + "epoch": 337.35, + "grad_norm": 0.30484649538993835, + "learning_rate": 9.76941176470588e-05, + "loss": 0.2794, + "step": 5735 + }, + { + "epoch": 337.65, + "grad_norm": 0.31358230113983154, + "learning_rate": 9.751764705882352e-05, + "loss": 0.2733, + "step": 5740 + }, + { + "epoch": 337.65, + "eval_loss": 0.267838716506958, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.665, + "eval_steps_per_second": 8.711, + "step": 5740 + }, + { + "epoch": 337.94, + "grad_norm": 0.44924864172935486, + "learning_rate": 9.734117647058823e-05, + "loss": 0.2712, + "step": 5745 + }, + { + "epoch": 338.24, + "grad_norm": 0.3639657199382782, + "learning_rate": 9.716470588235293e-05, + "loss": 0.2793, + "step": 5750 + }, + { + "epoch": 338.53, + "grad_norm": 0.36268535256385803, + "learning_rate": 9.698823529411765e-05, + "loss": 0.2817, + "step": 5755 + }, + { + "epoch": 338.82, + "grad_norm": 0.35680049657821655, + "learning_rate": 9.681176470588235e-05, + "loss": 0.2746, + "step": 5760 + }, + { + "epoch": 338.82, + "eval_loss": 0.2678186297416687, + "eval_runtime": 2.0194, + "eval_samples_per_second": 66.356, + "eval_steps_per_second": 8.418, + "step": 5760 + }, + { + "epoch": 339.12, + "grad_norm": 0.43545612692832947, + "learning_rate": 9.663529411764704e-05, + "loss": 0.2559, + "step": 5765 + }, + { + "epoch": 339.41, + "grad_norm": 0.46106550097465515, + "learning_rate": 9.645882352941176e-05, + "loss": 0.2994, + "step": 5770 + }, + { + "epoch": 339.71, + "grad_norm": 0.3287884294986725, + "learning_rate": 9.628235294117646e-05, + "loss": 0.2542, + "step": 5775 + }, + { + "epoch": 340.0, + "grad_norm": 0.4440259337425232, + "learning_rate": 9.610588235294116e-05, + "loss": 0.2743, + "step": 5780 + }, + { + "epoch": 340.0, + "eval_loss": 0.2675306499004364, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.643, + "eval_steps_per_second": 8.708, + "step": 5780 + }, + { + "epoch": 340.29, + "grad_norm": 0.370511919260025, + "learning_rate": 9.592941176470588e-05, + "loss": 0.2782, + "step": 5785 + }, + { + "epoch": 340.59, + "grad_norm": 0.376701295375824, + "learning_rate": 9.575294117647058e-05, + "loss": 0.2611, + "step": 5790 + }, + { + "epoch": 340.88, + "grad_norm": 0.4207409918308258, + "learning_rate": 9.55764705882353e-05, + "loss": 0.2732, + "step": 5795 + }, + { + "epoch": 341.18, + "grad_norm": 0.26235973834991455, + "learning_rate": 9.539999999999999e-05, + "loss": 0.2712, + "step": 5800 + }, + { + "epoch": 341.18, + "eval_loss": 0.2674858868122101, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.665, + "eval_steps_per_second": 8.711, + "step": 5800 + }, + { + "epoch": 341.47, + "grad_norm": 0.34371572732925415, + "learning_rate": 9.522352941176469e-05, + "loss": 0.2673, + "step": 5805 + }, + { + "epoch": 341.76, + "grad_norm": 0.4027385711669922, + "learning_rate": 9.504705882352941e-05, + "loss": 0.285, + "step": 5810 + }, + { + "epoch": 342.06, + "grad_norm": 0.3307850956916809, + "learning_rate": 9.487058823529411e-05, + "loss": 0.2665, + "step": 5815 + }, + { + "epoch": 342.35, + "grad_norm": 0.4924294352531433, + "learning_rate": 9.469411764705881e-05, + "loss": 0.2806, + "step": 5820 + }, + { + "epoch": 342.35, + "eval_loss": 0.2671770751476288, + "eval_runtime": 2.0081, + "eval_samples_per_second": 66.73, + "eval_steps_per_second": 8.466, + "step": 5820 + }, + { + "epoch": 342.65, + "grad_norm": 0.3287699222564697, + "learning_rate": 9.451764705882353e-05, + "loss": 0.2609, + "step": 5825 + }, + { + "epoch": 342.94, + "grad_norm": 0.36351487040519714, + "learning_rate": 9.434117647058822e-05, + "loss": 0.2832, + "step": 5830 + }, + { + "epoch": 343.24, + "grad_norm": 0.31963416934013367, + "learning_rate": 9.416470588235292e-05, + "loss": 0.2915, + "step": 5835 + }, + { + "epoch": 343.53, + "grad_norm": 0.30901607871055603, + "learning_rate": 9.398823529411764e-05, + "loss": 0.2502, + "step": 5840 + }, + { + "epoch": 343.53, + "eval_loss": 0.26759618520736694, + "eval_runtime": 2.0131, + "eval_samples_per_second": 66.564, + "eval_steps_per_second": 8.445, + "step": 5840 + }, + { + "epoch": 343.82, + "grad_norm": 0.4501369595527649, + "learning_rate": 9.381176470588234e-05, + "loss": 0.2836, + "step": 5845 + }, + { + "epoch": 344.12, + "grad_norm": 0.29734399914741516, + "learning_rate": 9.363529411764706e-05, + "loss": 0.2581, + "step": 5850 + }, + { + "epoch": 344.41, + "grad_norm": 0.32178354263305664, + "learning_rate": 9.345882352941176e-05, + "loss": 0.2715, + "step": 5855 + }, + { + "epoch": 344.71, + "grad_norm": 0.4321109354496002, + "learning_rate": 9.328235294117645e-05, + "loss": 0.2559, + "step": 5860 + }, + { + "epoch": 344.71, + "eval_loss": 0.267403781414032, + "eval_runtime": 1.9521, + "eval_samples_per_second": 68.645, + "eval_steps_per_second": 8.709, + "step": 5860 + }, + { + "epoch": 345.0, + "grad_norm": 0.3940346837043762, + "learning_rate": 9.310588235294117e-05, + "loss": 0.2851, + "step": 5865 + }, + { + "epoch": 345.29, + "grad_norm": 0.37561747431755066, + "learning_rate": 9.292941176470587e-05, + "loss": 0.2722, + "step": 5870 + }, + { + "epoch": 345.59, + "grad_norm": 0.3261379897594452, + "learning_rate": 9.275294117647058e-05, + "loss": 0.2622, + "step": 5875 + }, + { + "epoch": 345.88, + "grad_norm": 0.2994685173034668, + "learning_rate": 9.257647058823529e-05, + "loss": 0.2718, + "step": 5880 + }, + { + "epoch": 345.88, + "eval_loss": 0.267347514629364, + "eval_runtime": 1.9501, + "eval_samples_per_second": 68.713, + "eval_steps_per_second": 8.717, + "step": 5880 + }, + { + "epoch": 346.18, + "grad_norm": 0.3946426808834076, + "learning_rate": 9.24e-05, + "loss": 0.2661, + "step": 5885 + }, + { + "epoch": 346.47, + "grad_norm": 0.2947648763656616, + "learning_rate": 9.222352941176469e-05, + "loss": 0.2912, + "step": 5890 + }, + { + "epoch": 346.76, + "grad_norm": 0.37579163908958435, + "learning_rate": 9.20470588235294e-05, + "loss": 0.2806, + "step": 5895 + }, + { + "epoch": 347.06, + "grad_norm": 0.3368131220340729, + "learning_rate": 9.18705882352941e-05, + "loss": 0.2726, + "step": 5900 + }, + { + "epoch": 347.06, + "eval_loss": 0.26721295714378357, + "eval_runtime": 1.9512, + "eval_samples_per_second": 68.676, + "eval_steps_per_second": 8.713, + "step": 5900 + }, + { + "epoch": 347.35, + "grad_norm": 0.4486690163612366, + "learning_rate": 9.169411764705882e-05, + "loss": 0.2832, + "step": 5905 + }, + { + "epoch": 347.65, + "grad_norm": 0.3776603043079376, + "learning_rate": 9.151764705882353e-05, + "loss": 0.2529, + "step": 5910 + }, + { + "epoch": 347.94, + "grad_norm": 0.328617125749588, + "learning_rate": 9.134117647058823e-05, + "loss": 0.2777, + "step": 5915 + }, + { + "epoch": 348.24, + "grad_norm": 0.3706214427947998, + "learning_rate": 9.116470588235295e-05, + "loss": 0.2814, + "step": 5920 + }, + { + "epoch": 348.24, + "eval_loss": 0.26748955249786377, + "eval_runtime": 1.9645, + "eval_samples_per_second": 68.211, + "eval_steps_per_second": 8.654, + "step": 5920 + }, + { + "epoch": 348.53, + "grad_norm": 0.37739190459251404, + "learning_rate": 9.098823529411764e-05, + "loss": 0.2582, + "step": 5925 + }, + { + "epoch": 348.82, + "grad_norm": 0.3229033350944519, + "learning_rate": 9.081176470588234e-05, + "loss": 0.2837, + "step": 5930 + }, + { + "epoch": 349.12, + "grad_norm": 0.34771692752838135, + "learning_rate": 9.063529411764706e-05, + "loss": 0.2806, + "step": 5935 + }, + { + "epoch": 349.41, + "grad_norm": 0.2674664855003357, + "learning_rate": 9.045882352941176e-05, + "loss": 0.263, + "step": 5940 + }, + { + "epoch": 349.41, + "eval_loss": 0.2675427198410034, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.68, + "eval_steps_per_second": 8.713, + "step": 5940 + }, + { + "epoch": 349.71, + "grad_norm": 0.3410346210002899, + "learning_rate": 9.028235294117646e-05, + "loss": 0.2679, + "step": 5945 + }, + { + "epoch": 350.0, + "grad_norm": 0.5055866837501526, + "learning_rate": 9.010588235294118e-05, + "loss": 0.2798, + "step": 5950 + }, + { + "epoch": 350.29, + "grad_norm": 0.2741018831729889, + "learning_rate": 8.992941176470587e-05, + "loss": 0.2612, + "step": 5955 + }, + { + "epoch": 350.59, + "grad_norm": 0.28262433409690857, + "learning_rate": 8.975294117647058e-05, + "loss": 0.2666, + "step": 5960 + }, + { + "epoch": 350.59, + "eval_loss": 0.2675207257270813, + "eval_runtime": 1.9526, + "eval_samples_per_second": 68.628, + "eval_steps_per_second": 8.707, + "step": 5960 + }, + { + "epoch": 350.88, + "grad_norm": 0.3404819667339325, + "learning_rate": 8.957647058823529e-05, + "loss": 0.2698, + "step": 5965 + }, + { + "epoch": 351.18, + "grad_norm": 0.31054213643074036, + "learning_rate": 8.939999999999999e-05, + "loss": 0.2854, + "step": 5970 + }, + { + "epoch": 351.47, + "grad_norm": 0.3628139793872833, + "learning_rate": 8.922352941176471e-05, + "loss": 0.2645, + "step": 5975 + }, + { + "epoch": 351.76, + "grad_norm": 0.2938252091407776, + "learning_rate": 8.904705882352941e-05, + "loss": 0.2608, + "step": 5980 + }, + { + "epoch": 351.76, + "eval_loss": 0.267805814743042, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.665, + "eval_steps_per_second": 8.711, + "step": 5980 + }, + { + "epoch": 352.06, + "grad_norm": 0.4092661440372467, + "learning_rate": 8.88705882352941e-05, + "loss": 0.3072, + "step": 5985 + }, + { + "epoch": 352.35, + "grad_norm": 0.3128484785556793, + "learning_rate": 8.869411764705882e-05, + "loss": 0.2508, + "step": 5990 + }, + { + "epoch": 352.65, + "grad_norm": 0.3315788805484772, + "learning_rate": 8.851764705882352e-05, + "loss": 0.2713, + "step": 5995 + }, + { + "epoch": 352.94, + "grad_norm": 0.4288850426673889, + "learning_rate": 8.834117647058822e-05, + "loss": 0.287, + "step": 6000 + }, + { + "epoch": 352.94, + "eval_loss": 0.2674010694026947, + "eval_runtime": 1.95, + "eval_samples_per_second": 68.717, + "eval_steps_per_second": 8.718, + "step": 6000 + }, + { + "epoch": 353.24, + "grad_norm": 0.36417680978775024, + "learning_rate": 8.816470588235294e-05, + "loss": 0.2624, + "step": 6005 + }, + { + "epoch": 353.53, + "grad_norm": 0.30314961075782776, + "learning_rate": 8.798823529411764e-05, + "loss": 0.2725, + "step": 6010 + }, + { + "epoch": 353.82, + "grad_norm": 0.3506593406200409, + "learning_rate": 8.781176470588233e-05, + "loss": 0.2786, + "step": 6015 + }, + { + "epoch": 354.12, + "grad_norm": 0.377427339553833, + "learning_rate": 8.763529411764705e-05, + "loss": 0.2789, + "step": 6020 + }, + { + "epoch": 354.12, + "eval_loss": 0.2672005295753479, + "eval_runtime": 1.9532, + "eval_samples_per_second": 68.607, + "eval_steps_per_second": 8.704, + "step": 6020 + }, + { + "epoch": 354.41, + "grad_norm": 0.40812671184539795, + "learning_rate": 8.745882352941175e-05, + "loss": 0.2758, + "step": 6025 + }, + { + "epoch": 354.71, + "grad_norm": 0.3672526180744171, + "learning_rate": 8.728235294117647e-05, + "loss": 0.2556, + "step": 6030 + }, + { + "epoch": 355.0, + "grad_norm": 0.4231290817260742, + "learning_rate": 8.710588235294117e-05, + "loss": 0.2861, + "step": 6035 + }, + { + "epoch": 355.29, + "grad_norm": 0.3113958537578583, + "learning_rate": 8.692941176470588e-05, + "loss": 0.248, + "step": 6040 + }, + { + "epoch": 355.29, + "eval_loss": 0.26705193519592285, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.688, + "eval_steps_per_second": 8.714, + "step": 6040 + }, + { + "epoch": 355.59, + "grad_norm": 0.3717113733291626, + "learning_rate": 8.675294117647058e-05, + "loss": 0.2806, + "step": 6045 + }, + { + "epoch": 355.88, + "grad_norm": 0.3994509279727936, + "learning_rate": 8.657647058823528e-05, + "loss": 0.2854, + "step": 6050 + }, + { + "epoch": 356.18, + "grad_norm": 0.35055795311927795, + "learning_rate": 8.639999999999999e-05, + "loss": 0.269, + "step": 6055 + }, + { + "epoch": 356.47, + "grad_norm": 0.388405442237854, + "learning_rate": 8.62235294117647e-05, + "loss": 0.2741, + "step": 6060 + }, + { + "epoch": 356.47, + "eval_loss": 0.2674548327922821, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.687, + "eval_steps_per_second": 8.714, + "step": 6060 + }, + { + "epoch": 356.76, + "grad_norm": 0.40469926595687866, + "learning_rate": 8.60470588235294e-05, + "loss": 0.2749, + "step": 6065 + }, + { + "epoch": 357.06, + "grad_norm": 0.35421106219291687, + "learning_rate": 8.58705882352941e-05, + "loss": 0.2779, + "step": 6070 + }, + { + "epoch": 357.35, + "grad_norm": 0.42168813943862915, + "learning_rate": 8.569411764705881e-05, + "loss": 0.2846, + "step": 6075 + }, + { + "epoch": 357.65, + "grad_norm": 0.33547458052635193, + "learning_rate": 8.551764705882352e-05, + "loss": 0.2585, + "step": 6080 + }, + { + "epoch": 357.65, + "eval_loss": 0.26726043224334717, + "eval_runtime": 1.9532, + "eval_samples_per_second": 68.605, + "eval_steps_per_second": 8.704, + "step": 6080 + }, + { + "epoch": 357.94, + "grad_norm": 0.3579460084438324, + "learning_rate": 8.534117647058823e-05, + "loss": 0.266, + "step": 6085 + }, + { + "epoch": 358.24, + "grad_norm": 0.3353218138217926, + "learning_rate": 8.516470588235294e-05, + "loss": 0.2668, + "step": 6090 + }, + { + "epoch": 358.53, + "grad_norm": 0.34978580474853516, + "learning_rate": 8.498823529411764e-05, + "loss": 0.2664, + "step": 6095 + }, + { + "epoch": 358.82, + "grad_norm": 0.3615849018096924, + "learning_rate": 8.481176470588236e-05, + "loss": 0.285, + "step": 6100 + }, + { + "epoch": 358.82, + "eval_loss": 0.26745954155921936, + "eval_runtime": 1.9509, + "eval_samples_per_second": 68.685, + "eval_steps_per_second": 8.714, + "step": 6100 + }, + { + "epoch": 359.12, + "grad_norm": 0.3914557993412018, + "learning_rate": 8.463529411764704e-05, + "loss": 0.2558, + "step": 6105 + }, + { + "epoch": 359.41, + "grad_norm": 0.37587404251098633, + "learning_rate": 8.445882352941175e-05, + "loss": 0.2775, + "step": 6110 + }, + { + "epoch": 359.71, + "grad_norm": 0.5058900117874146, + "learning_rate": 8.428235294117646e-05, + "loss": 0.2762, + "step": 6115 + }, + { + "epoch": 360.0, + "grad_norm": 0.3985624313354492, + "learning_rate": 8.410588235294117e-05, + "loss": 0.2725, + "step": 6120 + }, + { + "epoch": 360.0, + "eval_loss": 0.26724329590797424, + "eval_runtime": 1.9503, + "eval_samples_per_second": 68.707, + "eval_steps_per_second": 8.717, + "step": 6120 + }, + { + "epoch": 360.29, + "grad_norm": 0.31552571058273315, + "learning_rate": 8.392941176470587e-05, + "loss": 0.2622, + "step": 6125 + }, + { + "epoch": 360.59, + "grad_norm": 0.27004382014274597, + "learning_rate": 8.375294117647059e-05, + "loss": 0.249, + "step": 6130 + }, + { + "epoch": 360.88, + "grad_norm": 0.5702885389328003, + "learning_rate": 8.357647058823528e-05, + "loss": 0.3053, + "step": 6135 + }, + { + "epoch": 361.18, + "grad_norm": 0.32766902446746826, + "learning_rate": 8.34e-05, + "loss": 0.2709, + "step": 6140 + }, + { + "epoch": 361.18, + "eval_loss": 0.26711100339889526, + "eval_runtime": 1.9537, + "eval_samples_per_second": 68.589, + "eval_steps_per_second": 8.702, + "step": 6140 + }, + { + "epoch": 361.47, + "grad_norm": 0.3389482796192169, + "learning_rate": 8.32235294117647e-05, + "loss": 0.2733, + "step": 6145 + }, + { + "epoch": 361.76, + "grad_norm": 0.49755939841270447, + "learning_rate": 8.30470588235294e-05, + "loss": 0.2702, + "step": 6150 + }, + { + "epoch": 362.06, + "grad_norm": 0.4386669993400574, + "learning_rate": 8.287058823529412e-05, + "loss": 0.2936, + "step": 6155 + }, + { + "epoch": 362.35, + "grad_norm": 0.3359367549419403, + "learning_rate": 8.269411764705882e-05, + "loss": 0.2474, + "step": 6160 + }, + { + "epoch": 362.35, + "eval_loss": 0.2672062814235687, + "eval_runtime": 1.95, + "eval_samples_per_second": 68.717, + "eval_steps_per_second": 8.718, + "step": 6160 + }, + { + "epoch": 362.65, + "grad_norm": 0.40593552589416504, + "learning_rate": 8.251764705882351e-05, + "loss": 0.2754, + "step": 6165 + }, + { + "epoch": 362.94, + "grad_norm": 0.3315235376358032, + "learning_rate": 8.234117647058823e-05, + "loss": 0.2752, + "step": 6170 + }, + { + "epoch": 363.24, + "grad_norm": 0.3803354799747467, + "learning_rate": 8.216470588235293e-05, + "loss": 0.2812, + "step": 6175 + }, + { + "epoch": 363.53, + "grad_norm": 0.4206208884716034, + "learning_rate": 8.198823529411763e-05, + "loss": 0.272, + "step": 6180 + }, + { + "epoch": 363.53, + "eval_loss": 0.2671860456466675, + "eval_runtime": 1.9889, + "eval_samples_per_second": 67.374, + "eval_steps_per_second": 8.547, + "step": 6180 + }, + { + "epoch": 363.82, + "grad_norm": 0.39495155215263367, + "learning_rate": 8.181176470588235e-05, + "loss": 0.283, + "step": 6185 + }, + { + "epoch": 364.12, + "grad_norm": 0.382572203874588, + "learning_rate": 8.163529411764705e-05, + "loss": 0.2761, + "step": 6190 + }, + { + "epoch": 364.41, + "grad_norm": 0.38203752040863037, + "learning_rate": 8.145882352941177e-05, + "loss": 0.2669, + "step": 6195 + }, + { + "epoch": 364.71, + "grad_norm": 0.3476529121398926, + "learning_rate": 8.128235294117646e-05, + "loss": 0.2791, + "step": 6200 + }, + { + "epoch": 364.71, + "eval_loss": 0.26738208532333374, + "eval_runtime": 1.9518, + "eval_samples_per_second": 68.655, + "eval_steps_per_second": 8.71, + "step": 6200 + }, + { + "epoch": 365.0, + "grad_norm": 0.36798906326293945, + "learning_rate": 8.110588235294116e-05, + "loss": 0.2665, + "step": 6205 + }, + { + "epoch": 365.29, + "grad_norm": 0.31820008158683777, + "learning_rate": 8.092941176470588e-05, + "loss": 0.2755, + "step": 6210 + }, + { + "epoch": 365.59, + "grad_norm": 0.33463528752326965, + "learning_rate": 8.075294117647058e-05, + "loss": 0.2523, + "step": 6215 + }, + { + "epoch": 365.88, + "grad_norm": 0.4484810531139374, + "learning_rate": 8.057647058823529e-05, + "loss": 0.2756, + "step": 6220 + }, + { + "epoch": 365.88, + "eval_loss": 0.26725342869758606, + "eval_runtime": 1.9685, + "eval_samples_per_second": 68.071, + "eval_steps_per_second": 8.636, + "step": 6220 + }, + { + "epoch": 366.18, + "grad_norm": 0.26314979791641235, + "learning_rate": 8.04e-05, + "loss": 0.2655, + "step": 6225 + }, + { + "epoch": 366.47, + "grad_norm": 0.40340784192085266, + "learning_rate": 8.022352941176469e-05, + "loss": 0.3024, + "step": 6230 + }, + { + "epoch": 366.76, + "grad_norm": 0.2637499272823334, + "learning_rate": 8.00470588235294e-05, + "loss": 0.2603, + "step": 6235 + }, + { + "epoch": 367.06, + "grad_norm": 0.3528923988342285, + "learning_rate": 7.987058823529411e-05, + "loss": 0.2688, + "step": 6240 + }, + { + "epoch": 367.06, + "eval_loss": 0.2671893239021301, + "eval_runtime": 1.9516, + "eval_samples_per_second": 68.663, + "eval_steps_per_second": 8.711, + "step": 6240 + }, + { + "epoch": 367.35, + "grad_norm": 0.40042927861213684, + "learning_rate": 7.969411764705882e-05, + "loss": 0.2826, + "step": 6245 + }, + { + "epoch": 367.65, + "grad_norm": 0.345426082611084, + "learning_rate": 7.951764705882353e-05, + "loss": 0.2552, + "step": 6250 + }, + { + "epoch": 367.94, + "grad_norm": 0.3973243236541748, + "learning_rate": 7.934117647058824e-05, + "loss": 0.2911, + "step": 6255 + }, + { + "epoch": 368.24, + "grad_norm": 0.35589733719825745, + "learning_rate": 7.916470588235293e-05, + "loss": 0.2586, + "step": 6260 + }, + { + "epoch": 368.24, + "eval_loss": 0.26706206798553467, + "eval_runtime": 1.9515, + "eval_samples_per_second": 68.666, + "eval_steps_per_second": 8.711, + "step": 6260 + }, + { + "epoch": 368.53, + "grad_norm": 0.35010915994644165, + "learning_rate": 7.898823529411764e-05, + "loss": 0.2677, + "step": 6265 + }, + { + "epoch": 368.82, + "grad_norm": 0.3606186509132385, + "learning_rate": 7.881176470588234e-05, + "loss": 0.2828, + "step": 6270 + }, + { + "epoch": 369.12, + "grad_norm": 0.24894337356090546, + "learning_rate": 7.863529411764705e-05, + "loss": 0.2718, + "step": 6275 + }, + { + "epoch": 369.41, + "grad_norm": 0.4606989622116089, + "learning_rate": 7.845882352941176e-05, + "loss": 0.2676, + "step": 6280 + }, + { + "epoch": 369.41, + "eval_loss": 0.26726627349853516, + "eval_runtime": 2.001, + "eval_samples_per_second": 66.965, + "eval_steps_per_second": 8.496, + "step": 6280 + }, + { + "epoch": 369.71, + "grad_norm": 0.29813820123672485, + "learning_rate": 7.828235294117647e-05, + "loss": 0.2713, + "step": 6285 + }, + { + "epoch": 370.0, + "grad_norm": 0.4744376838207245, + "learning_rate": 7.810588235294116e-05, + "loss": 0.2887, + "step": 6290 + }, + { + "epoch": 370.29, + "grad_norm": 0.3478952646255493, + "learning_rate": 7.792941176470587e-05, + "loss": 0.2637, + "step": 6295 + }, + { + "epoch": 370.59, + "grad_norm": 0.32131505012512207, + "learning_rate": 7.775294117647058e-05, + "loss": 0.2655, + "step": 6300 + }, + { + "epoch": 370.59, + "eval_loss": 0.2671908140182495, + "eval_runtime": 1.9773, + "eval_samples_per_second": 67.77, + "eval_steps_per_second": 8.598, + "step": 6300 + }, + { + "epoch": 370.88, + "grad_norm": 0.32886752486228943, + "learning_rate": 7.75764705882353e-05, + "loss": 0.2746, + "step": 6305 + }, + { + "epoch": 371.18, + "grad_norm": 0.41275689005851746, + "learning_rate": 7.74e-05, + "loss": 0.2714, + "step": 6310 + }, + { + "epoch": 371.47, + "grad_norm": 0.5604836940765381, + "learning_rate": 7.72235294117647e-05, + "loss": 0.2775, + "step": 6315 + }, + { + "epoch": 371.76, + "grad_norm": 0.34156864881515503, + "learning_rate": 7.704705882352942e-05, + "loss": 0.2739, + "step": 6320 + }, + { + "epoch": 371.76, + "eval_loss": 0.2672483026981354, + "eval_runtime": 1.9532, + "eval_samples_per_second": 68.604, + "eval_steps_per_second": 8.704, + "step": 6320 + }, + { + "epoch": 372.06, + "grad_norm": 0.3203223943710327, + "learning_rate": 7.687058823529411e-05, + "loss": 0.2815, + "step": 6325 + }, + { + "epoch": 372.35, + "grad_norm": 0.422767698764801, + "learning_rate": 7.669411764705881e-05, + "loss": 0.2758, + "step": 6330 + }, + { + "epoch": 372.65, + "grad_norm": 0.41294363141059875, + "learning_rate": 7.651764705882353e-05, + "loss": 0.279, + "step": 6335 + }, + { + "epoch": 372.94, + "grad_norm": 0.3619501292705536, + "learning_rate": 7.634117647058823e-05, + "loss": 0.2651, + "step": 6340 + }, + { + "epoch": 372.94, + "eval_loss": 0.26729047298431396, + "eval_runtime": 1.9823, + "eval_samples_per_second": 67.597, + "eval_steps_per_second": 8.576, + "step": 6340 + }, + { + "epoch": 373.24, + "grad_norm": 0.3015596568584442, + "learning_rate": 7.616470588235293e-05, + "loss": 0.2852, + "step": 6345 + }, + { + "epoch": 373.53, + "grad_norm": 0.39669787883758545, + "learning_rate": 7.598823529411765e-05, + "loss": 0.2639, + "step": 6350 + }, + { + "epoch": 373.82, + "grad_norm": 0.3523363471031189, + "learning_rate": 7.581176470588234e-05, + "loss": 0.2781, + "step": 6355 + }, + { + "epoch": 374.12, + "grad_norm": 0.29564139246940613, + "learning_rate": 7.563529411764704e-05, + "loss": 0.261, + "step": 6360 + }, + { + "epoch": 374.12, + "eval_loss": 0.26733723282814026, + "eval_runtime": 1.9745, + "eval_samples_per_second": 67.867, + "eval_steps_per_second": 8.61, + "step": 6360 + }, + { + "epoch": 374.41, + "grad_norm": 0.3095841407775879, + "learning_rate": 7.545882352941176e-05, + "loss": 0.2655, + "step": 6365 + }, + { + "epoch": 374.71, + "grad_norm": 0.43749403953552246, + "learning_rate": 7.528235294117646e-05, + "loss": 0.2751, + "step": 6370 + }, + { + "epoch": 375.0, + "grad_norm": 0.6555472016334534, + "learning_rate": 7.510588235294118e-05, + "loss": 0.2882, + "step": 6375 + }, + { + "epoch": 375.29, + "grad_norm": 0.30169016122817993, + "learning_rate": 7.492941176470588e-05, + "loss": 0.2784, + "step": 6380 + }, + { + "epoch": 375.29, + "eval_loss": 0.2671310007572174, + "eval_runtime": 1.952, + "eval_samples_per_second": 68.648, + "eval_steps_per_second": 8.709, + "step": 6380 + }, + { + "epoch": 375.59, + "grad_norm": 0.3225093185901642, + "learning_rate": 7.475294117647059e-05, + "loss": 0.2813, + "step": 6385 + }, + { + "epoch": 375.88, + "grad_norm": 0.3392583429813385, + "learning_rate": 7.457647058823529e-05, + "loss": 0.2613, + "step": 6390 + }, + { + "epoch": 376.18, + "grad_norm": 0.6147930026054382, + "learning_rate": 7.439999999999999e-05, + "loss": 0.2744, + "step": 6395 + }, + { + "epoch": 376.47, + "grad_norm": 0.3492681384086609, + "learning_rate": 7.42235294117647e-05, + "loss": 0.2715, + "step": 6400 + }, + { + "epoch": 376.47, + "eval_loss": 0.2673848271369934, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.651, + "eval_steps_per_second": 8.709, + "step": 6400 + }, + { + "epoch": 376.76, + "grad_norm": 0.35173168778419495, + "learning_rate": 7.404705882352941e-05, + "loss": 0.2574, + "step": 6405 + }, + { + "epoch": 377.06, + "grad_norm": 0.36092716455459595, + "learning_rate": 7.387058823529412e-05, + "loss": 0.2788, + "step": 6410 + }, + { + "epoch": 377.35, + "grad_norm": 0.40118205547332764, + "learning_rate": 7.369411764705882e-05, + "loss": 0.2688, + "step": 6415 + }, + { + "epoch": 377.65, + "grad_norm": 0.2681124806404114, + "learning_rate": 7.351764705882352e-05, + "loss": 0.2597, + "step": 6420 + }, + { + "epoch": 377.65, + "eval_loss": 0.2672716975212097, + "eval_runtime": 1.9486, + "eval_samples_per_second": 68.767, + "eval_steps_per_second": 8.724, + "step": 6420 + }, + { + "epoch": 377.94, + "grad_norm": 0.464854896068573, + "learning_rate": 7.334117647058823e-05, + "loss": 0.2806, + "step": 6425 + }, + { + "epoch": 378.24, + "grad_norm": 0.320843368768692, + "learning_rate": 7.316470588235293e-05, + "loss": 0.3152, + "step": 6430 + }, + { + "epoch": 378.53, + "grad_norm": 0.37584948539733887, + "learning_rate": 7.298823529411765e-05, + "loss": 0.2655, + "step": 6435 + }, + { + "epoch": 378.82, + "grad_norm": 0.2702130377292633, + "learning_rate": 7.281176470588235e-05, + "loss": 0.2666, + "step": 6440 + }, + { + "epoch": 378.82, + "eval_loss": 0.2674597501754761, + "eval_runtime": 1.9517, + "eval_samples_per_second": 68.66, + "eval_steps_per_second": 8.711, + "step": 6440 + }, + { + "epoch": 379.12, + "grad_norm": 0.3348217308521271, + "learning_rate": 7.263529411764705e-05, + "loss": 0.2584, + "step": 6445 + }, + { + "epoch": 379.41, + "grad_norm": 0.3533666431903839, + "learning_rate": 7.245882352941175e-05, + "loss": 0.2639, + "step": 6450 + }, + { + "epoch": 379.71, + "grad_norm": 0.37202340364456177, + "learning_rate": 7.228235294117647e-05, + "loss": 0.2823, + "step": 6455 + }, + { + "epoch": 380.0, + "grad_norm": 0.497489333152771, + "learning_rate": 7.210588235294116e-05, + "loss": 0.2798, + "step": 6460 + }, + { + "epoch": 380.0, + "eval_loss": 0.2671857178211212, + "eval_runtime": 1.9758, + "eval_samples_per_second": 67.822, + "eval_steps_per_second": 8.604, + "step": 6460 + }, + { + "epoch": 380.29, + "grad_norm": 0.293053537607193, + "learning_rate": 7.192941176470588e-05, + "loss": 0.2749, + "step": 6465 + }, + { + "epoch": 380.59, + "grad_norm": 0.2904147505760193, + "learning_rate": 7.175294117647058e-05, + "loss": 0.2701, + "step": 6470 + }, + { + "epoch": 380.88, + "grad_norm": 0.353919118642807, + "learning_rate": 7.157647058823528e-05, + "loss": 0.2676, + "step": 6475 + }, + { + "epoch": 381.18, + "grad_norm": 0.3408125340938568, + "learning_rate": 7.139999999999999e-05, + "loss": 0.2748, + "step": 6480 + }, + { + "epoch": 381.18, + "eval_loss": 0.2671048045158386, + "eval_runtime": 1.9972, + "eval_samples_per_second": 67.095, + "eval_steps_per_second": 8.512, + "step": 6480 + }, + { + "epoch": 381.47, + "grad_norm": 0.4919535219669342, + "learning_rate": 7.12235294117647e-05, + "loss": 0.2826, + "step": 6485 + }, + { + "epoch": 381.76, + "grad_norm": 0.3266085386276245, + "learning_rate": 7.104705882352941e-05, + "loss": 0.2664, + "step": 6490 + }, + { + "epoch": 382.06, + "grad_norm": 0.4877208471298218, + "learning_rate": 7.087058823529411e-05, + "loss": 0.2722, + "step": 6495 + }, + { + "epoch": 382.35, + "grad_norm": 0.396945595741272, + "learning_rate": 7.069411764705881e-05, + "loss": 0.2637, + "step": 6500 + }, + { + "epoch": 382.35, + "eval_loss": 0.2670700252056122, + "eval_runtime": 1.9777, + "eval_samples_per_second": 67.754, + "eval_steps_per_second": 8.596, + "step": 6500 + }, + { + "epoch": 382.65, + "grad_norm": 0.4301978349685669, + "learning_rate": 7.051764705882352e-05, + "loss": 0.269, + "step": 6505 + }, + { + "epoch": 382.94, + "grad_norm": 0.3541216254234314, + "learning_rate": 7.034117647058823e-05, + "loss": 0.2743, + "step": 6510 + }, + { + "epoch": 383.24, + "grad_norm": 0.39344117045402527, + "learning_rate": 7.016470588235294e-05, + "loss": 0.2482, + "step": 6515 + }, + { + "epoch": 383.53, + "grad_norm": 0.30213138461112976, + "learning_rate": 6.998823529411764e-05, + "loss": 0.2859, + "step": 6520 + }, + { + "epoch": 383.53, + "eval_loss": 0.2671603858470917, + "eval_runtime": 2.0013, + "eval_samples_per_second": 66.957, + "eval_steps_per_second": 8.495, + "step": 6520 + }, + { + "epoch": 383.82, + "grad_norm": 0.34998735785484314, + "learning_rate": 6.981176470588234e-05, + "loss": 0.277, + "step": 6525 + }, + { + "epoch": 384.12, + "grad_norm": 0.290435254573822, + "learning_rate": 6.963529411764706e-05, + "loss": 0.2754, + "step": 6530 + }, + { + "epoch": 384.41, + "grad_norm": 0.27687838673591614, + "learning_rate": 6.945882352941175e-05, + "loss": 0.2547, + "step": 6535 + }, + { + "epoch": 384.71, + "grad_norm": 0.49060243368148804, + "learning_rate": 6.928235294117647e-05, + "loss": 0.2641, + "step": 6540 + }, + { + "epoch": 384.71, + "eval_loss": 0.2672353982925415, + "eval_runtime": 1.9805, + "eval_samples_per_second": 67.659, + "eval_steps_per_second": 8.584, + "step": 6540 + }, + { + "epoch": 385.0, + "grad_norm": 0.48018312454223633, + "learning_rate": 6.910588235294117e-05, + "loss": 0.3054, + "step": 6545 + }, + { + "epoch": 385.29, + "grad_norm": 0.3915199637413025, + "learning_rate": 6.892941176470589e-05, + "loss": 0.2648, + "step": 6550 + }, + { + "epoch": 385.59, + "grad_norm": 0.350553423166275, + "learning_rate": 6.875294117647058e-05, + "loss": 0.2675, + "step": 6555 + }, + { + "epoch": 385.88, + "grad_norm": 0.43172863125801086, + "learning_rate": 6.857647058823529e-05, + "loss": 0.2652, + "step": 6560 + }, + { + "epoch": 385.88, + "eval_loss": 0.26732397079467773, + "eval_runtime": 1.9508, + "eval_samples_per_second": 68.69, + "eval_steps_per_second": 8.714, + "step": 6560 + }, + { + "epoch": 386.18, + "grad_norm": 0.41070055961608887, + "learning_rate": 6.84e-05, + "loss": 0.2927, + "step": 6565 + }, + { + "epoch": 386.47, + "grad_norm": 0.37032806873321533, + "learning_rate": 6.82235294117647e-05, + "loss": 0.2775, + "step": 6570 + }, + { + "epoch": 386.76, + "grad_norm": 0.3668869435787201, + "learning_rate": 6.80470588235294e-05, + "loss": 0.2606, + "step": 6575 + }, + { + "epoch": 387.06, + "grad_norm": 0.3992270827293396, + "learning_rate": 6.787058823529412e-05, + "loss": 0.2881, + "step": 6580 + }, + { + "epoch": 387.06, + "eval_loss": 0.26705142855644226, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.682, + "eval_steps_per_second": 8.713, + "step": 6580 + }, + { + "epoch": 387.35, + "grad_norm": 0.4839495122432709, + "learning_rate": 6.769411764705882e-05, + "loss": 0.2864, + "step": 6585 + }, + { + "epoch": 387.65, + "grad_norm": 0.2888788878917694, + "learning_rate": 6.751764705882353e-05, + "loss": 0.2559, + "step": 6590 + }, + { + "epoch": 387.94, + "grad_norm": 0.37609121203422546, + "learning_rate": 6.734117647058823e-05, + "loss": 0.2681, + "step": 6595 + }, + { + "epoch": 388.24, + "grad_norm": 0.4350452125072479, + "learning_rate": 6.716470588235293e-05, + "loss": 0.2888, + "step": 6600 + }, + { + "epoch": 388.24, + "eval_loss": 0.2672191858291626, + "eval_runtime": 1.9496, + "eval_samples_per_second": 68.731, + "eval_steps_per_second": 8.72, + "step": 6600 + }, + { + "epoch": 388.53, + "grad_norm": 0.3134041130542755, + "learning_rate": 6.698823529411765e-05, + "loss": 0.2539, + "step": 6605 + }, + { + "epoch": 388.82, + "grad_norm": 0.4493381977081299, + "learning_rate": 6.681176470588235e-05, + "loss": 0.2712, + "step": 6610 + }, + { + "epoch": 389.12, + "grad_norm": 0.3677453398704529, + "learning_rate": 6.663529411764705e-05, + "loss": 0.2764, + "step": 6615 + }, + { + "epoch": 389.41, + "grad_norm": 0.28110626339912415, + "learning_rate": 6.645882352941176e-05, + "loss": 0.2749, + "step": 6620 + }, + { + "epoch": 389.41, + "eval_loss": 0.267000287771225, + "eval_runtime": 1.951, + "eval_samples_per_second": 68.683, + "eval_steps_per_second": 8.713, + "step": 6620 + }, + { + "epoch": 389.71, + "grad_norm": 0.2776085138320923, + "learning_rate": 6.628235294117646e-05, + "loss": 0.266, + "step": 6625 + }, + { + "epoch": 390.0, + "grad_norm": 0.3852093815803528, + "learning_rate": 6.610588235294116e-05, + "loss": 0.2641, + "step": 6630 + }, + { + "epoch": 390.29, + "grad_norm": 0.39009889960289, + "learning_rate": 6.592941176470588e-05, + "loss": 0.2801, + "step": 6635 + }, + { + "epoch": 390.59, + "grad_norm": 0.3021441400051117, + "learning_rate": 6.575294117647058e-05, + "loss": 0.2579, + "step": 6640 + }, + { + "epoch": 390.59, + "eval_loss": 0.26710474491119385, + "eval_runtime": 1.9513, + "eval_samples_per_second": 68.673, + "eval_steps_per_second": 8.712, + "step": 6640 + }, + { + "epoch": 390.88, + "grad_norm": 0.47458308935165405, + "learning_rate": 6.557647058823529e-05, + "loss": 0.2671, + "step": 6645 + }, + { + "epoch": 391.18, + "grad_norm": 0.3273366093635559, + "learning_rate": 6.539999999999999e-05, + "loss": 0.2824, + "step": 6650 + }, + { + "epoch": 391.47, + "grad_norm": 0.400155633687973, + "learning_rate": 6.522352941176471e-05, + "loss": 0.2659, + "step": 6655 + }, + { + "epoch": 391.76, + "grad_norm": 0.37741804122924805, + "learning_rate": 6.50470588235294e-05, + "loss": 0.2693, + "step": 6660 + }, + { + "epoch": 391.76, + "eval_loss": 0.26707640290260315, + "eval_runtime": 1.9525, + "eval_samples_per_second": 68.632, + "eval_steps_per_second": 8.707, + "step": 6660 + }, + { + "epoch": 392.06, + "grad_norm": 0.30003881454467773, + "learning_rate": 6.487058823529411e-05, + "loss": 0.2649, + "step": 6665 + }, + { + "epoch": 392.35, + "grad_norm": 0.4413745403289795, + "learning_rate": 6.469411764705882e-05, + "loss": 0.2581, + "step": 6670 + }, + { + "epoch": 392.65, + "grad_norm": 0.35154813528060913, + "learning_rate": 6.451764705882352e-05, + "loss": 0.2882, + "step": 6675 + }, + { + "epoch": 392.94, + "grad_norm": 0.4098171293735504, + "learning_rate": 6.434117647058822e-05, + "loss": 0.291, + "step": 6680 + }, + { + "epoch": 392.94, + "eval_loss": 0.2669891119003296, + "eval_runtime": 1.9511, + "eval_samples_per_second": 68.679, + "eval_steps_per_second": 8.713, + "step": 6680 + }, + { + "epoch": 393.24, + "grad_norm": 0.39548414945602417, + "learning_rate": 6.416470588235294e-05, + "loss": 0.2583, + "step": 6685 + }, + { + "epoch": 393.53, + "grad_norm": 0.40910691022872925, + "learning_rate": 6.398823529411764e-05, + "loss": 0.2797, + "step": 6690 + }, + { + "epoch": 393.82, + "grad_norm": 0.33205562829971313, + "learning_rate": 6.381176470588235e-05, + "loss": 0.2644, + "step": 6695 + }, + { + "epoch": 394.12, + "grad_norm": 0.36516687273979187, + "learning_rate": 6.363529411764705e-05, + "loss": 0.2715, + "step": 6700 + }, + { + "epoch": 394.12, + "eval_loss": 0.2669064998626709, + "eval_runtime": 1.9522, + "eval_samples_per_second": 68.64, + "eval_steps_per_second": 8.708, + "step": 6700 + }, + { + "epoch": 394.41, + "grad_norm": 0.40672898292541504, + "learning_rate": 6.345882352941175e-05, + "loss": 0.26, + "step": 6705 + }, + { + "epoch": 394.71, + "grad_norm": 0.4095798134803772, + "learning_rate": 6.328235294117647e-05, + "loss": 0.2826, + "step": 6710 + }, + { + "epoch": 395.0, + "grad_norm": 0.4764916002750397, + "learning_rate": 6.310588235294117e-05, + "loss": 0.2774, + "step": 6715 + }, + { + "epoch": 395.29, + "grad_norm": 0.37296828627586365, + "learning_rate": 6.292941176470588e-05, + "loss": 0.2547, + "step": 6720 + }, + { + "epoch": 395.29, + "eval_loss": 0.2669101357460022, + "eval_runtime": 1.9507, + "eval_samples_per_second": 68.695, + "eval_steps_per_second": 8.715, + "step": 6720 + }, + { + "epoch": 395.59, + "grad_norm": 0.44701212644577026, + "learning_rate": 6.275294117647058e-05, + "loss": 0.2932, + "step": 6725 + }, + { + "epoch": 395.88, + "grad_norm": 0.26783275604248047, + "learning_rate": 6.25764705882353e-05, + "loss": 0.2638, + "step": 6730 + }, + { + "epoch": 396.18, + "grad_norm": 0.32074496150016785, + "learning_rate": 6.239999999999999e-05, + "loss": 0.2608, + "step": 6735 + }, + { + "epoch": 396.47, + "grad_norm": 0.43822526931762695, + "learning_rate": 6.22235294117647e-05, + "loss": 0.2872, + "step": 6740 + }, + { + "epoch": 396.47, + "eval_loss": 0.26694077253341675, + "eval_runtime": 1.9497, + "eval_samples_per_second": 68.729, + "eval_steps_per_second": 8.719, + "step": 6740 + }, + { + "epoch": 396.76, + "grad_norm": 0.3225814998149872, + "learning_rate": 6.20470588235294e-05, + "loss": 0.2589, + "step": 6745 + }, + { + "epoch": 397.06, + "grad_norm": 0.34406936168670654, + "learning_rate": 6.187058823529412e-05, + "loss": 0.2712, + "step": 6750 + }, + { + "epoch": 397.35, + "grad_norm": 0.32372158765792847, + "learning_rate": 6.169411764705881e-05, + "loss": 0.2791, + "step": 6755 + }, + { + "epoch": 397.65, + "grad_norm": 0.34214866161346436, + "learning_rate": 6.151764705882353e-05, + "loss": 0.2718, + "step": 6760 + }, + { + "epoch": 397.65, + "eval_loss": 0.26730066537857056, + "eval_runtime": 1.9657, + "eval_samples_per_second": 68.167, + "eval_steps_per_second": 8.648, + "step": 6760 + }, + { + "epoch": 397.94, + "grad_norm": 0.2773337960243225, + "learning_rate": 6.134117647058823e-05, + "loss": 0.256, + "step": 6765 + }, + { + "epoch": 398.24, + "grad_norm": 0.32852426171302795, + "learning_rate": 6.116470588235293e-05, + "loss": 0.2566, + "step": 6770 + }, + { + "epoch": 398.53, + "grad_norm": 0.34852075576782227, + "learning_rate": 6.098823529411764e-05, + "loss": 0.2627, + "step": 6775 + }, + { + "epoch": 398.82, + "grad_norm": 0.38895657658576965, + "learning_rate": 6.081176470588235e-05, + "loss": 0.2891, + "step": 6780 + }, + { + "epoch": 398.82, + "eval_loss": 0.2669762670993805, + "eval_runtime": 1.9512, + "eval_samples_per_second": 68.677, + "eval_steps_per_second": 8.713, + "step": 6780 + }, + { + "epoch": 399.12, + "grad_norm": 0.3377654552459717, + "learning_rate": 6.063529411764706e-05, + "loss": 0.2756, + "step": 6785 + }, + { + "epoch": 399.41, + "grad_norm": 0.4323594272136688, + "learning_rate": 6.0458823529411754e-05, + "loss": 0.265, + "step": 6790 + }, + { + "epoch": 399.71, + "grad_norm": 0.3999699056148529, + "learning_rate": 6.0282352941176464e-05, + "loss": 0.2777, + "step": 6795 + }, + { + "epoch": 400.0, + "grad_norm": 0.49368980526924133, + "learning_rate": 6.0105882352941174e-05, + "loss": 0.2737, + "step": 6800 + }, + { + "epoch": 400.0, + "eval_loss": 0.26709508895874023, + "eval_runtime": 1.9523, + "eval_samples_per_second": 68.638, + "eval_steps_per_second": 8.708, + "step": 6800 + }, + { + "epoch": 400.29, + "grad_norm": 0.31394705176353455, + "learning_rate": 5.992941176470587e-05, + "loss": 0.2771, + "step": 6805 + }, + { + "epoch": 400.59, + "grad_norm": 0.44644689559936523, + "learning_rate": 5.975294117647058e-05, + "loss": 0.2786, + "step": 6810 + }, + { + "epoch": 400.88, + "grad_norm": 0.3081410825252533, + "learning_rate": 5.957647058823529e-05, + "loss": 0.2644, + "step": 6815 + }, + { + "epoch": 401.18, + "grad_norm": 0.3674854338169098, + "learning_rate": 5.94e-05, + "loss": 0.2564, + "step": 6820 + }, + { + "epoch": 401.18, + "eval_loss": 0.2669694721698761, + "eval_runtime": 1.9545, + "eval_samples_per_second": 68.561, + "eval_steps_per_second": 8.698, + "step": 6820 + }, + { + "epoch": 401.47, + "grad_norm": 0.4225727915763855, + "learning_rate": 5.92235294117647e-05, + "loss": 0.2737, + "step": 6825 + }, + { + "epoch": 401.76, + "grad_norm": 0.39366257190704346, + "learning_rate": 5.904705882352941e-05, + "loss": 0.2778, + "step": 6830 + }, + { + "epoch": 402.06, + "grad_norm": 0.3397902250289917, + "learning_rate": 5.887058823529412e-05, + "loss": 0.2662, + "step": 6835 + }, + { + "epoch": 402.35, + "grad_norm": 0.398686021566391, + "learning_rate": 5.869411764705882e-05, + "loss": 0.2804, + "step": 6840 + }, + { + "epoch": 402.35, + "eval_loss": 0.2669714093208313, + "eval_runtime": 1.9692, + "eval_samples_per_second": 68.047, + "eval_steps_per_second": 8.633, + "step": 6840 + }, + { + "epoch": 402.65, + "grad_norm": 0.3490874767303467, + "learning_rate": 5.851764705882352e-05, + "loss": 0.2625, + "step": 6845 + }, + { + "epoch": 402.94, + "grad_norm": 0.34351468086242676, + "learning_rate": 5.834117647058823e-05, + "loss": 0.2674, + "step": 6850 + }, + { + "epoch": 403.24, + "grad_norm": 0.2977721095085144, + "learning_rate": 5.8164705882352936e-05, + "loss": 0.2581, + "step": 6855 + }, + { + "epoch": 403.53, + "grad_norm": 0.3474538326263428, + "learning_rate": 5.798823529411764e-05, + "loss": 0.2835, + "step": 6860 + }, + { + "epoch": 403.53, + "eval_loss": 0.2669595777988434, + "eval_runtime": 1.9527, + "eval_samples_per_second": 68.623, + "eval_steps_per_second": 8.706, + "step": 6860 + }, + { + "epoch": 403.82, + "grad_norm": 0.3499893546104431, + "learning_rate": 5.781176470588235e-05, + "loss": 0.2606, + "step": 6865 + }, + { + "epoch": 404.12, + "grad_norm": 0.33697545528411865, + "learning_rate": 5.763529411764705e-05, + "loss": 0.2674, + "step": 6870 + }, + { + "epoch": 404.41, + "grad_norm": 0.38439154624938965, + "learning_rate": 5.745882352941176e-05, + "loss": 0.2804, + "step": 6875 + }, + { + "epoch": 404.71, + "grad_norm": 0.23312297463417053, + "learning_rate": 5.7282352941176466e-05, + "loss": 0.257, + "step": 6880 + }, + { + "epoch": 404.71, + "eval_loss": 0.2669928967952728, + "eval_runtime": 1.9506, + "eval_samples_per_second": 68.697, + "eval_steps_per_second": 8.715, + "step": 6880 + }, + { + "epoch": 405.0, + "grad_norm": 0.49800968170166016, + "learning_rate": 5.710588235294117e-05, + "loss": 0.2721, + "step": 6885 + }, + { + "epoch": 405.29, + "grad_norm": 0.2819765508174896, + "learning_rate": 5.692941176470588e-05, + "loss": 0.2489, + "step": 6890 + }, + { + "epoch": 405.59, + "grad_norm": 0.33795782923698425, + "learning_rate": 5.675294117647059e-05, + "loss": 0.2789, + "step": 6895 + }, + { + "epoch": 405.88, + "grad_norm": 0.453967809677124, + "learning_rate": 5.6576470588235285e-05, + "loss": 0.2767, + "step": 6900 + }, + { + "epoch": 405.88, + "eval_loss": 0.2670580744743347, + "eval_runtime": 1.9519, + "eval_samples_per_second": 68.65, + "eval_steps_per_second": 8.709, + "step": 6900 + }, + { + "epoch": 406.18, + "grad_norm": 0.2979728579521179, + "learning_rate": 5.6399999999999995e-05, + "loss": 0.2809, + "step": 6905 + }, + { + "epoch": 406.47, + "grad_norm": 0.3364047408103943, + "learning_rate": 5.6223529411764705e-05, + "loss": 0.257, + "step": 6910 + }, + { + "epoch": 406.76, + "grad_norm": 0.44577115774154663, + "learning_rate": 5.60470588235294e-05, + "loss": 0.2696, + "step": 6915 + }, + { + "epoch": 407.06, + "grad_norm": 0.3086043894290924, + "learning_rate": 5.587058823529411e-05, + "loss": 0.2733, + "step": 6920 + }, + { + "epoch": 407.06, + "eval_loss": 0.26697295904159546, + "eval_runtime": 1.9622, + "eval_samples_per_second": 68.289, + "eval_steps_per_second": 8.664, + "step": 6920 + }, + { + "epoch": 407.35, + "grad_norm": 0.3703622817993164, + "learning_rate": 5.569411764705882e-05, + "loss": 0.2844, + "step": 6925 + }, + { + "epoch": 407.65, + "grad_norm": 0.40562862157821655, + "learning_rate": 5.5517647058823525e-05, + "loss": 0.2734, + "step": 6930 + }, + { + "epoch": 407.94, + "grad_norm": 0.3966273069381714, + "learning_rate": 5.534117647058823e-05, + "loss": 0.2598, + "step": 6935 + }, + { + "epoch": 408.24, + "grad_norm": 0.3033981919288635, + "learning_rate": 5.516470588235294e-05, + "loss": 0.2655, + "step": 6940 + }, + { + "epoch": 408.24, + "eval_loss": 0.2669752836227417, + "eval_runtime": 1.957, + "eval_samples_per_second": 68.472, + "eval_steps_per_second": 8.687, + "step": 6940 + }, + { + "epoch": 408.53, + "grad_norm": 0.3142114281654358, + "learning_rate": 5.498823529411764e-05, + "loss": 0.2776, + "step": 6945 + }, + { + "epoch": 408.82, + "grad_norm": 0.36577939987182617, + "learning_rate": 5.481176470588235e-05, + "loss": 0.2552, + "step": 6950 + }, + { + "epoch": 409.12, + "grad_norm": 0.3394526243209839, + "learning_rate": 5.4635294117647054e-05, + "loss": 0.2797, + "step": 6955 + }, + { + "epoch": 409.41, + "grad_norm": 0.3619946241378784, + "learning_rate": 5.445882352941176e-05, + "loss": 0.2642, + "step": 6960 + }, + { + "epoch": 409.41, + "eval_loss": 0.26693862676620483, + "eval_runtime": 1.9559, + "eval_samples_per_second": 68.511, + "eval_steps_per_second": 8.692, + "step": 6960 + }, + { + "epoch": 409.71, + "grad_norm": 0.4088079035282135, + "learning_rate": 5.428235294117647e-05, + "loss": 0.2753, + "step": 6965 + }, + { + "epoch": 410.0, + "grad_norm": 0.37255966663360596, + "learning_rate": 5.410588235294117e-05, + "loss": 0.2699, + "step": 6970 + }, + { + "epoch": 410.29, + "grad_norm": 0.35212811827659607, + "learning_rate": 5.3929411764705874e-05, + "loss": 0.2689, + "step": 6975 + }, + { + "epoch": 410.59, + "grad_norm": 0.2719222605228424, + "learning_rate": 5.3752941176470584e-05, + "loss": 0.2639, + "step": 6980 + }, + { + "epoch": 410.59, + "eval_loss": 0.2669658660888672, + "eval_runtime": 1.9622, + "eval_samples_per_second": 68.291, + "eval_steps_per_second": 8.664, + "step": 6980 + }, + { + "epoch": 410.88, + "grad_norm": 0.3696828782558441, + "learning_rate": 5.3576470588235294e-05, + "loss": 0.2681, + "step": 6985 + }, + { + "epoch": 411.18, + "grad_norm": 0.3527109920978546, + "learning_rate": 5.339999999999999e-05, + "loss": 0.2818, + "step": 6990 + }, + { + "epoch": 411.47, + "grad_norm": 0.3617575466632843, + "learning_rate": 5.32235294117647e-05, + "loss": 0.2919, + "step": 6995 + }, + { + "epoch": 411.76, + "grad_norm": 0.29520612955093384, + "learning_rate": 5.304705882352941e-05, + "loss": 0.2634, + "step": 7000 + }, + { + "epoch": 411.76, + "eval_loss": 0.26695409417152405, + "eval_runtime": 1.958, + "eval_samples_per_second": 68.438, + "eval_steps_per_second": 8.682, + "step": 7000 + }, + { + "epoch": 412.06, + "grad_norm": 0.4025411307811737, + "learning_rate": 5.2870588235294107e-05, + "loss": 0.2577, + "step": 7005 + }, + { + "epoch": 412.35, + "grad_norm": 0.3489183485507965, + "learning_rate": 5.2694117647058816e-05, + "loss": 0.2771, + "step": 7010 + }, + { + "epoch": 412.65, + "grad_norm": 0.36042025685310364, + "learning_rate": 5.2517647058823526e-05, + "loss": 0.2702, + "step": 7015 + }, + { + "epoch": 412.94, + "grad_norm": 0.4295525550842285, + "learning_rate": 5.2341176470588236e-05, + "loss": 0.272, + "step": 7020 + }, + { + "epoch": 412.94, + "eval_loss": 0.26698940992355347, + "eval_runtime": 1.9555, + "eval_samples_per_second": 68.525, + "eval_steps_per_second": 8.694, + "step": 7020 + }, + { + "epoch": 413.24, + "grad_norm": 0.37152427434921265, + "learning_rate": 5.216470588235293e-05, + "loss": 0.2546, + "step": 7025 + }, + { + "epoch": 413.53, + "grad_norm": 0.4405229091644287, + "learning_rate": 5.198823529411764e-05, + "loss": 0.2868, + "step": 7030 + }, + { + "epoch": 413.82, + "grad_norm": 0.376372367143631, + "learning_rate": 5.181176470588235e-05, + "loss": 0.2743, + "step": 7035 + }, + { + "epoch": 414.12, + "grad_norm": 0.331646591424942, + "learning_rate": 5.1635294117647056e-05, + "loss": 0.2454, + "step": 7040 + }, + { + "epoch": 414.12, + "eval_loss": 0.2668600082397461, + "eval_runtime": 1.9559, + "eval_samples_per_second": 68.509, + "eval_steps_per_second": 8.691, + "step": 7040 + }, + { + "epoch": 414.41, + "grad_norm": 0.36862948536872864, + "learning_rate": 5.145882352941176e-05, + "loss": 0.2767, + "step": 7045 + }, + { + "epoch": 414.71, + "grad_norm": 0.3542099893093109, + "learning_rate": 5.128235294117647e-05, + "loss": 0.2649, + "step": 7050 + }, + { + "epoch": 415.0, + "grad_norm": 0.4815574586391449, + "learning_rate": 5.110588235294117e-05, + "loss": 0.2819, + "step": 7055 + }, + { + "epoch": 415.29, + "grad_norm": 0.43821844458580017, + "learning_rate": 5.0929411764705875e-05, + "loss": 0.2809, + "step": 7060 + }, + { + "epoch": 415.29, + "eval_loss": 0.2668548822402954, + "eval_runtime": 1.9577, + "eval_samples_per_second": 68.447, + "eval_steps_per_second": 8.684, + "step": 7060 + }, + { + "epoch": 415.59, + "grad_norm": 0.38886937499046326, + "learning_rate": 5.0752941176470585e-05, + "loss": 0.2638, + "step": 7065 + }, + { + "epoch": 415.88, + "grad_norm": 0.40317365527153015, + "learning_rate": 5.057647058823529e-05, + "loss": 0.2812, + "step": 7070 + }, + { + "epoch": 416.18, + "grad_norm": 0.3852801024913788, + "learning_rate": 5.04e-05, + "loss": 0.2613, + "step": 7075 + }, + { + "epoch": 416.47, + "grad_norm": 0.3388362228870392, + "learning_rate": 5.02235294117647e-05, + "loss": 0.2594, + "step": 7080 + }, + { + "epoch": 416.47, + "eval_loss": 0.2670306861400604, + "eval_runtime": 1.9573, + "eval_samples_per_second": 68.461, + "eval_steps_per_second": 8.685, + "step": 7080 + }, + { + "epoch": 416.76, + "grad_norm": 0.2896983027458191, + "learning_rate": 5.0047058823529405e-05, + "loss": 0.2917, + "step": 7085 + }, + { + "epoch": 417.06, + "grad_norm": 0.3077000081539154, + "learning_rate": 4.9870588235294115e-05, + "loss": 0.2619, + "step": 7090 + }, + { + "epoch": 417.35, + "grad_norm": 0.4406202733516693, + "learning_rate": 4.9694117647058825e-05, + "loss": 0.2743, + "step": 7095 + }, + { + "epoch": 417.65, + "grad_norm": 0.4249783158302307, + "learning_rate": 4.951764705882352e-05, + "loss": 0.2746, + "step": 7100 + }, + { + "epoch": 417.65, + "eval_loss": 0.2669161558151245, + "eval_runtime": 1.959, + "eval_samples_per_second": 68.402, + "eval_steps_per_second": 8.678, + "step": 7100 + }, + { + "epoch": 417.94, + "grad_norm": 0.3489026725292206, + "learning_rate": 4.934117647058823e-05, + "loss": 0.2611, + "step": 7105 + }, + { + "epoch": 418.24, + "grad_norm": 0.39304283261299133, + "learning_rate": 4.916470588235294e-05, + "loss": 0.2661, + "step": 7110 + }, + { + "epoch": 418.53, + "grad_norm": 0.4020162522792816, + "learning_rate": 4.898823529411764e-05, + "loss": 0.271, + "step": 7115 + }, + { + "epoch": 418.82, + "grad_norm": 0.42413970828056335, + "learning_rate": 4.881176470588235e-05, + "loss": 0.2718, + "step": 7120 + }, + { + "epoch": 418.82, + "eval_loss": 0.26709815859794617, + "eval_runtime": 1.9579, + "eval_samples_per_second": 68.442, + "eval_steps_per_second": 8.683, + "step": 7120 + }, + { + "epoch": 419.12, + "grad_norm": 0.39988934993743896, + "learning_rate": 4.863529411764706e-05, + "loss": 0.2568, + "step": 7125 + }, + { + "epoch": 419.41, + "grad_norm": 0.2976055145263672, + "learning_rate": 4.845882352941176e-05, + "loss": 0.2718, + "step": 7130 + }, + { + "epoch": 419.71, + "grad_norm": 0.3112695515155792, + "learning_rate": 4.8282352941176464e-05, + "loss": 0.2833, + "step": 7135 + }, + { + "epoch": 420.0, + "grad_norm": 0.612853467464447, + "learning_rate": 4.8105882352941174e-05, + "loss": 0.2698, + "step": 7140 + }, + { + "epoch": 420.0, + "eval_loss": 0.2669050991535187, + "eval_runtime": 1.9565, + "eval_samples_per_second": 68.488, + "eval_steps_per_second": 8.689, + "step": 7140 + }, + { + "epoch": 420.29, + "grad_norm": 0.4274137020111084, + "learning_rate": 4.792941176470588e-05, + "loss": 0.2625, + "step": 7145 + }, + { + "epoch": 420.59, + "grad_norm": 0.33665600419044495, + "learning_rate": 4.775294117647058e-05, + "loss": 0.263, + "step": 7150 + }, + { + "epoch": 420.88, + "grad_norm": 0.3614768385887146, + "learning_rate": 4.757647058823529e-05, + "loss": 0.2817, + "step": 7155 + }, + { + "epoch": 421.18, + "grad_norm": 0.36414796113967896, + "learning_rate": 4.7399999999999993e-05, + "loss": 0.268, + "step": 7160 + }, + { + "epoch": 421.18, + "eval_loss": 0.26688352227211, + "eval_runtime": 1.9632, + "eval_samples_per_second": 68.254, + "eval_steps_per_second": 8.659, + "step": 7160 + }, + { + "epoch": 421.47, + "grad_norm": 0.3113715350627899, + "learning_rate": 4.72235294117647e-05, + "loss": 0.2529, + "step": 7165 + }, + { + "epoch": 421.76, + "grad_norm": 0.3870895206928253, + "learning_rate": 4.7047058823529407e-05, + "loss": 0.2782, + "step": 7170 + }, + { + "epoch": 422.06, + "grad_norm": 0.3166474401950836, + "learning_rate": 4.687058823529411e-05, + "loss": 0.2785, + "step": 7175 + }, + { + "epoch": 422.35, + "grad_norm": 0.46040579676628113, + "learning_rate": 4.669411764705882e-05, + "loss": 0.2912, + "step": 7180 + }, + { + "epoch": 422.35, + "eval_loss": 0.26677265763282776, + "eval_runtime": 1.9607, + "eval_samples_per_second": 68.344, + "eval_steps_per_second": 8.671, + "step": 7180 + }, + { + "epoch": 422.65, + "grad_norm": 0.32868900895118713, + "learning_rate": 4.651764705882353e-05, + "loss": 0.2681, + "step": 7185 + }, + { + "epoch": 422.94, + "grad_norm": 0.2918817102909088, + "learning_rate": 4.6341176470588226e-05, + "loss": 0.2587, + "step": 7190 + }, + { + "epoch": 423.24, + "grad_norm": 0.36773785948753357, + "learning_rate": 4.6164705882352936e-05, + "loss": 0.2762, + "step": 7195 + }, + { + "epoch": 423.53, + "grad_norm": 0.3703620433807373, + "learning_rate": 4.5988235294117646e-05, + "loss": 0.2718, + "step": 7200 + }, + { + "epoch": 423.53, + "eval_loss": 0.267058789730072, + "eval_runtime": 1.9613, + "eval_samples_per_second": 68.322, + "eval_steps_per_second": 8.668, + "step": 7200 + }, + { + "epoch": 423.82, + "grad_norm": 0.3874302804470062, + "learning_rate": 4.581176470588234e-05, + "loss": 0.2743, + "step": 7205 + }, + { + "epoch": 424.12, + "grad_norm": 0.400434285402298, + "learning_rate": 4.563529411764705e-05, + "loss": 0.2618, + "step": 7210 + }, + { + "epoch": 424.41, + "grad_norm": 0.3008696138858795, + "learning_rate": 4.545882352941176e-05, + "loss": 0.269, + "step": 7215 + }, + { + "epoch": 424.71, + "grad_norm": 0.3740052580833435, + "learning_rate": 4.528235294117647e-05, + "loss": 0.2609, + "step": 7220 + }, + { + "epoch": 424.71, + "eval_loss": 0.267034113407135, + "eval_runtime": 1.9596, + "eval_samples_per_second": 68.383, + "eval_steps_per_second": 8.675, + "step": 7220 + }, + { + "epoch": 425.0, + "grad_norm": 0.4082770049571991, + "learning_rate": 4.510588235294117e-05, + "loss": 0.282, + "step": 7225 + }, + { + "epoch": 425.29, + "grad_norm": 0.3433248698711395, + "learning_rate": 4.492941176470588e-05, + "loss": 0.2706, + "step": 7230 + }, + { + "epoch": 425.59, + "grad_norm": 0.34400299191474915, + "learning_rate": 4.475294117647059e-05, + "loss": 0.2755, + "step": 7235 + }, + { + "epoch": 425.88, + "grad_norm": 0.43836483359336853, + "learning_rate": 4.457647058823529e-05, + "loss": 0.2574, + "step": 7240 + }, + { + "epoch": 425.88, + "eval_loss": 0.2668897807598114, + "eval_runtime": 1.9604, + "eval_samples_per_second": 68.352, + "eval_steps_per_second": 8.671, + "step": 7240 + }, + { + "epoch": 426.18, + "grad_norm": 0.4405163824558258, + "learning_rate": 4.4399999999999995e-05, + "loss": 0.2694, + "step": 7245 + }, + { + "epoch": 426.47, + "grad_norm": 0.3573856055736542, + "learning_rate": 4.4223529411764705e-05, + "loss": 0.2794, + "step": 7250 + }, + { + "epoch": 426.76, + "grad_norm": 0.2749679386615753, + "learning_rate": 4.404705882352941e-05, + "loss": 0.28, + "step": 7255 + }, + { + "epoch": 427.06, + "grad_norm": 0.4706021547317505, + "learning_rate": 4.387058823529411e-05, + "loss": 0.262, + "step": 7260 + }, + { + "epoch": 427.06, + "eval_loss": 0.2668740153312683, + "eval_runtime": 1.9612, + "eval_samples_per_second": 68.327, + "eval_steps_per_second": 8.668, + "step": 7260 + }, + { + "epoch": 427.35, + "grad_norm": 0.4040289521217346, + "learning_rate": 4.369411764705882e-05, + "loss": 0.2855, + "step": 7265 + }, + { + "epoch": 427.65, + "grad_norm": 0.3104715049266815, + "learning_rate": 4.3517647058823525e-05, + "loss": 0.2556, + "step": 7270 + }, + { + "epoch": 427.94, + "grad_norm": 0.3873388171195984, + "learning_rate": 4.3341176470588234e-05, + "loss": 0.2592, + "step": 7275 + }, + { + "epoch": 428.24, + "grad_norm": 0.3692169189453125, + "learning_rate": 4.316470588235294e-05, + "loss": 0.2576, + "step": 7280 + }, + { + "epoch": 428.24, + "eval_loss": 0.26689329743385315, + "eval_runtime": 1.9596, + "eval_samples_per_second": 68.382, + "eval_steps_per_second": 8.675, + "step": 7280 + }, + { + "epoch": 428.53, + "grad_norm": 0.3553586006164551, + "learning_rate": 4.298823529411764e-05, + "loss": 0.2658, + "step": 7285 + }, + { + "epoch": 428.82, + "grad_norm": 0.3793712556362152, + "learning_rate": 4.281176470588235e-05, + "loss": 0.2908, + "step": 7290 + }, + { + "epoch": 429.12, + "grad_norm": 0.34864869713783264, + "learning_rate": 4.263529411764706e-05, + "loss": 0.2817, + "step": 7295 + }, + { + "epoch": 429.41, + "grad_norm": 0.30609363317489624, + "learning_rate": 4.245882352941176e-05, + "loss": 0.2541, + "step": 7300 + }, + { + "epoch": 429.41, + "eval_loss": 0.2668551504611969, + "eval_runtime": 1.963, + "eval_samples_per_second": 68.262, + "eval_steps_per_second": 8.66, + "step": 7300 + }, + { + "epoch": 429.71, + "grad_norm": 0.30565717816352844, + "learning_rate": 4.228235294117647e-05, + "loss": 0.2676, + "step": 7305 + }, + { + "epoch": 430.0, + "grad_norm": 0.46970728039741516, + "learning_rate": 4.210588235294118e-05, + "loss": 0.2717, + "step": 7310 + }, + { + "epoch": 430.29, + "grad_norm": 0.3707132041454315, + "learning_rate": 4.1929411764705874e-05, + "loss": 0.2575, + "step": 7315 + }, + { + "epoch": 430.59, + "grad_norm": 0.3450472056865692, + "learning_rate": 4.1752941176470583e-05, + "loss": 0.2705, + "step": 7320 + }, + { + "epoch": 430.59, + "eval_loss": 0.2669720947742462, + "eval_runtime": 1.9581, + "eval_samples_per_second": 68.433, + "eval_steps_per_second": 8.682, + "step": 7320 + }, + { + "epoch": 430.88, + "grad_norm": 0.32696962356567383, + "learning_rate": 4.1576470588235293e-05, + "loss": 0.2682, + "step": 7325 + }, + { + "epoch": 431.18, + "grad_norm": 0.3563686013221741, + "learning_rate": 4.14e-05, + "loss": 0.2996, + "step": 7330 + }, + { + "epoch": 431.47, + "grad_norm": 0.3656381070613861, + "learning_rate": 4.12235294117647e-05, + "loss": 0.2666, + "step": 7335 + }, + { + "epoch": 431.76, + "grad_norm": 0.34631747007369995, + "learning_rate": 4.104705882352941e-05, + "loss": 0.2657, + "step": 7340 + }, + { + "epoch": 431.76, + "eval_loss": 0.26687678694725037, + "eval_runtime": 1.96, + "eval_samples_per_second": 68.367, + "eval_steps_per_second": 8.673, + "step": 7340 + }, + { + "epoch": 432.06, + "grad_norm": 0.34558171033859253, + "learning_rate": 4.087058823529411e-05, + "loss": 0.2545, + "step": 7345 + }, + { + "epoch": 432.35, + "grad_norm": 0.31891581416130066, + "learning_rate": 4.0694117647058816e-05, + "loss": 0.2704, + "step": 7350 + }, + { + "epoch": 432.65, + "grad_norm": 0.37224364280700684, + "learning_rate": 4.0517647058823526e-05, + "loss": 0.2694, + "step": 7355 + }, + { + "epoch": 432.94, + "grad_norm": 0.34004124999046326, + "learning_rate": 4.034117647058823e-05, + "loss": 0.2714, + "step": 7360 + }, + { + "epoch": 432.94, + "eval_loss": 0.26686975359916687, + "eval_runtime": 1.9607, + "eval_samples_per_second": 68.341, + "eval_steps_per_second": 8.67, + "step": 7360 + }, + { + "epoch": 433.24, + "grad_norm": 0.3908204138278961, + "learning_rate": 4.016470588235294e-05, + "loss": 0.2677, + "step": 7365 + }, + { + "epoch": 433.53, + "grad_norm": 0.41703084111213684, + "learning_rate": 3.998823529411764e-05, + "loss": 0.2748, + "step": 7370 + }, + { + "epoch": 433.82, + "grad_norm": 0.2899929881095886, + "learning_rate": 3.9811764705882346e-05, + "loss": 0.2474, + "step": 7375 + }, + { + "epoch": 434.12, + "grad_norm": 0.4455171823501587, + "learning_rate": 3.9635294117647056e-05, + "loss": 0.2952, + "step": 7380 + }, + { + "epoch": 434.12, + "eval_loss": 0.2668073773384094, + "eval_runtime": 1.9852, + "eval_samples_per_second": 67.5, + "eval_steps_per_second": 8.563, + "step": 7380 + }, + { + "epoch": 434.41, + "grad_norm": 0.41936570405960083, + "learning_rate": 3.9458823529411766e-05, + "loss": 0.2779, + "step": 7385 + }, + { + "epoch": 434.71, + "grad_norm": 0.313447505235672, + "learning_rate": 3.928235294117646e-05, + "loss": 0.2535, + "step": 7390 + }, + { + "epoch": 435.0, + "grad_norm": 0.46548548340797424, + "learning_rate": 3.910588235294117e-05, + "loss": 0.2838, + "step": 7395 + }, + { + "epoch": 435.29, + "grad_norm": 0.3401893675327301, + "learning_rate": 3.892941176470588e-05, + "loss": 0.2684, + "step": 7400 + }, + { + "epoch": 435.29, + "eval_loss": 0.2667679488658905, + "eval_runtime": 1.9594, + "eval_samples_per_second": 68.387, + "eval_steps_per_second": 8.676, + "step": 7400 + }, + { + "epoch": 435.59, + "grad_norm": 0.47850340604782104, + "learning_rate": 3.875294117647058e-05, + "loss": 0.2669, + "step": 7405 + }, + { + "epoch": 435.88, + "grad_norm": 0.5556897521018982, + "learning_rate": 3.857647058823529e-05, + "loss": 0.292, + "step": 7410 + }, + { + "epoch": 436.18, + "grad_norm": 0.2881021201610565, + "learning_rate": 3.84e-05, + "loss": 0.2441, + "step": 7415 + }, + { + "epoch": 436.47, + "grad_norm": 0.38983282446861267, + "learning_rate": 3.822352941176471e-05, + "loss": 0.2862, + "step": 7420 + }, + { + "epoch": 436.47, + "eval_loss": 0.26682788133621216, + "eval_runtime": 1.9612, + "eval_samples_per_second": 68.326, + "eval_steps_per_second": 8.668, + "step": 7420 + }, + { + "epoch": 436.76, + "grad_norm": 0.3093799650669098, + "learning_rate": 3.8047058823529405e-05, + "loss": 0.2672, + "step": 7425 + }, + { + "epoch": 437.06, + "grad_norm": 0.349763423204422, + "learning_rate": 3.7870588235294115e-05, + "loss": 0.2719, + "step": 7430 + }, + { + "epoch": 437.35, + "grad_norm": 0.47995463013648987, + "learning_rate": 3.7694117647058825e-05, + "loss": 0.2755, + "step": 7435 + }, + { + "epoch": 437.65, + "grad_norm": 0.3841699957847595, + "learning_rate": 3.751764705882353e-05, + "loss": 0.272, + "step": 7440 + }, + { + "epoch": 437.65, + "eval_loss": 0.2668739855289459, + "eval_runtime": 1.958, + "eval_samples_per_second": 68.437, + "eval_steps_per_second": 8.682, + "step": 7440 + }, + { + "epoch": 437.94, + "grad_norm": 0.4285668134689331, + "learning_rate": 3.734117647058823e-05, + "loss": 0.2786, + "step": 7445 + }, + { + "epoch": 438.24, + "grad_norm": 0.3198803961277008, + "learning_rate": 3.716470588235294e-05, + "loss": 0.2606, + "step": 7450 + }, + { + "epoch": 438.53, + "grad_norm": 0.35747140645980835, + "learning_rate": 3.6988235294117644e-05, + "loss": 0.2766, + "step": 7455 + }, + { + "epoch": 438.82, + "grad_norm": 0.271284282207489, + "learning_rate": 3.681176470588235e-05, + "loss": 0.2699, + "step": 7460 + }, + { + "epoch": 438.82, + "eval_loss": 0.2667958736419678, + "eval_runtime": 1.9748, + "eval_samples_per_second": 67.854, + "eval_steps_per_second": 8.608, + "step": 7460 + }, + { + "epoch": 439.12, + "grad_norm": 0.2348794788122177, + "learning_rate": 3.663529411764706e-05, + "loss": 0.2583, + "step": 7465 + }, + { + "epoch": 439.41, + "grad_norm": 0.349631130695343, + "learning_rate": 3.645882352941176e-05, + "loss": 0.26, + "step": 7470 + }, + { + "epoch": 439.71, + "grad_norm": 0.46635523438453674, + "learning_rate": 3.6282352941176464e-05, + "loss": 0.2788, + "step": 7475 + }, + { + "epoch": 440.0, + "grad_norm": 0.5208901762962341, + "learning_rate": 3.6105882352941174e-05, + "loss": 0.2844, + "step": 7480 + }, + { + "epoch": 440.0, + "eval_loss": 0.26684466004371643, + "eval_runtime": 1.9588, + "eval_samples_per_second": 68.408, + "eval_steps_per_second": 8.679, + "step": 7480 + }, + { + "epoch": 440.29, + "grad_norm": 0.3358863890171051, + "learning_rate": 3.592941176470588e-05, + "loss": 0.2711, + "step": 7485 + }, + { + "epoch": 440.59, + "grad_norm": 0.41073301434516907, + "learning_rate": 3.575294117647059e-05, + "loss": 0.2853, + "step": 7490 + }, + { + "epoch": 440.88, + "grad_norm": 0.38413408398628235, + "learning_rate": 3.557647058823529e-05, + "loss": 0.2568, + "step": 7495 + }, + { + "epoch": 441.18, + "grad_norm": 0.4527212977409363, + "learning_rate": 3.539999999999999e-05, + "loss": 0.2634, + "step": 7500 + }, + { + "epoch": 441.18, + "eval_loss": 0.26679953932762146, + "eval_runtime": 1.9606, + "eval_samples_per_second": 68.347, + "eval_steps_per_second": 8.671, + "step": 7500 + }, + { + "epoch": 441.47, + "grad_norm": 0.4173857569694519, + "learning_rate": 3.52235294117647e-05, + "loss": 0.2882, + "step": 7505 + }, + { + "epoch": 441.76, + "grad_norm": 0.3076818883419037, + "learning_rate": 3.5047058823529406e-05, + "loss": 0.2497, + "step": 7510 + }, + { + "epoch": 442.06, + "grad_norm": 0.3708653151988983, + "learning_rate": 3.4870588235294116e-05, + "loss": 0.2702, + "step": 7515 + }, + { + "epoch": 442.35, + "grad_norm": 0.39249733090400696, + "learning_rate": 3.469411764705882e-05, + "loss": 0.2687, + "step": 7520 + }, + { + "epoch": 442.35, + "eval_loss": 0.26672786474227905, + "eval_runtime": 1.9614, + "eval_samples_per_second": 68.319, + "eval_steps_per_second": 8.667, + "step": 7520 + }, + { + "epoch": 442.65, + "grad_norm": 0.26322394609451294, + "learning_rate": 3.451764705882353e-05, + "loss": 0.2787, + "step": 7525 + }, + { + "epoch": 442.94, + "grad_norm": 0.32261088490486145, + "learning_rate": 3.434117647058823e-05, + "loss": 0.2575, + "step": 7530 + }, + { + "epoch": 443.24, + "grad_norm": 0.3599368929862976, + "learning_rate": 3.416470588235294e-05, + "loss": 0.2795, + "step": 7535 + }, + { + "epoch": 443.53, + "grad_norm": 0.422442764043808, + "learning_rate": 3.3988235294117646e-05, + "loss": 0.2792, + "step": 7540 + }, + { + "epoch": 443.53, + "eval_loss": 0.26674211025238037, + "eval_runtime": 1.9621, + "eval_samples_per_second": 68.295, + "eval_steps_per_second": 8.664, + "step": 7540 + }, + { + "epoch": 443.82, + "grad_norm": 0.41690734028816223, + "learning_rate": 3.381176470588235e-05, + "loss": 0.2638, + "step": 7545 + }, + { + "epoch": 444.12, + "grad_norm": 0.33567383885383606, + "learning_rate": 3.363529411764706e-05, + "loss": 0.2664, + "step": 7550 + }, + { + "epoch": 444.41, + "grad_norm": 0.31545794010162354, + "learning_rate": 3.345882352941176e-05, + "loss": 0.2646, + "step": 7555 + }, + { + "epoch": 444.71, + "grad_norm": 0.42885950207710266, + "learning_rate": 3.3282352941176465e-05, + "loss": 0.2762, + "step": 7560 + }, + { + "epoch": 444.71, + "eval_loss": 0.26678889989852905, + "eval_runtime": 1.9582, + "eval_samples_per_second": 68.43, + "eval_steps_per_second": 8.681, + "step": 7560 + }, + { + "epoch": 445.0, + "grad_norm": 0.36836519837379456, + "learning_rate": 3.3105882352941175e-05, + "loss": 0.2656, + "step": 7565 + }, + { + "epoch": 445.29, + "grad_norm": 0.42900845408439636, + "learning_rate": 3.292941176470588e-05, + "loss": 0.2697, + "step": 7570 + }, + { + "epoch": 445.59, + "grad_norm": 0.3435908555984497, + "learning_rate": 3.275294117647058e-05, + "loss": 0.2745, + "step": 7575 + }, + { + "epoch": 445.88, + "grad_norm": 0.37933817505836487, + "learning_rate": 3.257647058823529e-05, + "loss": 0.278, + "step": 7580 + }, + { + "epoch": 445.88, + "eval_loss": 0.2668588161468506, + "eval_runtime": 1.9619, + "eval_samples_per_second": 68.301, + "eval_steps_per_second": 8.665, + "step": 7580 + }, + { + "epoch": 446.18, + "grad_norm": 0.40266454219818115, + "learning_rate": 3.2399999999999995e-05, + "loss": 0.2605, + "step": 7585 + }, + { + "epoch": 446.47, + "grad_norm": 0.3531177043914795, + "learning_rate": 3.2223529411764705e-05, + "loss": 0.2533, + "step": 7590 + }, + { + "epoch": 446.76, + "grad_norm": 0.41788914799690247, + "learning_rate": 3.204705882352941e-05, + "loss": 0.2933, + "step": 7595 + }, + { + "epoch": 447.06, + "grad_norm": 0.3251320421695709, + "learning_rate": 3.187058823529411e-05, + "loss": 0.2546, + "step": 7600 + }, + { + "epoch": 447.06, + "eval_loss": 0.2667209506034851, + "eval_runtime": 1.9597, + "eval_samples_per_second": 68.378, + "eval_steps_per_second": 8.675, + "step": 7600 + }, + { + "epoch": 447.35, + "grad_norm": 0.30309104919433594, + "learning_rate": 3.169411764705882e-05, + "loss": 0.2746, + "step": 7605 + }, + { + "epoch": 447.65, + "grad_norm": 0.35474392771720886, + "learning_rate": 3.1517647058823524e-05, + "loss": 0.2749, + "step": 7610 + }, + { + "epoch": 447.94, + "grad_norm": 0.3973749279975891, + "learning_rate": 3.1341176470588234e-05, + "loss": 0.2691, + "step": 7615 + }, + { + "epoch": 448.24, + "grad_norm": 0.4338699281215668, + "learning_rate": 3.116470588235294e-05, + "loss": 0.2706, + "step": 7620 + }, + { + "epoch": 448.24, + "eval_loss": 0.2667592763900757, + "eval_runtime": 1.9633, + "eval_samples_per_second": 68.254, + "eval_steps_per_second": 8.659, + "step": 7620 + }, + { + "epoch": 448.53, + "grad_norm": 0.374825119972229, + "learning_rate": 3.098823529411765e-05, + "loss": 0.2755, + "step": 7625 + }, + { + "epoch": 448.82, + "grad_norm": 0.38818541169166565, + "learning_rate": 3.081176470588235e-05, + "loss": 0.254, + "step": 7630 + }, + { + "epoch": 449.12, + "grad_norm": 0.40987521409988403, + "learning_rate": 3.063529411764706e-05, + "loss": 0.2732, + "step": 7635 + }, + { + "epoch": 449.41, + "grad_norm": 0.3300062417984009, + "learning_rate": 3.045882352941176e-05, + "loss": 0.2701, + "step": 7640 + }, + { + "epoch": 449.41, + "eval_loss": 0.26686015725135803, + "eval_runtime": 1.9588, + "eval_samples_per_second": 68.41, + "eval_steps_per_second": 8.679, + "step": 7640 + }, + { + "epoch": 449.71, + "grad_norm": 0.3479211628437042, + "learning_rate": 3.0282352941176467e-05, + "loss": 0.2604, + "step": 7645 + }, + { + "epoch": 450.0, + "grad_norm": 0.5464439392089844, + "learning_rate": 3.0105882352941173e-05, + "loss": 0.2835, + "step": 7650 + }, + { + "epoch": 450.29, + "grad_norm": 0.3759833872318268, + "learning_rate": 2.9929411764705877e-05, + "loss": 0.2682, + "step": 7655 + }, + { + "epoch": 450.59, + "grad_norm": 0.4379965662956238, + "learning_rate": 2.9752941176470587e-05, + "loss": 0.2756, + "step": 7660 + }, + { + "epoch": 450.59, + "eval_loss": 0.2668231129646301, + "eval_runtime": 1.9608, + "eval_samples_per_second": 68.338, + "eval_steps_per_second": 8.67, + "step": 7660 + }, + { + "epoch": 450.88, + "grad_norm": 0.36382120847702026, + "learning_rate": 2.957647058823529e-05, + "loss": 0.2646, + "step": 7665 + }, + { + "epoch": 451.18, + "grad_norm": 0.3182559013366699, + "learning_rate": 2.94e-05, + "loss": 0.2713, + "step": 7670 + }, + { + "epoch": 451.47, + "grad_norm": 0.3091278076171875, + "learning_rate": 2.9223529411764703e-05, + "loss": 0.2656, + "step": 7675 + }, + { + "epoch": 451.76, + "grad_norm": 0.37635642290115356, + "learning_rate": 2.904705882352941e-05, + "loss": 0.2643, + "step": 7680 + }, + { + "epoch": 451.76, + "eval_loss": 0.2668117582798004, + "eval_runtime": 1.9615, + "eval_samples_per_second": 68.314, + "eval_steps_per_second": 8.667, + "step": 7680 + }, + { + "epoch": 452.06, + "grad_norm": 0.3754452168941498, + "learning_rate": 2.8870588235294116e-05, + "loss": 0.2756, + "step": 7685 + }, + { + "epoch": 452.35, + "grad_norm": 0.42099499702453613, + "learning_rate": 2.8694117647058823e-05, + "loss": 0.2885, + "step": 7690 + }, + { + "epoch": 452.65, + "grad_norm": 0.2674359679222107, + "learning_rate": 2.8517647058823526e-05, + "loss": 0.2687, + "step": 7695 + }, + { + "epoch": 452.94, + "grad_norm": 0.4094257354736328, + "learning_rate": 2.8341176470588232e-05, + "loss": 0.2594, + "step": 7700 + }, + { + "epoch": 452.94, + "eval_loss": 0.26680055260658264, + "eval_runtime": 1.9622, + "eval_samples_per_second": 68.291, + "eval_steps_per_second": 8.664, + "step": 7700 + }, + { + "epoch": 453.24, + "grad_norm": 0.3790624737739563, + "learning_rate": 2.816470588235294e-05, + "loss": 0.2733, + "step": 7705 + }, + { + "epoch": 453.53, + "grad_norm": 0.27124133706092834, + "learning_rate": 2.7988235294117642e-05, + "loss": 0.2592, + "step": 7710 + }, + { + "epoch": 453.82, + "grad_norm": 0.3562621772289276, + "learning_rate": 2.7811764705882352e-05, + "loss": 0.2611, + "step": 7715 + }, + { + "epoch": 454.12, + "grad_norm": 0.4149491786956787, + "learning_rate": 2.7635294117647055e-05, + "loss": 0.2919, + "step": 7720 + }, + { + "epoch": 454.12, + "eval_loss": 0.2667385935783386, + "eval_runtime": 1.9624, + "eval_samples_per_second": 68.284, + "eval_steps_per_second": 8.663, + "step": 7720 + }, + { + "epoch": 454.41, + "grad_norm": 0.3743014335632324, + "learning_rate": 2.7458823529411765e-05, + "loss": 0.262, + "step": 7725 + }, + { + "epoch": 454.71, + "grad_norm": 0.41884955763816833, + "learning_rate": 2.728235294117647e-05, + "loss": 0.2789, + "step": 7730 + }, + { + "epoch": 455.0, + "grad_norm": 0.37537407875061035, + "learning_rate": 2.7105882352941175e-05, + "loss": 0.2567, + "step": 7735 + }, + { + "epoch": 455.29, + "grad_norm": 0.3855379521846771, + "learning_rate": 2.6929411764705878e-05, + "loss": 0.2695, + "step": 7740 + }, + { + "epoch": 455.29, + "eval_loss": 0.26676028966903687, + "eval_runtime": 1.9591, + "eval_samples_per_second": 68.398, + "eval_steps_per_second": 8.677, + "step": 7740 + }, + { + "epoch": 455.59, + "grad_norm": 0.4059057831764221, + "learning_rate": 2.6752941176470585e-05, + "loss": 0.2772, + "step": 7745 + }, + { + "epoch": 455.88, + "grad_norm": 0.32665759325027466, + "learning_rate": 2.657647058823529e-05, + "loss": 0.2668, + "step": 7750 + }, + { + "epoch": 456.18, + "grad_norm": 0.4204210340976715, + "learning_rate": 2.6399999999999995e-05, + "loss": 0.2761, + "step": 7755 + }, + { + "epoch": 456.47, + "grad_norm": 0.38408416509628296, + "learning_rate": 2.6223529411764705e-05, + "loss": 0.2513, + "step": 7760 + }, + { + "epoch": 456.47, + "eval_loss": 0.26671886444091797, + "eval_runtime": 1.9605, + "eval_samples_per_second": 68.348, + "eval_steps_per_second": 8.671, + "step": 7760 + }, + { + "epoch": 456.76, + "grad_norm": 0.33560365438461304, + "learning_rate": 2.6047058823529408e-05, + "loss": 0.2741, + "step": 7765 + }, + { + "epoch": 457.06, + "grad_norm": 0.3258912265300751, + "learning_rate": 2.5870588235294118e-05, + "loss": 0.2799, + "step": 7770 + }, + { + "epoch": 457.35, + "grad_norm": 0.4232853055000305, + "learning_rate": 2.569411764705882e-05, + "loss": 0.2609, + "step": 7775 + }, + { + "epoch": 457.65, + "grad_norm": 0.3053160011768341, + "learning_rate": 2.5517647058823527e-05, + "loss": 0.268, + "step": 7780 + }, + { + "epoch": 457.65, + "eval_loss": 0.2668216824531555, + "eval_runtime": 1.9596, + "eval_samples_per_second": 68.382, + "eval_steps_per_second": 8.675, + "step": 7780 + }, + { + "epoch": 457.94, + "grad_norm": 0.5235462784767151, + "learning_rate": 2.5341176470588234e-05, + "loss": 0.2745, + "step": 7785 + }, + { + "epoch": 458.24, + "grad_norm": 0.35902392864227295, + "learning_rate": 2.516470588235294e-05, + "loss": 0.2784, + "step": 7790 + }, + { + "epoch": 458.53, + "grad_norm": 0.3601309359073639, + "learning_rate": 2.4988235294117644e-05, + "loss": 0.2566, + "step": 7795 + }, + { + "epoch": 458.82, + "grad_norm": 0.38922035694122314, + "learning_rate": 2.481176470588235e-05, + "loss": 0.2757, + "step": 7800 + }, + { + "epoch": 458.82, + "eval_loss": 0.2668432593345642, + "eval_runtime": 1.963, + "eval_samples_per_second": 68.263, + "eval_steps_per_second": 8.66, + "step": 7800 + }, + { + "epoch": 459.12, + "grad_norm": 0.44390764832496643, + "learning_rate": 2.4635294117647057e-05, + "loss": 0.2709, + "step": 7805 + }, + { + "epoch": 459.41, + "grad_norm": 0.47585272789001465, + "learning_rate": 2.445882352941176e-05, + "loss": 0.2798, + "step": 7810 + }, + { + "epoch": 459.71, + "grad_norm": 0.4614483714103699, + "learning_rate": 2.428235294117647e-05, + "loss": 0.2501, + "step": 7815 + }, + { + "epoch": 460.0, + "grad_norm": 0.4739503860473633, + "learning_rate": 2.4105882352941173e-05, + "loss": 0.28, + "step": 7820 + }, + { + "epoch": 460.0, + "eval_loss": 0.2667703628540039, + "eval_runtime": 1.958, + "eval_samples_per_second": 68.436, + "eval_steps_per_second": 8.682, + "step": 7820 + }, + { + "epoch": 460.29, + "grad_norm": 0.30313849449157715, + "learning_rate": 2.3929411764705883e-05, + "loss": 0.264, + "step": 7825 + }, + { + "epoch": 460.59, + "grad_norm": 0.45110228657722473, + "learning_rate": 2.3752941176470586e-05, + "loss": 0.2535, + "step": 7830 + }, + { + "epoch": 460.88, + "grad_norm": 0.4526078701019287, + "learning_rate": 2.3576470588235293e-05, + "loss": 0.2966, + "step": 7835 + }, + { + "epoch": 461.18, + "grad_norm": 0.39065656065940857, + "learning_rate": 2.34e-05, + "loss": 0.2635, + "step": 7840 + }, + { + "epoch": 461.18, + "eval_loss": 0.2667505741119385, + "eval_runtime": 1.9605, + "eval_samples_per_second": 68.35, + "eval_steps_per_second": 8.671, + "step": 7840 + }, + { + "epoch": 461.47, + "grad_norm": 0.37350746989250183, + "learning_rate": 2.3223529411764703e-05, + "loss": 0.266, + "step": 7845 + }, + { + "epoch": 461.76, + "grad_norm": 0.4786175787448883, + "learning_rate": 2.304705882352941e-05, + "loss": 0.2752, + "step": 7850 + }, + { + "epoch": 462.06, + "grad_norm": 0.3212645351886749, + "learning_rate": 2.2870588235294113e-05, + "loss": 0.2719, + "step": 7855 + }, + { + "epoch": 462.35, + "grad_norm": 0.4684135615825653, + "learning_rate": 2.2694117647058822e-05, + "loss": 0.2706, + "step": 7860 + }, + { + "epoch": 462.35, + "eval_loss": 0.2667818069458008, + "eval_runtime": 1.9595, + "eval_samples_per_second": 68.384, + "eval_steps_per_second": 8.676, + "step": 7860 + }, + { + "epoch": 462.65, + "grad_norm": 0.42575693130493164, + "learning_rate": 2.2517647058823526e-05, + "loss": 0.2676, + "step": 7865 + }, + { + "epoch": 462.94, + "grad_norm": 0.31912949681282043, + "learning_rate": 2.2341176470588236e-05, + "loss": 0.2637, + "step": 7870 + }, + { + "epoch": 463.24, + "grad_norm": 0.35293543338775635, + "learning_rate": 2.216470588235294e-05, + "loss": 0.2729, + "step": 7875 + }, + { + "epoch": 463.53, + "grad_norm": 0.3320141136646271, + "learning_rate": 2.1988235294117645e-05, + "loss": 0.2645, + "step": 7880 + }, + { + "epoch": 463.53, + "eval_loss": 0.2667749524116516, + "eval_runtime": 1.9571, + "eval_samples_per_second": 68.47, + "eval_steps_per_second": 8.686, + "step": 7880 + }, + { + "epoch": 463.82, + "grad_norm": 0.3407422602176666, + "learning_rate": 2.1811764705882352e-05, + "loss": 0.2567, + "step": 7885 + }, + { + "epoch": 464.12, + "grad_norm": 0.3591556251049042, + "learning_rate": 2.163529411764706e-05, + "loss": 0.2835, + "step": 7890 + }, + { + "epoch": 464.41, + "grad_norm": 0.38440531492233276, + "learning_rate": 2.1458823529411762e-05, + "loss": 0.2745, + "step": 7895 + }, + { + "epoch": 464.71, + "grad_norm": 0.42135700583457947, + "learning_rate": 2.128235294117647e-05, + "loss": 0.2552, + "step": 7900 + }, + { + "epoch": 464.71, + "eval_loss": 0.2666613459587097, + "eval_runtime": 1.9604, + "eval_samples_per_second": 68.352, + "eval_steps_per_second": 8.672, + "step": 7900 + }, + { + "epoch": 465.0, + "grad_norm": 0.49422168731689453, + "learning_rate": 2.1105882352941175e-05, + "loss": 0.281, + "step": 7905 + }, + { + "epoch": 465.29, + "grad_norm": 0.34952616691589355, + "learning_rate": 2.0929411764705878e-05, + "loss": 0.267, + "step": 7910 + }, + { + "epoch": 465.59, + "grad_norm": 0.4076564610004425, + "learning_rate": 2.0752941176470588e-05, + "loss": 0.2569, + "step": 7915 + }, + { + "epoch": 465.88, + "grad_norm": 0.33137398958206177, + "learning_rate": 2.057647058823529e-05, + "loss": 0.264, + "step": 7920 + }, + { + "epoch": 465.88, + "eval_loss": 0.2667132019996643, + "eval_runtime": 2.0193, + "eval_samples_per_second": 66.36, + "eval_steps_per_second": 8.419, + "step": 7920 + }, + { + "epoch": 466.18, + "grad_norm": 0.406131774187088, + "learning_rate": 2.04e-05, + "loss": 0.2838, + "step": 7925 + }, + { + "epoch": 466.47, + "grad_norm": 0.29246267676353455, + "learning_rate": 2.0223529411764704e-05, + "loss": 0.2468, + "step": 7930 + }, + { + "epoch": 466.76, + "grad_norm": 0.3498767018318176, + "learning_rate": 2.004705882352941e-05, + "loss": 0.2601, + "step": 7935 + }, + { + "epoch": 467.06, + "grad_norm": 0.3905380070209503, + "learning_rate": 1.9870588235294118e-05, + "loss": 0.3025, + "step": 7940 + }, + { + "epoch": 467.06, + "eval_loss": 0.2666773498058319, + "eval_runtime": 1.9601, + "eval_samples_per_second": 68.364, + "eval_steps_per_second": 8.673, + "step": 7940 + }, + { + "epoch": 467.35, + "grad_norm": 0.3925326466560364, + "learning_rate": 1.969411764705882e-05, + "loss": 0.2764, + "step": 7945 + }, + { + "epoch": 467.65, + "grad_norm": 0.31804412603378296, + "learning_rate": 1.9517647058823527e-05, + "loss": 0.2673, + "step": 7950 + }, + { + "epoch": 467.94, + "grad_norm": 0.37219950556755066, + "learning_rate": 1.934117647058823e-05, + "loss": 0.2785, + "step": 7955 + }, + { + "epoch": 468.24, + "grad_norm": 0.3101259768009186, + "learning_rate": 1.916470588235294e-05, + "loss": 0.2479, + "step": 7960 + }, + { + "epoch": 468.24, + "eval_loss": 0.2667126953601837, + "eval_runtime": 1.9626, + "eval_samples_per_second": 68.276, + "eval_steps_per_second": 8.662, + "step": 7960 + }, + { + "epoch": 468.53, + "grad_norm": 0.3561471104621887, + "learning_rate": 1.8988235294117644e-05, + "loss": 0.2638, + "step": 7965 + }, + { + "epoch": 468.82, + "grad_norm": 0.44877585768699646, + "learning_rate": 1.8811764705882354e-05, + "loss": 0.2785, + "step": 7970 + }, + { + "epoch": 469.12, + "grad_norm": 0.2568972110748291, + "learning_rate": 1.8635294117647057e-05, + "loss": 0.2559, + "step": 7975 + }, + { + "epoch": 469.41, + "grad_norm": 0.39801520109176636, + "learning_rate": 1.8458823529411763e-05, + "loss": 0.2681, + "step": 7980 + }, + { + "epoch": 469.41, + "eval_loss": 0.2667180895805359, + "eval_runtime": 1.9593, + "eval_samples_per_second": 68.391, + "eval_steps_per_second": 8.676, + "step": 7980 + }, + { + "epoch": 469.71, + "grad_norm": 0.3243696391582489, + "learning_rate": 1.828235294117647e-05, + "loss": 0.2734, + "step": 7985 + }, + { + "epoch": 470.0, + "grad_norm": 0.6277484893798828, + "learning_rate": 1.8105882352941177e-05, + "loss": 0.2893, + "step": 7990 + }, + { + "epoch": 470.29, + "grad_norm": 0.3302992284297943, + "learning_rate": 1.792941176470588e-05, + "loss": 0.258, + "step": 7995 + }, + { + "epoch": 470.59, + "grad_norm": 0.4239538908004761, + "learning_rate": 1.7752941176470586e-05, + "loss": 0.2816, + "step": 8000 + }, + { + "epoch": 470.59, + "eval_loss": 0.26670899987220764, + "eval_runtime": 2.0084, + "eval_samples_per_second": 66.719, + "eval_steps_per_second": 8.464, + "step": 8000 + }, + { + "epoch": 470.88, + "grad_norm": 0.3887185752391815, + "learning_rate": 1.7576470588235293e-05, + "loss": 0.2773, + "step": 8005 + }, + { + "epoch": 471.18, + "grad_norm": 0.40548381209373474, + "learning_rate": 1.74e-05, + "loss": 0.2694, + "step": 8010 + }, + { + "epoch": 471.47, + "grad_norm": 0.4005472660064697, + "learning_rate": 1.7223529411764703e-05, + "loss": 0.2682, + "step": 8015 + }, + { + "epoch": 471.76, + "grad_norm": 0.3661952614784241, + "learning_rate": 1.704705882352941e-05, + "loss": 0.2676, + "step": 8020 + }, + { + "epoch": 471.76, + "eval_loss": 0.266693115234375, + "eval_runtime": 1.959, + "eval_samples_per_second": 68.402, + "eval_steps_per_second": 8.678, + "step": 8020 + }, + { + "epoch": 472.06, + "grad_norm": 0.3128514885902405, + "learning_rate": 1.6870588235294116e-05, + "loss": 0.2647, + "step": 8025 + }, + { + "epoch": 472.35, + "grad_norm": 0.4018368124961853, + "learning_rate": 1.6694117647058822e-05, + "loss": 0.2568, + "step": 8030 + }, + { + "epoch": 472.65, + "grad_norm": 0.4566549062728882, + "learning_rate": 1.651764705882353e-05, + "loss": 0.2754, + "step": 8035 + }, + { + "epoch": 472.94, + "grad_norm": 0.39016836881637573, + "learning_rate": 1.6341176470588235e-05, + "loss": 0.2797, + "step": 8040 + }, + { + "epoch": 472.94, + "eval_loss": 0.2667049467563629, + "eval_runtime": 1.9597, + "eval_samples_per_second": 68.377, + "eval_steps_per_second": 8.675, + "step": 8040 + }, + { + "epoch": 473.24, + "grad_norm": 0.35952600836753845, + "learning_rate": 1.616470588235294e-05, + "loss": 0.2607, + "step": 8045 + }, + { + "epoch": 473.53, + "grad_norm": 0.2912003993988037, + "learning_rate": 1.5988235294117645e-05, + "loss": 0.2732, + "step": 8050 + }, + { + "epoch": 473.82, + "grad_norm": 0.432422012090683, + "learning_rate": 1.5811764705882352e-05, + "loss": 0.2806, + "step": 8055 + }, + { + "epoch": 474.12, + "grad_norm": 0.358229398727417, + "learning_rate": 1.563529411764706e-05, + "loss": 0.2508, + "step": 8060 + }, + { + "epoch": 474.12, + "eval_loss": 0.2666056156158447, + "eval_runtime": 1.9618, + "eval_samples_per_second": 68.306, + "eval_steps_per_second": 8.666, + "step": 8060 + }, + { + "epoch": 474.41, + "grad_norm": 0.3028172254562378, + "learning_rate": 1.545882352941176e-05, + "loss": 0.2446, + "step": 8065 + }, + { + "epoch": 474.71, + "grad_norm": 0.3407018780708313, + "learning_rate": 1.5282352941176468e-05, + "loss": 0.2725, + "step": 8070 + }, + { + "epoch": 475.0, + "grad_norm": 0.4242802560329437, + "learning_rate": 1.5105882352941175e-05, + "loss": 0.2966, + "step": 8075 + }, + { + "epoch": 475.29, + "grad_norm": 0.43905311822891235, + "learning_rate": 1.4929411764705881e-05, + "loss": 0.2744, + "step": 8080 + }, + { + "epoch": 475.29, + "eval_loss": 0.266627699136734, + "eval_runtime": 1.9591, + "eval_samples_per_second": 68.398, + "eval_steps_per_second": 8.677, + "step": 8080 + }, + { + "epoch": 475.59, + "grad_norm": 0.3310249149799347, + "learning_rate": 1.4752941176470586e-05, + "loss": 0.2707, + "step": 8085 + }, + { + "epoch": 475.88, + "grad_norm": 0.3586319386959076, + "learning_rate": 1.4576470588235293e-05, + "loss": 0.2652, + "step": 8090 + }, + { + "epoch": 476.18, + "grad_norm": 0.39899566769599915, + "learning_rate": 1.44e-05, + "loss": 0.265, + "step": 8095 + }, + { + "epoch": 476.47, + "grad_norm": 0.3453006148338318, + "learning_rate": 1.4223529411764706e-05, + "loss": 0.2709, + "step": 8100 + }, + { + "epoch": 476.47, + "eval_loss": 0.26659029722213745, + "eval_runtime": 1.9595, + "eval_samples_per_second": 68.386, + "eval_steps_per_second": 8.676, + "step": 8100 + }, + { + "epoch": 476.76, + "grad_norm": 0.36494091153144836, + "learning_rate": 1.404705882352941e-05, + "loss": 0.272, + "step": 8105 + }, + { + "epoch": 477.06, + "grad_norm": 0.4796656668186188, + "learning_rate": 1.3870588235294117e-05, + "loss": 0.2728, + "step": 8110 + }, + { + "epoch": 477.35, + "grad_norm": 0.4194751977920532, + "learning_rate": 1.369411764705882e-05, + "loss": 0.2688, + "step": 8115 + }, + { + "epoch": 477.65, + "grad_norm": 0.3637272119522095, + "learning_rate": 1.3517647058823527e-05, + "loss": 0.2692, + "step": 8120 + }, + { + "epoch": 477.65, + "eval_loss": 0.26665207743644714, + "eval_runtime": 1.9595, + "eval_samples_per_second": 68.385, + "eval_steps_per_second": 8.676, + "step": 8120 + }, + { + "epoch": 477.94, + "grad_norm": 0.5548241138458252, + "learning_rate": 1.3341176470588234e-05, + "loss": 0.2695, + "step": 8125 + }, + { + "epoch": 478.24, + "grad_norm": 0.568526566028595, + "learning_rate": 1.316470588235294e-05, + "loss": 0.2826, + "step": 8130 + }, + { + "epoch": 478.53, + "grad_norm": 0.42577067017555237, + "learning_rate": 1.2988235294117645e-05, + "loss": 0.2524, + "step": 8135 + }, + { + "epoch": 478.82, + "grad_norm": 0.4070892930030823, + "learning_rate": 1.2811764705882352e-05, + "loss": 0.2812, + "step": 8140 + }, + { + "epoch": 478.82, + "eval_loss": 0.2666857838630676, + "eval_runtime": 1.9625, + "eval_samples_per_second": 68.279, + "eval_steps_per_second": 8.662, + "step": 8140 + }, + { + "epoch": 479.12, + "grad_norm": 0.4978716969490051, + "learning_rate": 1.2635294117647058e-05, + "loss": 0.2677, + "step": 8145 + }, + { + "epoch": 479.41, + "grad_norm": 0.4298340380191803, + "learning_rate": 1.2458823529411765e-05, + "loss": 0.2838, + "step": 8150 + }, + { + "epoch": 479.71, + "grad_norm": 0.38949719071388245, + "learning_rate": 1.228235294117647e-05, + "loss": 0.2448, + "step": 8155 + }, + { + "epoch": 480.0, + "grad_norm": 0.39076128602027893, + "learning_rate": 1.2105882352941176e-05, + "loss": 0.2726, + "step": 8160 + }, + { + "epoch": 480.0, + "eval_loss": 0.2666655480861664, + "eval_runtime": 1.9575, + "eval_samples_per_second": 68.453, + "eval_steps_per_second": 8.684, + "step": 8160 + }, + { + "epoch": 480.29, + "grad_norm": 0.4349394738674164, + "learning_rate": 1.192941176470588e-05, + "loss": 0.2663, + "step": 8165 + }, + { + "epoch": 480.59, + "grad_norm": 0.24221451580524445, + "learning_rate": 1.1752941176470586e-05, + "loss": 0.2653, + "step": 8170 + }, + { + "epoch": 480.88, + "grad_norm": 0.3978033661842346, + "learning_rate": 1.1576470588235293e-05, + "loss": 0.2624, + "step": 8175 + }, + { + "epoch": 481.18, + "grad_norm": 0.504089891910553, + "learning_rate": 1.14e-05, + "loss": 0.2847, + "step": 8180 + }, + { + "epoch": 481.18, + "eval_loss": 0.2666530907154083, + "eval_runtime": 1.958, + "eval_samples_per_second": 68.438, + "eval_steps_per_second": 8.682, + "step": 8180 + }, + { + "epoch": 481.47, + "grad_norm": 0.368954598903656, + "learning_rate": 1.1223529411764704e-05, + "loss": 0.2645, + "step": 8185 + }, + { + "epoch": 481.76, + "grad_norm": 0.4450467526912689, + "learning_rate": 1.104705882352941e-05, + "loss": 0.2642, + "step": 8190 + }, + { + "epoch": 482.06, + "grad_norm": 0.3225043714046478, + "learning_rate": 1.0870588235294117e-05, + "loss": 0.2735, + "step": 8195 + }, + { + "epoch": 482.35, + "grad_norm": 0.34923630952835083, + "learning_rate": 1.0694117647058824e-05, + "loss": 0.2668, + "step": 8200 + }, + { + "epoch": 482.35, + "eval_loss": 0.2666921019554138, + "eval_runtime": 1.9608, + "eval_samples_per_second": 68.34, + "eval_steps_per_second": 8.67, + "step": 8200 + }, + { + "epoch": 482.65, + "grad_norm": 0.36203065514564514, + "learning_rate": 1.0517647058823529e-05, + "loss": 0.2527, + "step": 8205 + }, + { + "epoch": 482.94, + "grad_norm": 0.4076547622680664, + "learning_rate": 1.0341176470588235e-05, + "loss": 0.2837, + "step": 8210 + }, + { + "epoch": 483.24, + "grad_norm": 0.37790539860725403, + "learning_rate": 1.016470588235294e-05, + "loss": 0.2814, + "step": 8215 + }, + { + "epoch": 483.53, + "grad_norm": 0.4456799030303955, + "learning_rate": 9.988235294117645e-06, + "loss": 0.2611, + "step": 8220 + }, + { + "epoch": 483.53, + "eval_loss": 0.26666709780693054, + "eval_runtime": 1.9611, + "eval_samples_per_second": 68.331, + "eval_steps_per_second": 8.669, + "step": 8220 + }, + { + "epoch": 483.82, + "grad_norm": 0.3900556266307831, + "learning_rate": 9.811764705882352e-06, + "loss": 0.2636, + "step": 8225 + }, + { + "epoch": 484.12, + "grad_norm": 0.3448634147644043, + "learning_rate": 9.635294117647058e-06, + "loss": 0.288, + "step": 8230 + }, + { + "epoch": 484.41, + "grad_norm": 0.3983355462551117, + "learning_rate": 9.458823529411763e-06, + "loss": 0.278, + "step": 8235 + }, + { + "epoch": 484.71, + "grad_norm": 0.35284551978111267, + "learning_rate": 9.28235294117647e-06, + "loss": 0.2538, + "step": 8240 + }, + { + "epoch": 484.71, + "eval_loss": 0.2666126787662506, + "eval_runtime": 1.9591, + "eval_samples_per_second": 68.399, + "eval_steps_per_second": 8.677, + "step": 8240 + }, + { + "epoch": 485.0, + "grad_norm": 0.34337693452835083, + "learning_rate": 9.105882352941176e-06, + "loss": 0.2689, + "step": 8245 + }, + { + "epoch": 485.29, + "grad_norm": 0.34553298354148865, + "learning_rate": 8.929411764705881e-06, + "loss": 0.2763, + "step": 8250 + }, + { + "epoch": 485.59, + "grad_norm": 0.3715963363647461, + "learning_rate": 8.752941176470588e-06, + "loss": 0.2742, + "step": 8255 + }, + { + "epoch": 485.88, + "grad_norm": 0.2521974444389343, + "learning_rate": 8.576470588235293e-06, + "loss": 0.2595, + "step": 8260 + }, + { + "epoch": 485.88, + "eval_loss": 0.2666740417480469, + "eval_runtime": 1.9593, + "eval_samples_per_second": 68.393, + "eval_steps_per_second": 8.677, + "step": 8260 + }, + { + "epoch": 486.18, + "grad_norm": 0.4987650513648987, + "learning_rate": 8.4e-06, + "loss": 0.2801, + "step": 8265 + }, + { + "epoch": 486.47, + "grad_norm": 0.4373900294303894, + "learning_rate": 8.223529411764706e-06, + "loss": 0.2696, + "step": 8270 + }, + { + "epoch": 486.76, + "grad_norm": 0.3599467873573303, + "learning_rate": 8.04705882352941e-06, + "loss": 0.2726, + "step": 8275 + }, + { + "epoch": 487.06, + "grad_norm": 0.3576565086841583, + "learning_rate": 7.870588235294117e-06, + "loss": 0.262, + "step": 8280 + }, + { + "epoch": 487.06, + "eval_loss": 0.26664215326309204, + "eval_runtime": 1.9605, + "eval_samples_per_second": 68.348, + "eval_steps_per_second": 8.671, + "step": 8280 + }, + { + "epoch": 487.35, + "grad_norm": 0.3797486126422882, + "learning_rate": 7.694117647058822e-06, + "loss": 0.2661, + "step": 8285 + }, + { + "epoch": 487.65, + "grad_norm": 0.42296284437179565, + "learning_rate": 7.517647058823529e-06, + "loss": 0.2743, + "step": 8290 + }, + { + "epoch": 487.94, + "grad_norm": 0.3592970371246338, + "learning_rate": 7.341176470588235e-06, + "loss": 0.2674, + "step": 8295 + }, + { + "epoch": 488.24, + "grad_norm": 0.4181000292301178, + "learning_rate": 7.16470588235294e-06, + "loss": 0.2677, + "step": 8300 + }, + { + "epoch": 488.24, + "eval_loss": 0.26670971512794495, + "eval_runtime": 1.9597, + "eval_samples_per_second": 68.376, + "eval_steps_per_second": 8.675, + "step": 8300 + }, + { + "epoch": 488.53, + "grad_norm": 0.38137099146842957, + "learning_rate": 6.988235294117646e-06, + "loss": 0.2635, + "step": 8305 + }, + { + "epoch": 488.82, + "grad_norm": 0.4150102734565735, + "learning_rate": 6.8117647058823524e-06, + "loss": 0.2641, + "step": 8310 + }, + { + "epoch": 489.12, + "grad_norm": 0.2933318018913269, + "learning_rate": 6.635294117647058e-06, + "loss": 0.2778, + "step": 8315 + }, + { + "epoch": 489.41, + "grad_norm": 0.34779807925224304, + "learning_rate": 6.458823529411765e-06, + "loss": 0.27, + "step": 8320 + }, + { + "epoch": 489.41, + "eval_loss": 0.2666507661342621, + "eval_runtime": 1.9581, + "eval_samples_per_second": 68.433, + "eval_steps_per_second": 8.682, + "step": 8320 + }, + { + "epoch": 489.71, + "grad_norm": 0.37067240476608276, + "learning_rate": 6.28235294117647e-06, + "loss": 0.2612, + "step": 8325 + }, + { + "epoch": 490.0, + "grad_norm": 0.40835967659950256, + "learning_rate": 6.105882352941175e-06, + "loss": 0.2666, + "step": 8330 + }, + { + "epoch": 490.29, + "grad_norm": 0.3351464867591858, + "learning_rate": 5.929411764705882e-06, + "loss": 0.2677, + "step": 8335 + }, + { + "epoch": 490.59, + "grad_norm": 0.35004377365112305, + "learning_rate": 5.752941176470588e-06, + "loss": 0.2767, + "step": 8340 + }, + { + "epoch": 490.59, + "eval_loss": 0.2665998637676239, + "eval_runtime": 1.9596, + "eval_samples_per_second": 68.381, + "eval_steps_per_second": 8.675, + "step": 8340 + }, + { + "epoch": 490.88, + "grad_norm": 0.5012837648391724, + "learning_rate": 5.576470588235294e-06, + "loss": 0.2663, + "step": 8345 + }, + { + "epoch": 491.18, + "grad_norm": 0.41551175713539124, + "learning_rate": 5.399999999999999e-06, + "loss": 0.2791, + "step": 8350 + }, + { + "epoch": 491.47, + "grad_norm": 0.32633715867996216, + "learning_rate": 5.223529411764705e-06, + "loss": 0.2732, + "step": 8355 + }, + { + "epoch": 491.76, + "grad_norm": 0.295227974653244, + "learning_rate": 5.0470588235294114e-06, + "loss": 0.2471, + "step": 8360 + }, + { + "epoch": 491.76, + "eval_loss": 0.26662540435791016, + "eval_runtime": 1.9894, + "eval_samples_per_second": 67.359, + "eval_steps_per_second": 8.545, + "step": 8360 + }, + { + "epoch": 492.06, + "grad_norm": 0.3564363121986389, + "learning_rate": 4.870588235294117e-06, + "loss": 0.2596, + "step": 8365 + }, + { + "epoch": 492.35, + "grad_norm": 0.38165000081062317, + "learning_rate": 4.694117647058824e-06, + "loss": 0.2578, + "step": 8370 + }, + { + "epoch": 492.65, + "grad_norm": 0.3212469518184662, + "learning_rate": 4.5176470588235295e-06, + "loss": 0.267, + "step": 8375 + }, + { + "epoch": 492.94, + "grad_norm": 0.4372556805610657, + "learning_rate": 4.341176470588234e-06, + "loss": 0.2839, + "step": 8380 + }, + { + "epoch": 492.94, + "eval_loss": 0.26661908626556396, + "eval_runtime": 1.9611, + "eval_samples_per_second": 68.328, + "eval_steps_per_second": 8.669, + "step": 8380 + }, + { + "epoch": 493.24, + "grad_norm": 0.35395434498786926, + "learning_rate": 4.164705882352941e-06, + "loss": 0.2528, + "step": 8385 + }, + { + "epoch": 493.53, + "grad_norm": 0.3885289132595062, + "learning_rate": 3.988235294117647e-06, + "loss": 0.2849, + "step": 8390 + }, + { + "epoch": 493.82, + "grad_norm": 0.3720896542072296, + "learning_rate": 3.8117647058823524e-06, + "loss": 0.2633, + "step": 8395 + }, + { + "epoch": 494.12, + "grad_norm": 0.35971447825431824, + "learning_rate": 3.6352941176470585e-06, + "loss": 0.2601, + "step": 8400 + }, + { + "epoch": 494.12, + "eval_loss": 0.26667940616607666, + "eval_runtime": 1.9603, + "eval_samples_per_second": 68.356, + "eval_steps_per_second": 8.672, + "step": 8400 + }, + { + "epoch": 494.41, + "grad_norm": 0.46969982981681824, + "learning_rate": 3.4588235294117643e-06, + "loss": 0.2707, + "step": 8405 + }, + { + "epoch": 494.71, + "grad_norm": 0.36853352189064026, + "learning_rate": 3.2823529411764704e-06, + "loss": 0.2825, + "step": 8410 + }, + { + "epoch": 495.0, + "grad_norm": 0.589340329170227, + "learning_rate": 3.1058823529411766e-06, + "loss": 0.2617, + "step": 8415 + }, + { + "epoch": 495.29, + "grad_norm": 0.3106718063354492, + "learning_rate": 2.929411764705882e-06, + "loss": 0.2784, + "step": 8420 + }, + { + "epoch": 495.29, + "eval_loss": 0.2666472792625427, + "eval_runtime": 1.9622, + "eval_samples_per_second": 68.289, + "eval_steps_per_second": 8.664, + "step": 8420 + }, + { + "epoch": 495.59, + "grad_norm": 0.40149345993995667, + "learning_rate": 2.752941176470588e-06, + "loss": 0.2733, + "step": 8425 + }, + { + "epoch": 495.88, + "grad_norm": 0.28339532017707825, + "learning_rate": 2.5764705882352937e-06, + "loss": 0.2393, + "step": 8430 + }, + { + "epoch": 496.18, + "grad_norm": 0.447640061378479, + "learning_rate": 2.4e-06, + "loss": 0.3095, + "step": 8435 + }, + { + "epoch": 496.47, + "grad_norm": 0.3531031608581543, + "learning_rate": 2.2235294117647056e-06, + "loss": 0.2516, + "step": 8440 + }, + { + "epoch": 496.47, + "eval_loss": 0.26662933826446533, + "eval_runtime": 1.961, + "eval_samples_per_second": 68.331, + "eval_steps_per_second": 8.669, + "step": 8440 + }, + { + "epoch": 496.76, + "grad_norm": 0.4573391079902649, + "learning_rate": 2.0470588235294113e-06, + "loss": 0.2743, + "step": 8445 + }, + { + "epoch": 497.06, + "grad_norm": 0.48423144221305847, + "learning_rate": 1.8705882352941173e-06, + "loss": 0.2629, + "step": 8450 + }, + { + "epoch": 497.35, + "grad_norm": 0.42150747776031494, + "learning_rate": 1.6941176470588234e-06, + "loss": 0.2651, + "step": 8455 + }, + { + "epoch": 497.65, + "grad_norm": 0.34400367736816406, + "learning_rate": 1.5176470588235294e-06, + "loss": 0.2638, + "step": 8460 + }, + { + "epoch": 497.65, + "eval_loss": 0.2666572332382202, + "eval_runtime": 1.9831, + "eval_samples_per_second": 67.572, + "eval_steps_per_second": 8.573, + "step": 8460 + }, + { + "epoch": 497.94, + "grad_norm": 0.43939855694770813, + "learning_rate": 1.3411764705882351e-06, + "loss": 0.2695, + "step": 8465 + }, + { + "epoch": 498.24, + "grad_norm": 0.3315202295780182, + "learning_rate": 1.164705882352941e-06, + "loss": 0.2536, + "step": 8470 + }, + { + "epoch": 498.53, + "grad_norm": 0.34936991333961487, + "learning_rate": 9.88235294117647e-07, + "loss": 0.2691, + "step": 8475 + }, + { + "epoch": 498.82, + "grad_norm": 0.35634204745292664, + "learning_rate": 8.117647058823528e-07, + "loss": 0.2736, + "step": 8480 + }, + { + "epoch": 498.82, + "eval_loss": 0.26657968759536743, + "eval_runtime": 1.9605, + "eval_samples_per_second": 68.349, + "eval_steps_per_second": 8.671, + "step": 8480 + }, + { + "epoch": 499.12, + "grad_norm": 0.27652254700660706, + "learning_rate": 6.352941176470588e-07, + "loss": 0.2719, + "step": 8485 + }, + { + "epoch": 499.41, + "grad_norm": 0.33388492465019226, + "learning_rate": 4.588235294117647e-07, + "loss": 0.2403, + "step": 8490 + }, + { + "epoch": 499.71, + "grad_norm": 0.31336426734924316, + "learning_rate": 2.823529411764706e-07, + "loss": 0.274, + "step": 8495 + }, + { + "epoch": 500.0, + "grad_norm": 0.5548220872879028, + "learning_rate": 1.0588235294117647e-07, + "loss": 0.2957, + "step": 8500 + }, + { + "epoch": 500.0, + "eval_loss": 0.2666637897491455, + "eval_runtime": 1.9597, + "eval_samples_per_second": 68.377, + "eval_steps_per_second": 8.675, + "step": 8500 + } + ], + "logging_steps": 5, + "max_steps": 8500, + "num_input_tokens_seen": 0, + "num_train_epochs": 500, + "save_steps": 20, + "total_flos": 7.61871316058112e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}