diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5199 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999279383151978, + "eval_steps": 100, + "global_step": 3469, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014412336960438136, + "grad_norm": 15.083786010742188, + "learning_rate": 2.8735632183908047e-07, + "loss": 2.483, + "step": 5 + }, + { + "epoch": 0.0028824673920876272, + "grad_norm": 14.648148536682129, + "learning_rate": 5.747126436781609e-07, + "loss": 2.217, + "step": 10 + }, + { + "epoch": 0.004323701088131441, + "grad_norm": 15.199370384216309, + "learning_rate": 8.620689655172415e-07, + "loss": 2.2391, + "step": 15 + }, + { + "epoch": 0.0057649347841752544, + "grad_norm": 15.044349670410156, + "learning_rate": 1.1494252873563219e-06, + "loss": 2.4945, + "step": 20 + }, + { + "epoch": 0.007206168480219067, + "grad_norm": 14.505260467529297, + "learning_rate": 1.4367816091954023e-06, + "loss": 2.2728, + "step": 25 + }, + { + "epoch": 0.008647402176262881, + "grad_norm": 14.381186485290527, + "learning_rate": 1.724137931034483e-06, + "loss": 2.1199, + "step": 30 + }, + { + "epoch": 0.010088635872306694, + "grad_norm": 12.466582298278809, + "learning_rate": 2.0114942528735633e-06, + "loss": 1.8471, + "step": 35 + }, + { + "epoch": 0.011529869568350509, + "grad_norm": 13.661079406738281, + "learning_rate": 2.2988505747126437e-06, + "loss": 1.7115, + "step": 40 + }, + { + "epoch": 0.012971103264394322, + "grad_norm": 13.910940170288086, + "learning_rate": 2.5862068965517246e-06, + "loss": 1.7219, + "step": 45 + }, + { + "epoch": 0.014412336960438135, + "grad_norm": 14.404576301574707, + "learning_rate": 2.8735632183908046e-06, + "loss": 1.6988, + "step": 50 + }, + { + "epoch": 0.01585357065648195, + "grad_norm": 13.622361183166504, + "learning_rate": 3.1609195402298854e-06, + "loss": 1.1913, + "step": 55 + }, + { + "epoch": 0.017294804352525762, + "grad_norm": 4.847095966339111, + "learning_rate": 3.448275862068966e-06, + "loss": 0.9125, + "step": 60 + }, + { + "epoch": 0.018736038048569575, + "grad_norm": 5.12880277633667, + "learning_rate": 3.7356321839080462e-06, + "loss": 0.8373, + "step": 65 + }, + { + "epoch": 0.02017727174461339, + "grad_norm": 1.7816091775894165, + "learning_rate": 4.022988505747127e-06, + "loss": 0.6926, + "step": 70 + }, + { + "epoch": 0.0216185054406572, + "grad_norm": 2.607332706451416, + "learning_rate": 4.310344827586207e-06, + "loss": 0.6349, + "step": 75 + }, + { + "epoch": 0.023059739136701018, + "grad_norm": 1.3011763095855713, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.557, + "step": 80 + }, + { + "epoch": 0.02450097283274483, + "grad_norm": 1.5208930969238281, + "learning_rate": 4.885057471264369e-06, + "loss": 0.5506, + "step": 85 + }, + { + "epoch": 0.025942206528788644, + "grad_norm": 2.021416425704956, + "learning_rate": 5.172413793103449e-06, + "loss": 0.6639, + "step": 90 + }, + { + "epoch": 0.027383440224832457, + "grad_norm": 1.7699145078659058, + "learning_rate": 5.45977011494253e-06, + "loss": 0.4614, + "step": 95 + }, + { + "epoch": 0.02882467392087627, + "grad_norm": 4.601930141448975, + "learning_rate": 5.747126436781609e-06, + "loss": 0.5361, + "step": 100 + }, + { + "epoch": 0.02882467392087627, + "eval_loss": 0.49337437748908997, + "eval_mse": 0.49337437438964843, + "eval_runtime": 3.4708, + "eval_samples_per_second": 288.122, + "eval_steps_per_second": 18.152, + "step": 100 + }, + { + "epoch": 0.030265907616920083, + "grad_norm": 4.816018581390381, + "learning_rate": 6.03448275862069e-06, + "loss": 0.5876, + "step": 105 + }, + { + "epoch": 0.0317071413129639, + "grad_norm": 4.016970157623291, + "learning_rate": 6.321839080459771e-06, + "loss": 0.4752, + "step": 110 + }, + { + "epoch": 0.03314837500900771, + "grad_norm": 2.397547483444214, + "learning_rate": 6.609195402298851e-06, + "loss": 0.5025, + "step": 115 + }, + { + "epoch": 0.034589608705051525, + "grad_norm": 2.304940938949585, + "learning_rate": 6.896551724137932e-06, + "loss": 0.4742, + "step": 120 + }, + { + "epoch": 0.03603084240109534, + "grad_norm": 4.651687145233154, + "learning_rate": 7.183908045977011e-06, + "loss": 0.4382, + "step": 125 + }, + { + "epoch": 0.03747207609713915, + "grad_norm": 2.2445216178894043, + "learning_rate": 7.4712643678160925e-06, + "loss": 0.3867, + "step": 130 + }, + { + "epoch": 0.038913309793182964, + "grad_norm": 2.322697877883911, + "learning_rate": 7.758620689655173e-06, + "loss": 0.4629, + "step": 135 + }, + { + "epoch": 0.04035454348922678, + "grad_norm": 2.4827287197113037, + "learning_rate": 8.045977011494253e-06, + "loss": 0.3633, + "step": 140 + }, + { + "epoch": 0.04179577718527059, + "grad_norm": 3.959317445755005, + "learning_rate": 8.333333333333334e-06, + "loss": 0.3743, + "step": 145 + }, + { + "epoch": 0.0432370108813144, + "grad_norm": 2.0469164848327637, + "learning_rate": 8.620689655172414e-06, + "loss": 0.3836, + "step": 150 + }, + { + "epoch": 0.044678244577358216, + "grad_norm": 2.5716373920440674, + "learning_rate": 8.908045977011495e-06, + "loss": 0.3875, + "step": 155 + }, + { + "epoch": 0.046119478273402036, + "grad_norm": 6.04033088684082, + "learning_rate": 9.195402298850575e-06, + "loss": 0.3584, + "step": 160 + }, + { + "epoch": 0.04756071196944585, + "grad_norm": 10.341621398925781, + "learning_rate": 9.482758620689655e-06, + "loss": 0.3824, + "step": 165 + }, + { + "epoch": 0.04900194566548966, + "grad_norm": 3.144866704940796, + "learning_rate": 9.770114942528738e-06, + "loss": 0.3281, + "step": 170 + }, + { + "epoch": 0.050443179361533474, + "grad_norm": 5.703673362731934, + "learning_rate": 9.996965098634295e-06, + "loss": 0.354, + "step": 175 + }, + { + "epoch": 0.05188441305757729, + "grad_norm": 2.0394303798675537, + "learning_rate": 9.981790591805767e-06, + "loss": 0.3734, + "step": 180 + }, + { + "epoch": 0.0533256467536211, + "grad_norm": 3.995746374130249, + "learning_rate": 9.966616084977238e-06, + "loss": 0.3225, + "step": 185 + }, + { + "epoch": 0.05476688044966491, + "grad_norm": 8.836956977844238, + "learning_rate": 9.951441578148711e-06, + "loss": 0.3439, + "step": 190 + }, + { + "epoch": 0.056208114145708726, + "grad_norm": 2.608346939086914, + "learning_rate": 9.936267071320182e-06, + "loss": 0.371, + "step": 195 + }, + { + "epoch": 0.05764934784175254, + "grad_norm": 4.031014442443848, + "learning_rate": 9.921092564491654e-06, + "loss": 0.3483, + "step": 200 + }, + { + "epoch": 0.05764934784175254, + "eval_loss": 0.352454274892807, + "eval_mse": 0.3524542865753174, + "eval_runtime": 3.6486, + "eval_samples_per_second": 274.078, + "eval_steps_per_second": 17.267, + "step": 200 + }, + { + "epoch": 0.05909058153779635, + "grad_norm": 4.824347972869873, + "learning_rate": 9.905918057663127e-06, + "loss": 0.3258, + "step": 205 + }, + { + "epoch": 0.060531815233840165, + "grad_norm": 2.2714250087738037, + "learning_rate": 9.890743550834598e-06, + "loss": 0.3321, + "step": 210 + }, + { + "epoch": 0.06197304892988398, + "grad_norm": 5.385800361633301, + "learning_rate": 9.87556904400607e-06, + "loss": 0.2992, + "step": 215 + }, + { + "epoch": 0.0634142826259278, + "grad_norm": 2.349539279937744, + "learning_rate": 9.860394537177543e-06, + "loss": 0.3111, + "step": 220 + }, + { + "epoch": 0.0648555163219716, + "grad_norm": 3.6904749870300293, + "learning_rate": 9.845220030349014e-06, + "loss": 0.3812, + "step": 225 + }, + { + "epoch": 0.06629675001801542, + "grad_norm": 3.031271457672119, + "learning_rate": 9.830045523520486e-06, + "loss": 0.2955, + "step": 230 + }, + { + "epoch": 0.06773798371405923, + "grad_norm": 5.256127834320068, + "learning_rate": 9.814871016691959e-06, + "loss": 0.3384, + "step": 235 + }, + { + "epoch": 0.06917921741010305, + "grad_norm": 3.7872514724731445, + "learning_rate": 9.79969650986343e-06, + "loss": 0.3169, + "step": 240 + }, + { + "epoch": 0.07062045110614686, + "grad_norm": 2.257871150970459, + "learning_rate": 9.784522003034902e-06, + "loss": 0.3045, + "step": 245 + }, + { + "epoch": 0.07206168480219068, + "grad_norm": 3.1732468605041504, + "learning_rate": 9.769347496206375e-06, + "loss": 0.2859, + "step": 250 + }, + { + "epoch": 0.07350291849823448, + "grad_norm": 1.8080546855926514, + "learning_rate": 9.754172989377846e-06, + "loss": 0.2752, + "step": 255 + }, + { + "epoch": 0.0749441521942783, + "grad_norm": 3.3401401042938232, + "learning_rate": 9.738998482549317e-06, + "loss": 0.3189, + "step": 260 + }, + { + "epoch": 0.07638538589032212, + "grad_norm": 4.685635089874268, + "learning_rate": 9.72382397572079e-06, + "loss": 0.3365, + "step": 265 + }, + { + "epoch": 0.07782661958636593, + "grad_norm": 3.807539224624634, + "learning_rate": 9.708649468892262e-06, + "loss": 0.3306, + "step": 270 + }, + { + "epoch": 0.07926785328240975, + "grad_norm": 2.541351795196533, + "learning_rate": 9.693474962063733e-06, + "loss": 0.3665, + "step": 275 + }, + { + "epoch": 0.08070908697845355, + "grad_norm": 2.3847010135650635, + "learning_rate": 9.678300455235205e-06, + "loss": 0.3359, + "step": 280 + }, + { + "epoch": 0.08215032067449737, + "grad_norm": 9.623117446899414, + "learning_rate": 9.663125948406678e-06, + "loss": 0.3699, + "step": 285 + }, + { + "epoch": 0.08359155437054118, + "grad_norm": 9.328727722167969, + "learning_rate": 9.64795144157815e-06, + "loss": 0.3608, + "step": 290 + }, + { + "epoch": 0.085032788066585, + "grad_norm": 6.9004435539245605, + "learning_rate": 9.63277693474962e-06, + "loss": 0.338, + "step": 295 + }, + { + "epoch": 0.0864740217626288, + "grad_norm": 5.4706830978393555, + "learning_rate": 9.617602427921094e-06, + "loss": 0.3238, + "step": 300 + }, + { + "epoch": 0.0864740217626288, + "eval_loss": 0.29314419627189636, + "eval_mse": 0.2931441988945007, + "eval_runtime": 3.5222, + "eval_samples_per_second": 283.916, + "eval_steps_per_second": 17.887, + "step": 300 + }, + { + "epoch": 0.08791525545867263, + "grad_norm": 2.093715190887451, + "learning_rate": 9.602427921092565e-06, + "loss": 0.3098, + "step": 305 + }, + { + "epoch": 0.08935648915471643, + "grad_norm": 2.114649534225464, + "learning_rate": 9.587253414264037e-06, + "loss": 0.2742, + "step": 310 + }, + { + "epoch": 0.09079772285076025, + "grad_norm": 7.1367387771606445, + "learning_rate": 9.57207890743551e-06, + "loss": 0.3443, + "step": 315 + }, + { + "epoch": 0.09223895654680407, + "grad_norm": 3.2078042030334473, + "learning_rate": 9.556904400606981e-06, + "loss": 0.2965, + "step": 320 + }, + { + "epoch": 0.09368019024284788, + "grad_norm": 4.233352184295654, + "learning_rate": 9.541729893778452e-06, + "loss": 0.2553, + "step": 325 + }, + { + "epoch": 0.0951214239388917, + "grad_norm": 8.637604713439941, + "learning_rate": 9.526555386949926e-06, + "loss": 0.3027, + "step": 330 + }, + { + "epoch": 0.0965626576349355, + "grad_norm": 2.668332576751709, + "learning_rate": 9.511380880121397e-06, + "loss": 0.3195, + "step": 335 + }, + { + "epoch": 0.09800389133097932, + "grad_norm": 5.594767093658447, + "learning_rate": 9.496206373292868e-06, + "loss": 0.3111, + "step": 340 + }, + { + "epoch": 0.09944512502702313, + "grad_norm": 1.80435049533844, + "learning_rate": 9.481031866464341e-06, + "loss": 0.2833, + "step": 345 + }, + { + "epoch": 0.10088635872306695, + "grad_norm": 6.785181045532227, + "learning_rate": 9.465857359635813e-06, + "loss": 0.3184, + "step": 350 + }, + { + "epoch": 0.10232759241911075, + "grad_norm": 7.8451008796691895, + "learning_rate": 9.450682852807284e-06, + "loss": 0.3031, + "step": 355 + }, + { + "epoch": 0.10376882611515457, + "grad_norm": 2.5645573139190674, + "learning_rate": 9.435508345978757e-06, + "loss": 0.2877, + "step": 360 + }, + { + "epoch": 0.10521005981119838, + "grad_norm": 2.994230031967163, + "learning_rate": 9.420333839150229e-06, + "loss": 0.3102, + "step": 365 + }, + { + "epoch": 0.1066512935072422, + "grad_norm": 5.640050411224365, + "learning_rate": 9.4051593323217e-06, + "loss": 0.293, + "step": 370 + }, + { + "epoch": 0.108092527203286, + "grad_norm": 7.134743690490723, + "learning_rate": 9.389984825493173e-06, + "loss": 0.3103, + "step": 375 + }, + { + "epoch": 0.10953376089932983, + "grad_norm": 3.617605209350586, + "learning_rate": 9.374810318664645e-06, + "loss": 0.2883, + "step": 380 + }, + { + "epoch": 0.11097499459537365, + "grad_norm": 1.7953038215637207, + "learning_rate": 9.359635811836116e-06, + "loss": 0.2946, + "step": 385 + }, + { + "epoch": 0.11241622829141745, + "grad_norm": 4.107087135314941, + "learning_rate": 9.344461305007587e-06, + "loss": 0.3091, + "step": 390 + }, + { + "epoch": 0.11385746198746127, + "grad_norm": 2.5875136852264404, + "learning_rate": 9.32928679817906e-06, + "loss": 0.2776, + "step": 395 + }, + { + "epoch": 0.11529869568350508, + "grad_norm": 2.0523602962493896, + "learning_rate": 9.314112291350532e-06, + "loss": 0.2734, + "step": 400 + }, + { + "epoch": 0.11529869568350508, + "eval_loss": 0.31296026706695557, + "eval_mse": 0.31296028327941894, + "eval_runtime": 3.4119, + "eval_samples_per_second": 293.091, + "eval_steps_per_second": 18.465, + "step": 400 + }, + { + "epoch": 0.1167399293795489, + "grad_norm": 2.2445826530456543, + "learning_rate": 9.298937784522003e-06, + "loss": 0.3148, + "step": 405 + }, + { + "epoch": 0.1181811630755927, + "grad_norm": 3.3342812061309814, + "learning_rate": 9.283763277693477e-06, + "loss": 0.2743, + "step": 410 + }, + { + "epoch": 0.11962239677163652, + "grad_norm": 2.5152697563171387, + "learning_rate": 9.268588770864948e-06, + "loss": 0.3092, + "step": 415 + }, + { + "epoch": 0.12106363046768033, + "grad_norm": 2.3605024814605713, + "learning_rate": 9.25341426403642e-06, + "loss": 0.3348, + "step": 420 + }, + { + "epoch": 0.12250486416372415, + "grad_norm": 4.979254245758057, + "learning_rate": 9.238239757207892e-06, + "loss": 0.3268, + "step": 425 + }, + { + "epoch": 0.12394609785976796, + "grad_norm": 2.3477885723114014, + "learning_rate": 9.223065250379364e-06, + "loss": 0.309, + "step": 430 + }, + { + "epoch": 0.12538733155581178, + "grad_norm": 2.372631549835205, + "learning_rate": 9.207890743550835e-06, + "loss": 0.2893, + "step": 435 + }, + { + "epoch": 0.1268285652518556, + "grad_norm": 4.252530097961426, + "learning_rate": 9.192716236722308e-06, + "loss": 0.2758, + "step": 440 + }, + { + "epoch": 0.1282697989478994, + "grad_norm": 5.626593112945557, + "learning_rate": 9.17754172989378e-06, + "loss": 0.2904, + "step": 445 + }, + { + "epoch": 0.1297110326439432, + "grad_norm": 6.885512351989746, + "learning_rate": 9.162367223065251e-06, + "loss": 0.2694, + "step": 450 + }, + { + "epoch": 0.13115226633998703, + "grad_norm": 3.3293423652648926, + "learning_rate": 9.147192716236724e-06, + "loss": 0.2826, + "step": 455 + }, + { + "epoch": 0.13259350003603085, + "grad_norm": 2.934630870819092, + "learning_rate": 9.132018209408196e-06, + "loss": 0.3319, + "step": 460 + }, + { + "epoch": 0.13403473373207467, + "grad_norm": 4.578772068023682, + "learning_rate": 9.116843702579667e-06, + "loss": 0.34, + "step": 465 + }, + { + "epoch": 0.13547596742811846, + "grad_norm": 2.9086310863494873, + "learning_rate": 9.10166919575114e-06, + "loss": 0.2912, + "step": 470 + }, + { + "epoch": 0.13691720112416228, + "grad_norm": 2.20721697807312, + "learning_rate": 9.08649468892261e-06, + "loss": 0.2737, + "step": 475 + }, + { + "epoch": 0.1383584348202061, + "grad_norm": 2.334871530532837, + "learning_rate": 9.071320182094083e-06, + "loss": 0.3058, + "step": 480 + }, + { + "epoch": 0.13979966851624992, + "grad_norm": 4.9962592124938965, + "learning_rate": 9.056145675265554e-06, + "loss": 0.2907, + "step": 485 + }, + { + "epoch": 0.1412409022122937, + "grad_norm": 5.542920112609863, + "learning_rate": 9.040971168437026e-06, + "loss": 0.3144, + "step": 490 + }, + { + "epoch": 0.14268213590833753, + "grad_norm": 2.817955255508423, + "learning_rate": 9.025796661608497e-06, + "loss": 0.2554, + "step": 495 + }, + { + "epoch": 0.14412336960438135, + "grad_norm": 3.7314751148223877, + "learning_rate": 9.01062215477997e-06, + "loss": 0.2891, + "step": 500 + }, + { + "epoch": 0.14412336960438135, + "eval_loss": 0.3298364281654358, + "eval_mse": 0.32983644971996545, + "eval_runtime": 3.402, + "eval_samples_per_second": 293.943, + "eval_steps_per_second": 18.518, + "step": 500 + }, + { + "epoch": 0.14556460330042517, + "grad_norm": 4.2389445304870605, + "learning_rate": 8.995447647951442e-06, + "loss": 0.2957, + "step": 505 + }, + { + "epoch": 0.14700583699646896, + "grad_norm": 4.599143981933594, + "learning_rate": 8.980273141122913e-06, + "loss": 0.3008, + "step": 510 + }, + { + "epoch": 0.14844707069251278, + "grad_norm": 7.0564751625061035, + "learning_rate": 8.965098634294386e-06, + "loss": 0.2908, + "step": 515 + }, + { + "epoch": 0.1498883043885566, + "grad_norm": 1.786997675895691, + "learning_rate": 8.949924127465858e-06, + "loss": 0.276, + "step": 520 + }, + { + "epoch": 0.15132953808460042, + "grad_norm": 3.3596365451812744, + "learning_rate": 8.934749620637329e-06, + "loss": 0.3179, + "step": 525 + }, + { + "epoch": 0.15277077178064424, + "grad_norm": 4.094123363494873, + "learning_rate": 8.919575113808802e-06, + "loss": 0.2689, + "step": 530 + }, + { + "epoch": 0.15421200547668804, + "grad_norm": 1.8090394735336304, + "learning_rate": 8.904400606980273e-06, + "loss": 0.266, + "step": 535 + }, + { + "epoch": 0.15565323917273186, + "grad_norm": 2.1146230697631836, + "learning_rate": 8.889226100151745e-06, + "loss": 0.2918, + "step": 540 + }, + { + "epoch": 0.15709447286877568, + "grad_norm": 2.7003800868988037, + "learning_rate": 8.874051593323218e-06, + "loss": 0.3014, + "step": 545 + }, + { + "epoch": 0.1585357065648195, + "grad_norm": 2.025775909423828, + "learning_rate": 8.85887708649469e-06, + "loss": 0.2933, + "step": 550 + }, + { + "epoch": 0.1599769402608633, + "grad_norm": 3.2832581996917725, + "learning_rate": 8.84370257966616e-06, + "loss": 0.2697, + "step": 555 + }, + { + "epoch": 0.1614181739569071, + "grad_norm": 4.607889175415039, + "learning_rate": 8.828528072837634e-06, + "loss": 0.32, + "step": 560 + }, + { + "epoch": 0.16285940765295093, + "grad_norm": 7.916262149810791, + "learning_rate": 8.813353566009105e-06, + "loss": 0.3381, + "step": 565 + }, + { + "epoch": 0.16430064134899475, + "grad_norm": 1.8572163581848145, + "learning_rate": 8.798179059180577e-06, + "loss": 0.2642, + "step": 570 + }, + { + "epoch": 0.16574187504503854, + "grad_norm": 1.9104336500167847, + "learning_rate": 8.78300455235205e-06, + "loss": 0.3111, + "step": 575 + }, + { + "epoch": 0.16718310874108236, + "grad_norm": 5.321012020111084, + "learning_rate": 8.767830045523521e-06, + "loss": 0.2776, + "step": 580 + }, + { + "epoch": 0.16862434243712618, + "grad_norm": 6.43941593170166, + "learning_rate": 8.752655538694993e-06, + "loss": 0.3395, + "step": 585 + }, + { + "epoch": 0.17006557613317, + "grad_norm": 4.0543646812438965, + "learning_rate": 8.737481031866466e-06, + "loss": 0.2403, + "step": 590 + }, + { + "epoch": 0.17150680982921382, + "grad_norm": 3.5150725841522217, + "learning_rate": 8.722306525037937e-06, + "loss": 0.2943, + "step": 595 + }, + { + "epoch": 0.1729480435252576, + "grad_norm": 4.989763259887695, + "learning_rate": 8.707132018209408e-06, + "loss": 0.2807, + "step": 600 + }, + { + "epoch": 0.1729480435252576, + "eval_loss": 0.26590582728385925, + "eval_mse": 0.2659058196991682, + "eval_runtime": 3.4384, + "eval_samples_per_second": 290.832, + "eval_steps_per_second": 18.322, + "step": 600 + }, + { + "epoch": 0.17438927722130143, + "grad_norm": 2.0770037174224854, + "learning_rate": 8.69195751138088e-06, + "loss": 0.2716, + "step": 605 + }, + { + "epoch": 0.17583051091734525, + "grad_norm": 3.540919065475464, + "learning_rate": 8.676783004552353e-06, + "loss": 0.2697, + "step": 610 + }, + { + "epoch": 0.17727174461338907, + "grad_norm": 4.88574743270874, + "learning_rate": 8.661608497723824e-06, + "loss": 0.2922, + "step": 615 + }, + { + "epoch": 0.17871297830943286, + "grad_norm": 2.6014974117279053, + "learning_rate": 8.646433990895296e-06, + "loss": 0.2942, + "step": 620 + }, + { + "epoch": 0.18015421200547668, + "grad_norm": 4.289388179779053, + "learning_rate": 8.631259484066769e-06, + "loss": 0.2707, + "step": 625 + }, + { + "epoch": 0.1815954457015205, + "grad_norm": 2.1737794876098633, + "learning_rate": 8.61608497723824e-06, + "loss": 0.2716, + "step": 630 + }, + { + "epoch": 0.18303667939756432, + "grad_norm": 3.533886432647705, + "learning_rate": 8.600910470409712e-06, + "loss": 0.2821, + "step": 635 + }, + { + "epoch": 0.18447791309360814, + "grad_norm": 3.032208204269409, + "learning_rate": 8.585735963581185e-06, + "loss": 0.2641, + "step": 640 + }, + { + "epoch": 0.18591914678965193, + "grad_norm": 1.685655951499939, + "learning_rate": 8.570561456752656e-06, + "loss": 0.3315, + "step": 645 + }, + { + "epoch": 0.18736038048569575, + "grad_norm": 1.627114176750183, + "learning_rate": 8.555386949924128e-06, + "loss": 0.2782, + "step": 650 + }, + { + "epoch": 0.18880161418173957, + "grad_norm": 3.437077760696411, + "learning_rate": 8.5402124430956e-06, + "loss": 0.2733, + "step": 655 + }, + { + "epoch": 0.1902428478777834, + "grad_norm": 3.599497079849243, + "learning_rate": 8.525037936267072e-06, + "loss": 0.2509, + "step": 660 + }, + { + "epoch": 0.1916840815738272, + "grad_norm": 2.423408269882202, + "learning_rate": 8.509863429438544e-06, + "loss": 0.2672, + "step": 665 + }, + { + "epoch": 0.193125315269871, + "grad_norm": 3.0533182621002197, + "learning_rate": 8.494688922610017e-06, + "loss": 0.2876, + "step": 670 + }, + { + "epoch": 0.19456654896591483, + "grad_norm": 2.392457962036133, + "learning_rate": 8.479514415781488e-06, + "loss": 0.254, + "step": 675 + }, + { + "epoch": 0.19600778266195865, + "grad_norm": 1.6407463550567627, + "learning_rate": 8.46433990895296e-06, + "loss": 0.2659, + "step": 680 + }, + { + "epoch": 0.19744901635800244, + "grad_norm": 2.979224443435669, + "learning_rate": 8.449165402124433e-06, + "loss": 0.2974, + "step": 685 + }, + { + "epoch": 0.19889025005404626, + "grad_norm": 5.143660545349121, + "learning_rate": 8.433990895295904e-06, + "loss": 0.2529, + "step": 690 + }, + { + "epoch": 0.20033148375009008, + "grad_norm": 2.1899147033691406, + "learning_rate": 8.418816388467375e-06, + "loss": 0.2532, + "step": 695 + }, + { + "epoch": 0.2017727174461339, + "grad_norm": 3.272409677505493, + "learning_rate": 8.403641881638848e-06, + "loss": 0.2727, + "step": 700 + }, + { + "epoch": 0.2017727174461339, + "eval_loss": 0.26900768280029297, + "eval_mse": 0.2690076799674425, + "eval_runtime": 3.5072, + "eval_samples_per_second": 285.129, + "eval_steps_per_second": 17.963, + "step": 700 + }, + { + "epoch": 0.20321395114217772, + "grad_norm": 3.7777695655822754, + "learning_rate": 8.38846737481032e-06, + "loss": 0.2701, + "step": 705 + }, + { + "epoch": 0.2046551848382215, + "grad_norm": 2.7959799766540527, + "learning_rate": 8.373292867981791e-06, + "loss": 0.2697, + "step": 710 + }, + { + "epoch": 0.20609641853426533, + "grad_norm": 2.1683778762817383, + "learning_rate": 8.358118361153263e-06, + "loss": 0.2919, + "step": 715 + }, + { + "epoch": 0.20753765223030915, + "grad_norm": 4.523341655731201, + "learning_rate": 8.342943854324736e-06, + "loss": 0.291, + "step": 720 + }, + { + "epoch": 0.20897888592635297, + "grad_norm": 3.7949328422546387, + "learning_rate": 8.327769347496207e-06, + "loss": 0.2673, + "step": 725 + }, + { + "epoch": 0.21042011962239676, + "grad_norm": 2.369877815246582, + "learning_rate": 8.312594840667679e-06, + "loss": 0.3002, + "step": 730 + }, + { + "epoch": 0.21186135331844058, + "grad_norm": 3.621488094329834, + "learning_rate": 8.297420333839152e-06, + "loss": 0.2715, + "step": 735 + }, + { + "epoch": 0.2133025870144844, + "grad_norm": 6.6333394050598145, + "learning_rate": 8.282245827010623e-06, + "loss": 0.3091, + "step": 740 + }, + { + "epoch": 0.21474382071052822, + "grad_norm": 4.557513236999512, + "learning_rate": 8.267071320182094e-06, + "loss": 0.3085, + "step": 745 + }, + { + "epoch": 0.216185054406572, + "grad_norm": 1.754773497581482, + "learning_rate": 8.251896813353568e-06, + "loss": 0.2687, + "step": 750 + }, + { + "epoch": 0.21762628810261583, + "grad_norm": 1.8777259588241577, + "learning_rate": 8.236722306525039e-06, + "loss": 0.261, + "step": 755 + }, + { + "epoch": 0.21906752179865965, + "grad_norm": 1.8965113162994385, + "learning_rate": 8.22154779969651e-06, + "loss": 0.2664, + "step": 760 + }, + { + "epoch": 0.22050875549470347, + "grad_norm": 1.9722626209259033, + "learning_rate": 8.206373292867983e-06, + "loss": 0.2866, + "step": 765 + }, + { + "epoch": 0.2219499891907473, + "grad_norm": 2.329744815826416, + "learning_rate": 8.191198786039455e-06, + "loss": 0.2818, + "step": 770 + }, + { + "epoch": 0.22339122288679109, + "grad_norm": 5.612697601318359, + "learning_rate": 8.176024279210926e-06, + "loss": 0.2852, + "step": 775 + }, + { + "epoch": 0.2248324565828349, + "grad_norm": 1.9772837162017822, + "learning_rate": 8.1608497723824e-06, + "loss": 0.2627, + "step": 780 + }, + { + "epoch": 0.22627369027887873, + "grad_norm": 1.7957868576049805, + "learning_rate": 8.14567526555387e-06, + "loss": 0.2903, + "step": 785 + }, + { + "epoch": 0.22771492397492255, + "grad_norm": 1.7506916522979736, + "learning_rate": 8.130500758725342e-06, + "loss": 0.2651, + "step": 790 + }, + { + "epoch": 0.22915615767096634, + "grad_norm": 4.4279046058654785, + "learning_rate": 8.115326251896815e-06, + "loss": 0.3138, + "step": 795 + }, + { + "epoch": 0.23059739136701016, + "grad_norm": 2.2973923683166504, + "learning_rate": 8.100151745068287e-06, + "loss": 0.2701, + "step": 800 + }, + { + "epoch": 0.23059739136701016, + "eval_loss": 0.25545039772987366, + "eval_mse": 0.2554504075245932, + "eval_runtime": 3.4804, + "eval_samples_per_second": 287.327, + "eval_steps_per_second": 18.102, + "step": 800 + }, + { + "epoch": 0.23203862506305398, + "grad_norm": 5.442203044891357, + "learning_rate": 8.084977238239758e-06, + "loss": 0.2784, + "step": 805 + }, + { + "epoch": 0.2334798587590978, + "grad_norm": 4.991244316101074, + "learning_rate": 8.06980273141123e-06, + "loss": 0.2731, + "step": 810 + }, + { + "epoch": 0.2349210924551416, + "grad_norm": 3.7908804416656494, + "learning_rate": 8.054628224582701e-06, + "loss": 0.253, + "step": 815 + }, + { + "epoch": 0.2363623261511854, + "grad_norm": 3.7784805297851562, + "learning_rate": 8.039453717754174e-06, + "loss": 0.292, + "step": 820 + }, + { + "epoch": 0.23780355984722923, + "grad_norm": 1.8876696825027466, + "learning_rate": 8.024279210925645e-06, + "loss": 0.2637, + "step": 825 + }, + { + "epoch": 0.23924479354327305, + "grad_norm": 2.832737922668457, + "learning_rate": 8.009104704097117e-06, + "loss": 0.2975, + "step": 830 + }, + { + "epoch": 0.24068602723931687, + "grad_norm": 3.5835187435150146, + "learning_rate": 7.993930197268588e-06, + "loss": 0.2746, + "step": 835 + }, + { + "epoch": 0.24212726093536066, + "grad_norm": 2.2903802394866943, + "learning_rate": 7.978755690440061e-06, + "loss": 0.2832, + "step": 840 + }, + { + "epoch": 0.24356849463140448, + "grad_norm": 2.3398866653442383, + "learning_rate": 7.963581183611533e-06, + "loss": 0.2816, + "step": 845 + }, + { + "epoch": 0.2450097283274483, + "grad_norm": 1.899884819984436, + "learning_rate": 7.948406676783004e-06, + "loss": 0.2942, + "step": 850 + }, + { + "epoch": 0.24645096202349212, + "grad_norm": 1.6949434280395508, + "learning_rate": 7.933232169954477e-06, + "loss": 0.2617, + "step": 855 + }, + { + "epoch": 0.2478921957195359, + "grad_norm": 2.029087543487549, + "learning_rate": 7.918057663125949e-06, + "loss": 0.2493, + "step": 860 + }, + { + "epoch": 0.24933342941557973, + "grad_norm": 2.05265474319458, + "learning_rate": 7.90288315629742e-06, + "loss": 0.2965, + "step": 865 + }, + { + "epoch": 0.25077466311162355, + "grad_norm": 4.606462001800537, + "learning_rate": 7.887708649468893e-06, + "loss": 0.2815, + "step": 870 + }, + { + "epoch": 0.2522158968076674, + "grad_norm": 2.490574359893799, + "learning_rate": 7.872534142640365e-06, + "loss": 0.2488, + "step": 875 + }, + { + "epoch": 0.2536571305037112, + "grad_norm": 1.7441041469573975, + "learning_rate": 7.857359635811836e-06, + "loss": 0.2621, + "step": 880 + }, + { + "epoch": 0.255098364199755, + "grad_norm": 2.0153064727783203, + "learning_rate": 7.842185128983309e-06, + "loss": 0.2612, + "step": 885 + }, + { + "epoch": 0.2565395978957988, + "grad_norm": 2.384290933609009, + "learning_rate": 7.82701062215478e-06, + "loss": 0.2721, + "step": 890 + }, + { + "epoch": 0.2579808315918426, + "grad_norm": 2.901371717453003, + "learning_rate": 7.811836115326252e-06, + "loss": 0.2611, + "step": 895 + }, + { + "epoch": 0.2594220652878864, + "grad_norm": 2.0064079761505127, + "learning_rate": 7.796661608497725e-06, + "loss": 0.2954, + "step": 900 + }, + { + "epoch": 0.2594220652878864, + "eval_loss": 0.25009381771087646, + "eval_mse": 0.2500938248075545, + "eval_runtime": 3.4662, + "eval_samples_per_second": 288.504, + "eval_steps_per_second": 18.176, + "step": 900 + }, + { + "epoch": 0.26086329898393024, + "grad_norm": 3.5631322860717773, + "learning_rate": 7.781487101669196e-06, + "loss": 0.2746, + "step": 905 + }, + { + "epoch": 0.26230453267997406, + "grad_norm": 2.9283640384674072, + "learning_rate": 7.766312594840668e-06, + "loss": 0.2791, + "step": 910 + }, + { + "epoch": 0.2637457663760179, + "grad_norm": 2.096801996231079, + "learning_rate": 7.75113808801214e-06, + "loss": 0.2549, + "step": 915 + }, + { + "epoch": 0.2651870000720617, + "grad_norm": 2.258312225341797, + "learning_rate": 7.735963581183612e-06, + "loss": 0.2512, + "step": 920 + }, + { + "epoch": 0.2666282337681055, + "grad_norm": 5.4184699058532715, + "learning_rate": 7.720789074355084e-06, + "loss": 0.2648, + "step": 925 + }, + { + "epoch": 0.26806946746414934, + "grad_norm": 2.8605494499206543, + "learning_rate": 7.705614567526557e-06, + "loss": 0.2619, + "step": 930 + }, + { + "epoch": 0.2695107011601931, + "grad_norm": 3.81048846244812, + "learning_rate": 7.690440060698028e-06, + "loss": 0.2873, + "step": 935 + }, + { + "epoch": 0.2709519348562369, + "grad_norm": 2.1442720890045166, + "learning_rate": 7.6752655538695e-06, + "loss": 0.2927, + "step": 940 + }, + { + "epoch": 0.27239316855228074, + "grad_norm": 2.3381333351135254, + "learning_rate": 7.660091047040971e-06, + "loss": 0.3048, + "step": 945 + }, + { + "epoch": 0.27383440224832456, + "grad_norm": 5.771536350250244, + "learning_rate": 7.644916540212444e-06, + "loss": 0.3202, + "step": 950 + }, + { + "epoch": 0.2752756359443684, + "grad_norm": 4.533117771148682, + "learning_rate": 7.629742033383915e-06, + "loss": 0.2824, + "step": 955 + }, + { + "epoch": 0.2767168696404122, + "grad_norm": 1.5523793697357178, + "learning_rate": 7.614567526555388e-06, + "loss": 0.2394, + "step": 960 + }, + { + "epoch": 0.278158103336456, + "grad_norm": 3.8669960498809814, + "learning_rate": 7.599393019726859e-06, + "loss": 0.2838, + "step": 965 + }, + { + "epoch": 0.27959933703249984, + "grad_norm": 6.376443862915039, + "learning_rate": 7.584218512898331e-06, + "loss": 0.3049, + "step": 970 + }, + { + "epoch": 0.28104057072854366, + "grad_norm": 2.06191349029541, + "learning_rate": 7.569044006069804e-06, + "loss": 0.2668, + "step": 975 + }, + { + "epoch": 0.2824818044245874, + "grad_norm": 8.73715877532959, + "learning_rate": 7.553869499241275e-06, + "loss": 0.3139, + "step": 980 + }, + { + "epoch": 0.28392303812063124, + "grad_norm": 2.4534220695495605, + "learning_rate": 7.538694992412747e-06, + "loss": 0.276, + "step": 985 + }, + { + "epoch": 0.28536427181667506, + "grad_norm": 1.751397728919983, + "learning_rate": 7.5235204855842195e-06, + "loss": 0.2545, + "step": 990 + }, + { + "epoch": 0.2868055055127189, + "grad_norm": 1.4238481521606445, + "learning_rate": 7.508345978755691e-06, + "loss": 0.2841, + "step": 995 + }, + { + "epoch": 0.2882467392087627, + "grad_norm": 1.4616000652313232, + "learning_rate": 7.493171471927163e-06, + "loss": 0.2618, + "step": 1000 + }, + { + "epoch": 0.2882467392087627, + "eval_loss": 0.248306006193161, + "eval_mse": 0.24830602062121035, + "eval_runtime": 3.5569, + "eval_samples_per_second": 281.142, + "eval_steps_per_second": 17.712, + "step": 1000 + }, + { + "epoch": 0.2896879729048065, + "grad_norm": 2.0549042224884033, + "learning_rate": 7.477996965098635e-06, + "loss": 0.2566, + "step": 1005 + }, + { + "epoch": 0.29112920660085034, + "grad_norm": 4.6840715408325195, + "learning_rate": 7.462822458270107e-06, + "loss": 0.3632, + "step": 1010 + }, + { + "epoch": 0.29257044029689416, + "grad_norm": 1.8822166919708252, + "learning_rate": 7.447647951441579e-06, + "loss": 0.2745, + "step": 1015 + }, + { + "epoch": 0.2940116739929379, + "grad_norm": 4.04414176940918, + "learning_rate": 7.4324734446130505e-06, + "loss": 0.2286, + "step": 1020 + }, + { + "epoch": 0.29545290768898175, + "grad_norm": 1.8269232511520386, + "learning_rate": 7.417298937784523e-06, + "loss": 0.2764, + "step": 1025 + }, + { + "epoch": 0.29689414138502557, + "grad_norm": 2.176198720932007, + "learning_rate": 7.402124430955995e-06, + "loss": 0.271, + "step": 1030 + }, + { + "epoch": 0.2983353750810694, + "grad_norm": 2.1249961853027344, + "learning_rate": 7.386949924127466e-06, + "loss": 0.2552, + "step": 1035 + }, + { + "epoch": 0.2997766087771132, + "grad_norm": 2.1173455715179443, + "learning_rate": 7.371775417298939e-06, + "loss": 0.2931, + "step": 1040 + }, + { + "epoch": 0.301217842473157, + "grad_norm": 1.8627756834030151, + "learning_rate": 7.356600910470411e-06, + "loss": 0.2653, + "step": 1045 + }, + { + "epoch": 0.30265907616920085, + "grad_norm": 3.421990394592285, + "learning_rate": 7.341426403641882e-06, + "loss": 0.2465, + "step": 1050 + }, + { + "epoch": 0.30410030986524467, + "grad_norm": 5.541344165802002, + "learning_rate": 7.3262518968133545e-06, + "loss": 0.2628, + "step": 1055 + }, + { + "epoch": 0.3055415435612885, + "grad_norm": 4.115916728973389, + "learning_rate": 7.311077389984827e-06, + "loss": 0.2809, + "step": 1060 + }, + { + "epoch": 0.30698277725733225, + "grad_norm": 5.273794651031494, + "learning_rate": 7.295902883156298e-06, + "loss": 0.2583, + "step": 1065 + }, + { + "epoch": 0.30842401095337607, + "grad_norm": 2.4409689903259277, + "learning_rate": 7.2807283763277704e-06, + "loss": 0.2551, + "step": 1070 + }, + { + "epoch": 0.3098652446494199, + "grad_norm": 1.662716269493103, + "learning_rate": 7.265553869499242e-06, + "loss": 0.2529, + "step": 1075 + }, + { + "epoch": 0.3113064783454637, + "grad_norm": 1.8716989755630493, + "learning_rate": 7.250379362670714e-06, + "loss": 0.2837, + "step": 1080 + }, + { + "epoch": 0.31274771204150753, + "grad_norm": 1.728489637374878, + "learning_rate": 7.235204855842186e-06, + "loss": 0.2885, + "step": 1085 + }, + { + "epoch": 0.31418894573755135, + "grad_norm": 2.6448850631713867, + "learning_rate": 7.220030349013658e-06, + "loss": 0.2883, + "step": 1090 + }, + { + "epoch": 0.31563017943359517, + "grad_norm": 2.3963072299957275, + "learning_rate": 7.20485584218513e-06, + "loss": 0.2581, + "step": 1095 + }, + { + "epoch": 0.317071413129639, + "grad_norm": 3.5989487171173096, + "learning_rate": 7.189681335356602e-06, + "loss": 0.3081, + "step": 1100 + }, + { + "epoch": 0.317071413129639, + "eval_loss": 0.2456330806016922, + "eval_mse": 0.2456330911256373, + "eval_runtime": 3.4886, + "eval_samples_per_second": 286.649, + "eval_steps_per_second": 18.059, + "step": 1100 + }, + { + "epoch": 0.3185126468256828, + "grad_norm": 6.0055012702941895, + "learning_rate": 7.174506828528074e-06, + "loss": 0.2705, + "step": 1105 + }, + { + "epoch": 0.3199538805217266, + "grad_norm": 2.128803014755249, + "learning_rate": 7.159332321699546e-06, + "loss": 0.2451, + "step": 1110 + }, + { + "epoch": 0.3213951142177704, + "grad_norm": 4.768571376800537, + "learning_rate": 7.144157814871018e-06, + "loss": 0.3035, + "step": 1115 + }, + { + "epoch": 0.3228363479138142, + "grad_norm": 3.2803542613983154, + "learning_rate": 7.1289833080424896e-06, + "loss": 0.2706, + "step": 1120 + }, + { + "epoch": 0.32427758160985803, + "grad_norm": 3.9689948558807373, + "learning_rate": 7.113808801213962e-06, + "loss": 0.2767, + "step": 1125 + }, + { + "epoch": 0.32571881530590185, + "grad_norm": 3.02150821685791, + "learning_rate": 7.098634294385432e-06, + "loss": 0.2778, + "step": 1130 + }, + { + "epoch": 0.3271600490019457, + "grad_norm": 1.8594034910202026, + "learning_rate": 7.083459787556905e-06, + "loss": 0.2732, + "step": 1135 + }, + { + "epoch": 0.3286012826979895, + "grad_norm": 5.168829917907715, + "learning_rate": 7.068285280728376e-06, + "loss": 0.2669, + "step": 1140 + }, + { + "epoch": 0.3300425163940333, + "grad_norm": 2.214434862136841, + "learning_rate": 7.053110773899848e-06, + "loss": 0.2847, + "step": 1145 + }, + { + "epoch": 0.3314837500900771, + "grad_norm": 8.464557647705078, + "learning_rate": 7.0379362670713205e-06, + "loss": 0.2758, + "step": 1150 + }, + { + "epoch": 0.3329249837861209, + "grad_norm": 4.346470355987549, + "learning_rate": 7.022761760242792e-06, + "loss": 0.2456, + "step": 1155 + }, + { + "epoch": 0.3343662174821647, + "grad_norm": 2.3579695224761963, + "learning_rate": 7.007587253414264e-06, + "loss": 0.2654, + "step": 1160 + }, + { + "epoch": 0.33580745117820854, + "grad_norm": 2.9135961532592773, + "learning_rate": 6.9924127465857364e-06, + "loss": 0.2676, + "step": 1165 + }, + { + "epoch": 0.33724868487425236, + "grad_norm": 2.272632122039795, + "learning_rate": 6.977238239757208e-06, + "loss": 0.2579, + "step": 1170 + }, + { + "epoch": 0.3386899185702962, + "grad_norm": 4.541927814483643, + "learning_rate": 6.96206373292868e-06, + "loss": 0.2448, + "step": 1175 + }, + { + "epoch": 0.34013115226634, + "grad_norm": 1.557702898979187, + "learning_rate": 6.946889226100152e-06, + "loss": 0.2729, + "step": 1180 + }, + { + "epoch": 0.3415723859623838, + "grad_norm": 5.71626615524292, + "learning_rate": 6.931714719271624e-06, + "loss": 0.2691, + "step": 1185 + }, + { + "epoch": 0.34301361965842764, + "grad_norm": 1.9943206310272217, + "learning_rate": 6.916540212443096e-06, + "loss": 0.2357, + "step": 1190 + }, + { + "epoch": 0.3444548533544714, + "grad_norm": 1.7325447797775269, + "learning_rate": 6.901365705614567e-06, + "loss": 0.2543, + "step": 1195 + }, + { + "epoch": 0.3458960870505152, + "grad_norm": 2.789851427078247, + "learning_rate": 6.88619119878604e-06, + "loss": 0.2544, + "step": 1200 + }, + { + "epoch": 0.3458960870505152, + "eval_loss": 0.23698876798152924, + "eval_mse": 0.2369887590147555, + "eval_runtime": 3.6793, + "eval_samples_per_second": 271.794, + "eval_steps_per_second": 17.123, + "step": 1200 + }, + { + "epoch": 0.34733732074655904, + "grad_norm": 3.181692361831665, + "learning_rate": 6.871016691957512e-06, + "loss": 0.2553, + "step": 1205 + }, + { + "epoch": 0.34877855444260286, + "grad_norm": 1.9078682661056519, + "learning_rate": 6.855842185128983e-06, + "loss": 0.2621, + "step": 1210 + }, + { + "epoch": 0.3502197881386467, + "grad_norm": 8.280984878540039, + "learning_rate": 6.8406676783004556e-06, + "loss": 0.2373, + "step": 1215 + }, + { + "epoch": 0.3516610218346905, + "grad_norm": 2.165010929107666, + "learning_rate": 6.825493171471928e-06, + "loss": 0.2674, + "step": 1220 + }, + { + "epoch": 0.3531022555307343, + "grad_norm": 1.7144526243209839, + "learning_rate": 6.810318664643399e-06, + "loss": 0.2628, + "step": 1225 + }, + { + "epoch": 0.35454348922677814, + "grad_norm": 2.039090394973755, + "learning_rate": 6.7951441578148715e-06, + "loss": 0.257, + "step": 1230 + }, + { + "epoch": 0.35598472292282196, + "grad_norm": 2.4305779933929443, + "learning_rate": 6.779969650986343e-06, + "loss": 0.2385, + "step": 1235 + }, + { + "epoch": 0.3574259566188657, + "grad_norm": 2.1004676818847656, + "learning_rate": 6.764795144157815e-06, + "loss": 0.2667, + "step": 1240 + }, + { + "epoch": 0.35886719031490955, + "grad_norm": 3.492386817932129, + "learning_rate": 6.749620637329287e-06, + "loss": 0.2809, + "step": 1245 + }, + { + "epoch": 0.36030842401095337, + "grad_norm": 2.710477352142334, + "learning_rate": 6.734446130500759e-06, + "loss": 0.2571, + "step": 1250 + }, + { + "epoch": 0.3617496577069972, + "grad_norm": 2.1155197620391846, + "learning_rate": 6.719271623672231e-06, + "loss": 0.2362, + "step": 1255 + }, + { + "epoch": 0.363190891403041, + "grad_norm": 2.056640625, + "learning_rate": 6.704097116843703e-06, + "loss": 0.2744, + "step": 1260 + }, + { + "epoch": 0.3646321250990848, + "grad_norm": 3.4779868125915527, + "learning_rate": 6.688922610015175e-06, + "loss": 0.2635, + "step": 1265 + }, + { + "epoch": 0.36607335879512864, + "grad_norm": 2.7803256511688232, + "learning_rate": 6.673748103186647e-06, + "loss": 0.2594, + "step": 1270 + }, + { + "epoch": 0.36751459249117246, + "grad_norm": 3.466438055038452, + "learning_rate": 6.658573596358119e-06, + "loss": 0.2791, + "step": 1275 + }, + { + "epoch": 0.3689558261872163, + "grad_norm": 1.9647109508514404, + "learning_rate": 6.643399089529591e-06, + "loss": 0.2589, + "step": 1280 + }, + { + "epoch": 0.37039705988326005, + "grad_norm": 3.195117950439453, + "learning_rate": 6.628224582701063e-06, + "loss": 0.2722, + "step": 1285 + }, + { + "epoch": 0.37183829357930387, + "grad_norm": 2.193518877029419, + "learning_rate": 6.613050075872534e-06, + "loss": 0.2467, + "step": 1290 + }, + { + "epoch": 0.3732795272753477, + "grad_norm": 3.5873641967773438, + "learning_rate": 6.5978755690440065e-06, + "loss": 0.2837, + "step": 1295 + }, + { + "epoch": 0.3747207609713915, + "grad_norm": 1.9536793231964111, + "learning_rate": 6.582701062215479e-06, + "loss": 0.2593, + "step": 1300 + }, + { + "epoch": 0.3747207609713915, + "eval_loss": 0.23485003411769867, + "eval_mse": 0.23485004005674273, + "eval_runtime": 3.6259, + "eval_samples_per_second": 275.793, + "eval_steps_per_second": 17.375, + "step": 1300 + }, + { + "epoch": 0.37616199466743533, + "grad_norm": 1.7588564157485962, + "learning_rate": 6.56752655538695e-06, + "loss": 0.2633, + "step": 1305 + }, + { + "epoch": 0.37760322836347915, + "grad_norm": 2.348902940750122, + "learning_rate": 6.552352048558422e-06, + "loss": 0.2499, + "step": 1310 + }, + { + "epoch": 0.37904446205952297, + "grad_norm": 3.767819881439209, + "learning_rate": 6.537177541729895e-06, + "loss": 0.2939, + "step": 1315 + }, + { + "epoch": 0.3804856957555668, + "grad_norm": 5.1513495445251465, + "learning_rate": 6.522003034901366e-06, + "loss": 0.2408, + "step": 1320 + }, + { + "epoch": 0.38192692945161055, + "grad_norm": 1.6535985469818115, + "learning_rate": 6.506828528072838e-06, + "loss": 0.2944, + "step": 1325 + }, + { + "epoch": 0.3833681631476544, + "grad_norm": 3.1404776573181152, + "learning_rate": 6.4916540212443106e-06, + "loss": 0.2764, + "step": 1330 + }, + { + "epoch": 0.3848093968436982, + "grad_norm": 5.461513996124268, + "learning_rate": 6.476479514415782e-06, + "loss": 0.2748, + "step": 1335 + }, + { + "epoch": 0.386250630539742, + "grad_norm": 3.2750980854034424, + "learning_rate": 6.461305007587254e-06, + "loss": 0.2512, + "step": 1340 + }, + { + "epoch": 0.38769186423578583, + "grad_norm": 1.7520034313201904, + "learning_rate": 6.446130500758726e-06, + "loss": 0.2625, + "step": 1345 + }, + { + "epoch": 0.38913309793182965, + "grad_norm": 2.008507251739502, + "learning_rate": 6.430955993930198e-06, + "loss": 0.2512, + "step": 1350 + }, + { + "epoch": 0.3905743316278735, + "grad_norm": 2.8682732582092285, + "learning_rate": 6.41578148710167e-06, + "loss": 0.2745, + "step": 1355 + }, + { + "epoch": 0.3920155653239173, + "grad_norm": 6.220619201660156, + "learning_rate": 6.4006069802731415e-06, + "loss": 0.2537, + "step": 1360 + }, + { + "epoch": 0.3934567990199611, + "grad_norm": 4.935564994812012, + "learning_rate": 6.385432473444614e-06, + "loss": 0.2733, + "step": 1365 + }, + { + "epoch": 0.3948980327160049, + "grad_norm": 3.9264633655548096, + "learning_rate": 6.370257966616086e-06, + "loss": 0.283, + "step": 1370 + }, + { + "epoch": 0.3963392664120487, + "grad_norm": 2.657594680786133, + "learning_rate": 6.3550834597875574e-06, + "loss": 0.3157, + "step": 1375 + }, + { + "epoch": 0.3977805001080925, + "grad_norm": 4.369290351867676, + "learning_rate": 6.33990895295903e-06, + "loss": 0.2876, + "step": 1380 + }, + { + "epoch": 0.39922173380413634, + "grad_norm": 1.5138938426971436, + "learning_rate": 6.324734446130502e-06, + "loss": 0.2504, + "step": 1385 + }, + { + "epoch": 0.40066296750018016, + "grad_norm": 2.0192885398864746, + "learning_rate": 6.309559939301973e-06, + "loss": 0.2467, + "step": 1390 + }, + { + "epoch": 0.402104201196224, + "grad_norm": 3.4528374671936035, + "learning_rate": 6.294385432473446e-06, + "loss": 0.2801, + "step": 1395 + }, + { + "epoch": 0.4035454348922678, + "grad_norm": 3.7361555099487305, + "learning_rate": 6.279210925644917e-06, + "loss": 0.2361, + "step": 1400 + }, + { + "epoch": 0.4035454348922678, + "eval_loss": 0.24057632684707642, + "eval_mse": 0.24057634546631015, + "eval_runtime": 3.4722, + "eval_samples_per_second": 288.002, + "eval_steps_per_second": 18.144, + "step": 1400 + }, + { + "epoch": 0.4049866685883116, + "grad_norm": 5.139772891998291, + "learning_rate": 6.264036418816389e-06, + "loss": 0.2905, + "step": 1405 + }, + { + "epoch": 0.40642790228435544, + "grad_norm": 1.7238795757293701, + "learning_rate": 6.2488619119878615e-06, + "loss": 0.2533, + "step": 1410 + }, + { + "epoch": 0.4078691359803992, + "grad_norm": 3.280794620513916, + "learning_rate": 6.233687405159333e-06, + "loss": 0.2654, + "step": 1415 + }, + { + "epoch": 0.409310369676443, + "grad_norm": 4.247371673583984, + "learning_rate": 6.218512898330805e-06, + "loss": 0.2631, + "step": 1420 + }, + { + "epoch": 0.41075160337248684, + "grad_norm": 2.0990066528320312, + "learning_rate": 6.203338391502277e-06, + "loss": 0.2299, + "step": 1425 + }, + { + "epoch": 0.41219283706853066, + "grad_norm": 2.9505622386932373, + "learning_rate": 6.188163884673749e-06, + "loss": 0.3077, + "step": 1430 + }, + { + "epoch": 0.4136340707645745, + "grad_norm": 5.29465913772583, + "learning_rate": 6.172989377845221e-06, + "loss": 0.2661, + "step": 1435 + }, + { + "epoch": 0.4150753044606183, + "grad_norm": 4.061058044433594, + "learning_rate": 6.157814871016693e-06, + "loss": 0.2862, + "step": 1440 + }, + { + "epoch": 0.4165165381566621, + "grad_norm": 2.681509017944336, + "learning_rate": 6.142640364188165e-06, + "loss": 0.2757, + "step": 1445 + }, + { + "epoch": 0.41795777185270594, + "grad_norm": 5.9882493019104, + "learning_rate": 6.127465857359637e-06, + "loss": 0.2501, + "step": 1450 + }, + { + "epoch": 0.4193990055487497, + "grad_norm": 1.7219948768615723, + "learning_rate": 6.112291350531108e-06, + "loss": 0.251, + "step": 1455 + }, + { + "epoch": 0.4208402392447935, + "grad_norm": 3.918973207473755, + "learning_rate": 6.09711684370258e-06, + "loss": 0.2806, + "step": 1460 + }, + { + "epoch": 0.42228147294083734, + "grad_norm": 4.979806900024414, + "learning_rate": 6.081942336874051e-06, + "loss": 0.2602, + "step": 1465 + }, + { + "epoch": 0.42372270663688116, + "grad_norm": 2.2970187664031982, + "learning_rate": 6.0667678300455234e-06, + "loss": 0.2471, + "step": 1470 + }, + { + "epoch": 0.425163940332925, + "grad_norm": 2.741425037384033, + "learning_rate": 6.051593323216996e-06, + "loss": 0.2476, + "step": 1475 + }, + { + "epoch": 0.4266051740289688, + "grad_norm": 2.258983612060547, + "learning_rate": 6.036418816388467e-06, + "loss": 0.2542, + "step": 1480 + }, + { + "epoch": 0.4280464077250126, + "grad_norm": 2.742643356323242, + "learning_rate": 6.021244309559939e-06, + "loss": 0.2677, + "step": 1485 + }, + { + "epoch": 0.42948764142105644, + "grad_norm": 2.7305283546447754, + "learning_rate": 6.006069802731412e-06, + "loss": 0.2645, + "step": 1490 + }, + { + "epoch": 0.43092887511710026, + "grad_norm": 4.1162519454956055, + "learning_rate": 5.990895295902883e-06, + "loss": 0.2201, + "step": 1495 + }, + { + "epoch": 0.432370108813144, + "grad_norm": 2.419027805328369, + "learning_rate": 5.975720789074355e-06, + "loss": 0.2536, + "step": 1500 + }, + { + "epoch": 0.432370108813144, + "eval_loss": 0.245252326130867, + "eval_mse": 0.24525232070405037, + "eval_runtime": 3.5207, + "eval_samples_per_second": 284.032, + "eval_steps_per_second": 17.894, + "step": 1500 + }, + { + "epoch": 0.43381134250918785, + "grad_norm": 1.9734622240066528, + "learning_rate": 5.9605462822458275e-06, + "loss": 0.2538, + "step": 1505 + }, + { + "epoch": 0.43525257620523167, + "grad_norm": 2.8142011165618896, + "learning_rate": 5.945371775417299e-06, + "loss": 0.286, + "step": 1510 + }, + { + "epoch": 0.4366938099012755, + "grad_norm": 3.7075448036193848, + "learning_rate": 5.930197268588771e-06, + "loss": 0.2504, + "step": 1515 + }, + { + "epoch": 0.4381350435973193, + "grad_norm": 3.2373270988464355, + "learning_rate": 5.9150227617602426e-06, + "loss": 0.2605, + "step": 1520 + }, + { + "epoch": 0.4395762772933631, + "grad_norm": 7.530263423919678, + "learning_rate": 5.899848254931715e-06, + "loss": 0.2825, + "step": 1525 + }, + { + "epoch": 0.44101751098940695, + "grad_norm": 4.141541481018066, + "learning_rate": 5.884673748103187e-06, + "loss": 0.2765, + "step": 1530 + }, + { + "epoch": 0.44245874468545077, + "grad_norm": 2.2406415939331055, + "learning_rate": 5.8694992412746585e-06, + "loss": 0.2392, + "step": 1535 + }, + { + "epoch": 0.4438999783814946, + "grad_norm": 5.379973411560059, + "learning_rate": 5.854324734446131e-06, + "loss": 0.2416, + "step": 1540 + }, + { + "epoch": 0.44534121207753835, + "grad_norm": 1.5827113389968872, + "learning_rate": 5.839150227617603e-06, + "loss": 0.2664, + "step": 1545 + }, + { + "epoch": 0.44678244577358217, + "grad_norm": 3.172206401824951, + "learning_rate": 5.823975720789074e-06, + "loss": 0.2655, + "step": 1550 + }, + { + "epoch": 0.448223679469626, + "grad_norm": 1.9343550205230713, + "learning_rate": 5.808801213960547e-06, + "loss": 0.2716, + "step": 1555 + }, + { + "epoch": 0.4496649131656698, + "grad_norm": 1.622921109199524, + "learning_rate": 5.793626707132019e-06, + "loss": 0.2295, + "step": 1560 + }, + { + "epoch": 0.45110614686171363, + "grad_norm": 2.6616415977478027, + "learning_rate": 5.77845220030349e-06, + "loss": 0.2832, + "step": 1565 + }, + { + "epoch": 0.45254738055775745, + "grad_norm": 3.7298784255981445, + "learning_rate": 5.7632776934749625e-06, + "loss": 0.2376, + "step": 1570 + }, + { + "epoch": 0.45398861425380127, + "grad_norm": 2.567563056945801, + "learning_rate": 5.748103186646434e-06, + "loss": 0.2639, + "step": 1575 + }, + { + "epoch": 0.4554298479498451, + "grad_norm": 3.1561615467071533, + "learning_rate": 5.732928679817906e-06, + "loss": 0.2606, + "step": 1580 + }, + { + "epoch": 0.45687108164588885, + "grad_norm": 2.5020530223846436, + "learning_rate": 5.7177541729893784e-06, + "loss": 0.2384, + "step": 1585 + }, + { + "epoch": 0.4583123153419327, + "grad_norm": 1.5679707527160645, + "learning_rate": 5.70257966616085e-06, + "loss": 0.2332, + "step": 1590 + }, + { + "epoch": 0.4597535490379765, + "grad_norm": 1.720269799232483, + "learning_rate": 5.687405159332322e-06, + "loss": 0.2548, + "step": 1595 + }, + { + "epoch": 0.4611947827340203, + "grad_norm": 6.216645240783691, + "learning_rate": 5.672230652503794e-06, + "loss": 0.26, + "step": 1600 + }, + { + "epoch": 0.4611947827340203, + "eval_loss": 0.2568361163139343, + "eval_mse": 0.256836162135005, + "eval_runtime": 3.4453, + "eval_samples_per_second": 290.255, + "eval_steps_per_second": 18.286, + "step": 1600 + }, + { + "epoch": 0.46263601643006413, + "grad_norm": 1.7799232006072998, + "learning_rate": 5.657056145675266e-06, + "loss": 0.2363, + "step": 1605 + }, + { + "epoch": 0.46407725012610795, + "grad_norm": 3.4367353916168213, + "learning_rate": 5.641881638846738e-06, + "loss": 0.2506, + "step": 1610 + }, + { + "epoch": 0.4655184838221518, + "grad_norm": 6.947693824768066, + "learning_rate": 5.62670713201821e-06, + "loss": 0.2813, + "step": 1615 + }, + { + "epoch": 0.4669597175181956, + "grad_norm": 2.6440799236297607, + "learning_rate": 5.611532625189682e-06, + "loss": 0.2371, + "step": 1620 + }, + { + "epoch": 0.4684009512142394, + "grad_norm": 2.7614519596099854, + "learning_rate": 5.596358118361154e-06, + "loss": 0.2631, + "step": 1625 + }, + { + "epoch": 0.4698421849102832, + "grad_norm": 3.076611042022705, + "learning_rate": 5.581183611532625e-06, + "loss": 0.2669, + "step": 1630 + }, + { + "epoch": 0.471283418606327, + "grad_norm": 2.3479678630828857, + "learning_rate": 5.5660091047040976e-06, + "loss": 0.2498, + "step": 1635 + }, + { + "epoch": 0.4727246523023708, + "grad_norm": 3.953763723373413, + "learning_rate": 5.55083459787557e-06, + "loss": 0.2795, + "step": 1640 + }, + { + "epoch": 0.47416588599841464, + "grad_norm": 2.7778728008270264, + "learning_rate": 5.535660091047041e-06, + "loss": 0.2549, + "step": 1645 + }, + { + "epoch": 0.47560711969445846, + "grad_norm": 2.6670873165130615, + "learning_rate": 5.5204855842185135e-06, + "loss": 0.2808, + "step": 1650 + }, + { + "epoch": 0.4770483533905023, + "grad_norm": 2.9756405353546143, + "learning_rate": 5.505311077389986e-06, + "loss": 0.2716, + "step": 1655 + }, + { + "epoch": 0.4784895870865461, + "grad_norm": 2.9402782917022705, + "learning_rate": 5.490136570561457e-06, + "loss": 0.2515, + "step": 1660 + }, + { + "epoch": 0.4799308207825899, + "grad_norm": 3.0504231452941895, + "learning_rate": 5.474962063732929e-06, + "loss": 0.2305, + "step": 1665 + }, + { + "epoch": 0.48137205447863374, + "grad_norm": 2.28179931640625, + "learning_rate": 5.459787556904402e-06, + "loss": 0.2637, + "step": 1670 + }, + { + "epoch": 0.4828132881746775, + "grad_norm": 1.8344733715057373, + "learning_rate": 5.444613050075873e-06, + "loss": 0.261, + "step": 1675 + }, + { + "epoch": 0.4842545218707213, + "grad_norm": 2.4778668880462646, + "learning_rate": 5.429438543247345e-06, + "loss": 0.2681, + "step": 1680 + }, + { + "epoch": 0.48569575556676514, + "grad_norm": 2.5058882236480713, + "learning_rate": 5.414264036418817e-06, + "loss": 0.2576, + "step": 1685 + }, + { + "epoch": 0.48713698926280896, + "grad_norm": 5.368293762207031, + "learning_rate": 5.399089529590289e-06, + "loss": 0.2598, + "step": 1690 + }, + { + "epoch": 0.4885782229588528, + "grad_norm": 2.2240071296691895, + "learning_rate": 5.383915022761761e-06, + "loss": 0.2631, + "step": 1695 + }, + { + "epoch": 0.4900194566548966, + "grad_norm": 2.5634841918945312, + "learning_rate": 5.368740515933233e-06, + "loss": 0.2897, + "step": 1700 + }, + { + "epoch": 0.4900194566548966, + "eval_loss": 0.2567506432533264, + "eval_mse": 0.2567506678728969, + "eval_runtime": 3.488, + "eval_samples_per_second": 286.701, + "eval_steps_per_second": 18.062, + "step": 1700 + }, + { + "epoch": 0.4914606903509404, + "grad_norm": 2.202760696411133, + "learning_rate": 5.353566009104705e-06, + "loss": 0.2736, + "step": 1705 + }, + { + "epoch": 0.49290192404698424, + "grad_norm": 2.0145926475524902, + "learning_rate": 5.338391502276177e-06, + "loss": 0.2326, + "step": 1710 + }, + { + "epoch": 0.494343157743028, + "grad_norm": 3.5262668132781982, + "learning_rate": 5.3232169954476485e-06, + "loss": 0.2561, + "step": 1715 + }, + { + "epoch": 0.4957843914390718, + "grad_norm": 2.2268214225769043, + "learning_rate": 5.308042488619121e-06, + "loss": 0.2755, + "step": 1720 + }, + { + "epoch": 0.49722562513511565, + "grad_norm": 3.3295412063598633, + "learning_rate": 5.292867981790593e-06, + "loss": 0.228, + "step": 1725 + }, + { + "epoch": 0.49866685883115947, + "grad_norm": 4.007626533508301, + "learning_rate": 5.277693474962064e-06, + "loss": 0.2713, + "step": 1730 + }, + { + "epoch": 0.5001080925272033, + "grad_norm": 2.5232491493225098, + "learning_rate": 5.262518968133537e-06, + "loss": 0.2558, + "step": 1735 + }, + { + "epoch": 0.5015493262232471, + "grad_norm": 1.9738813638687134, + "learning_rate": 5.247344461305008e-06, + "loss": 0.2377, + "step": 1740 + }, + { + "epoch": 0.5029905599192909, + "grad_norm": 1.8534965515136719, + "learning_rate": 5.23216995447648e-06, + "loss": 0.2598, + "step": 1745 + }, + { + "epoch": 0.5044317936153347, + "grad_norm": 2.304119110107422, + "learning_rate": 5.2169954476479526e-06, + "loss": 0.2546, + "step": 1750 + }, + { + "epoch": 0.5058730273113785, + "grad_norm": 3.076138734817505, + "learning_rate": 5.201820940819424e-06, + "loss": 0.2757, + "step": 1755 + }, + { + "epoch": 0.5073142610074224, + "grad_norm": 3.0180857181549072, + "learning_rate": 5.186646433990896e-06, + "loss": 0.2987, + "step": 1760 + }, + { + "epoch": 0.5087554947034661, + "grad_norm": 1.7307599782943726, + "learning_rate": 5.1714719271623685e-06, + "loss": 0.2563, + "step": 1765 + }, + { + "epoch": 0.51019672839951, + "grad_norm": 3.3566253185272217, + "learning_rate": 5.15629742033384e-06, + "loss": 0.2677, + "step": 1770 + }, + { + "epoch": 0.5116379620955538, + "grad_norm": 5.262298107147217, + "learning_rate": 5.141122913505312e-06, + "loss": 0.2835, + "step": 1775 + }, + { + "epoch": 0.5130791957915976, + "grad_norm": 2.29787540435791, + "learning_rate": 5.125948406676784e-06, + "loss": 0.278, + "step": 1780 + }, + { + "epoch": 0.5145204294876414, + "grad_norm": 2.0509817600250244, + "learning_rate": 5.110773899848256e-06, + "loss": 0.2633, + "step": 1785 + }, + { + "epoch": 0.5159616631836852, + "grad_norm": 3.3420064449310303, + "learning_rate": 5.095599393019726e-06, + "loss": 0.2354, + "step": 1790 + }, + { + "epoch": 0.5174028968797291, + "grad_norm": 4.251697063446045, + "learning_rate": 5.080424886191199e-06, + "loss": 0.2567, + "step": 1795 + }, + { + "epoch": 0.5188441305757728, + "grad_norm": 1.7263171672821045, + "learning_rate": 5.065250379362671e-06, + "loss": 0.2597, + "step": 1800 + }, + { + "epoch": 0.5188441305757728, + "eval_loss": 0.23586300015449524, + "eval_mse": 0.23586300941184163, + "eval_runtime": 3.5548, + "eval_samples_per_second": 281.313, + "eval_steps_per_second": 17.723, + "step": 1800 + }, + { + "epoch": 0.5202853642718167, + "grad_norm": 2.9523870944976807, + "learning_rate": 5.050075872534142e-06, + "loss": 0.2565, + "step": 1805 + }, + { + "epoch": 0.5217265979678605, + "grad_norm": 2.9274137020111084, + "learning_rate": 5.0349013657056145e-06, + "loss": 0.2499, + "step": 1810 + }, + { + "epoch": 0.5231678316639043, + "grad_norm": 2.106623411178589, + "learning_rate": 5.019726858877087e-06, + "loss": 0.2448, + "step": 1815 + }, + { + "epoch": 0.5246090653599481, + "grad_norm": 2.8787355422973633, + "learning_rate": 5.004552352048558e-06, + "loss": 0.2668, + "step": 1820 + }, + { + "epoch": 0.5260502990559919, + "grad_norm": 2.016101837158203, + "learning_rate": 4.989377845220031e-06, + "loss": 0.2573, + "step": 1825 + }, + { + "epoch": 0.5274915327520358, + "grad_norm": 3.1823604106903076, + "learning_rate": 4.974203338391503e-06, + "loss": 0.2628, + "step": 1830 + }, + { + "epoch": 0.5289327664480795, + "grad_norm": 2.1019415855407715, + "learning_rate": 4.959028831562975e-06, + "loss": 0.2825, + "step": 1835 + }, + { + "epoch": 0.5303740001441234, + "grad_norm": 2.71608829498291, + "learning_rate": 4.943854324734446e-06, + "loss": 0.2495, + "step": 1840 + }, + { + "epoch": 0.5318152338401672, + "grad_norm": 1.8275377750396729, + "learning_rate": 4.9286798179059185e-06, + "loss": 0.2535, + "step": 1845 + }, + { + "epoch": 0.533256467536211, + "grad_norm": 3.9251434803009033, + "learning_rate": 4.913505311077391e-06, + "loss": 0.2881, + "step": 1850 + }, + { + "epoch": 0.5346977012322548, + "grad_norm": 2.2690351009368896, + "learning_rate": 4.898330804248862e-06, + "loss": 0.2495, + "step": 1855 + }, + { + "epoch": 0.5361389349282987, + "grad_norm": 1.780943751335144, + "learning_rate": 4.8831562974203345e-06, + "loss": 0.2658, + "step": 1860 + }, + { + "epoch": 0.5375801686243424, + "grad_norm": 2.1613879203796387, + "learning_rate": 4.867981790591807e-06, + "loss": 0.2688, + "step": 1865 + }, + { + "epoch": 0.5390214023203862, + "grad_norm": 4.357624530792236, + "learning_rate": 4.852807283763278e-06, + "loss": 0.2557, + "step": 1870 + }, + { + "epoch": 0.5404626360164301, + "grad_norm": 3.1877598762512207, + "learning_rate": 4.8376327769347495e-06, + "loss": 0.2637, + "step": 1875 + }, + { + "epoch": 0.5419038697124738, + "grad_norm": 1.587506651878357, + "learning_rate": 4.822458270106222e-06, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.5433451034085177, + "grad_norm": 1.8893779516220093, + "learning_rate": 4.807283763277694e-06, + "loss": 0.2513, + "step": 1885 + }, + { + "epoch": 0.5447863371045615, + "grad_norm": 5.860842704772949, + "learning_rate": 4.792109256449165e-06, + "loss": 0.27, + "step": 1890 + }, + { + "epoch": 0.5462275708006054, + "grad_norm": 1.7181977033615112, + "learning_rate": 4.776934749620638e-06, + "loss": 0.2545, + "step": 1895 + }, + { + "epoch": 0.5476688044966491, + "grad_norm": 2.9068872928619385, + "learning_rate": 4.761760242792109e-06, + "loss": 0.2489, + "step": 1900 + }, + { + "epoch": 0.5476688044966491, + "eval_loss": 0.24126099050045013, + "eval_mse": 0.24126099393393452, + "eval_runtime": 3.4541, + "eval_samples_per_second": 289.514, + "eval_steps_per_second": 18.239, + "step": 1900 + }, + { + "epoch": 0.549110038192693, + "grad_norm": 2.1639840602874756, + "learning_rate": 4.746585735963581e-06, + "loss": 0.2583, + "step": 1905 + }, + { + "epoch": 0.5505512718887368, + "grad_norm": 3.045797824859619, + "learning_rate": 4.731411229135054e-06, + "loss": 0.2373, + "step": 1910 + }, + { + "epoch": 0.5519925055847805, + "grad_norm": 2.1132616996765137, + "learning_rate": 4.716236722306525e-06, + "loss": 0.2742, + "step": 1915 + }, + { + "epoch": 0.5534337392808244, + "grad_norm": 4.257943153381348, + "learning_rate": 4.701062215477997e-06, + "loss": 0.2401, + "step": 1920 + }, + { + "epoch": 0.5548749729768682, + "grad_norm": 1.8180277347564697, + "learning_rate": 4.6858877086494695e-06, + "loss": 0.2211, + "step": 1925 + }, + { + "epoch": 0.556316206672912, + "grad_norm": 1.9830940961837769, + "learning_rate": 4.670713201820941e-06, + "loss": 0.2611, + "step": 1930 + }, + { + "epoch": 0.5577574403689558, + "grad_norm": 1.979367971420288, + "learning_rate": 4.655538694992413e-06, + "loss": 0.2499, + "step": 1935 + }, + { + "epoch": 0.5591986740649997, + "grad_norm": 3.538367748260498, + "learning_rate": 4.640364188163885e-06, + "loss": 0.2495, + "step": 1940 + }, + { + "epoch": 0.5606399077610434, + "grad_norm": 2.1496660709381104, + "learning_rate": 4.625189681335357e-06, + "loss": 0.2233, + "step": 1945 + }, + { + "epoch": 0.5620811414570873, + "grad_norm": 2.313380002975464, + "learning_rate": 4.610015174506829e-06, + "loss": 0.2713, + "step": 1950 + }, + { + "epoch": 0.5635223751531311, + "grad_norm": 1.9493372440338135, + "learning_rate": 4.5948406676783005e-06, + "loss": 0.2745, + "step": 1955 + }, + { + "epoch": 0.5649636088491748, + "grad_norm": 1.78380286693573, + "learning_rate": 4.579666160849773e-06, + "loss": 0.2705, + "step": 1960 + }, + { + "epoch": 0.5664048425452187, + "grad_norm": 5.174985885620117, + "learning_rate": 4.564491654021245e-06, + "loss": 0.2626, + "step": 1965 + }, + { + "epoch": 0.5678460762412625, + "grad_norm": 5.110122203826904, + "learning_rate": 4.549317147192716e-06, + "loss": 0.2482, + "step": 1970 + }, + { + "epoch": 0.5692873099373064, + "grad_norm": 2.324326753616333, + "learning_rate": 4.534142640364189e-06, + "loss": 0.2478, + "step": 1975 + }, + { + "epoch": 0.5707285436333501, + "grad_norm": 2.391932249069214, + "learning_rate": 4.518968133535661e-06, + "loss": 0.2335, + "step": 1980 + }, + { + "epoch": 0.572169777329394, + "grad_norm": 2.217822313308716, + "learning_rate": 4.503793626707132e-06, + "loss": 0.2382, + "step": 1985 + }, + { + "epoch": 0.5736110110254378, + "grad_norm": 3.003229856491089, + "learning_rate": 4.4886191198786045e-06, + "loss": 0.2585, + "step": 1990 + }, + { + "epoch": 0.5750522447214816, + "grad_norm": 3.210510492324829, + "learning_rate": 4.473444613050077e-06, + "loss": 0.2145, + "step": 1995 + }, + { + "epoch": 0.5764934784175254, + "grad_norm": 2.712481737136841, + "learning_rate": 4.458270106221548e-06, + "loss": 0.2376, + "step": 2000 + }, + { + "epoch": 0.5764934784175254, + "eval_loss": 0.24156416952610016, + "eval_mse": 0.2415641685180599, + "eval_runtime": 3.4679, + "eval_samples_per_second": 288.359, + "eval_steps_per_second": 18.167, + "step": 2000 + }, + { + "epoch": 0.5779347121135692, + "grad_norm": 4.5920209884643555, + "learning_rate": 4.44309559939302e-06, + "loss": 0.2551, + "step": 2005 + }, + { + "epoch": 0.579375945809613, + "grad_norm": 2.582946538925171, + "learning_rate": 4.427921092564492e-06, + "loss": 0.2492, + "step": 2010 + }, + { + "epoch": 0.5808171795056568, + "grad_norm": 3.868809700012207, + "learning_rate": 4.412746585735964e-06, + "loss": 0.271, + "step": 2015 + }, + { + "epoch": 0.5822584132017007, + "grad_norm": 2.1130621433258057, + "learning_rate": 4.397572078907436e-06, + "loss": 0.2543, + "step": 2020 + }, + { + "epoch": 0.5836996468977445, + "grad_norm": 3.2280352115631104, + "learning_rate": 4.382397572078908e-06, + "loss": 0.264, + "step": 2025 + }, + { + "epoch": 0.5851408805937883, + "grad_norm": 1.9445719718933105, + "learning_rate": 4.36722306525038e-06, + "loss": 0.2465, + "step": 2030 + }, + { + "epoch": 0.5865821142898321, + "grad_norm": 2.186732292175293, + "learning_rate": 4.352048558421852e-06, + "loss": 0.2687, + "step": 2035 + }, + { + "epoch": 0.5880233479858759, + "grad_norm": 6.876237392425537, + "learning_rate": 4.336874051593324e-06, + "loss": 0.2861, + "step": 2040 + }, + { + "epoch": 0.5894645816819197, + "grad_norm": 4.177600860595703, + "learning_rate": 4.321699544764795e-06, + "loss": 0.2553, + "step": 2045 + }, + { + "epoch": 0.5909058153779635, + "grad_norm": 1.9591648578643799, + "learning_rate": 4.306525037936267e-06, + "loss": 0.2375, + "step": 2050 + }, + { + "epoch": 0.5923470490740074, + "grad_norm": 4.677102565765381, + "learning_rate": 4.2913505311077395e-06, + "loss": 0.264, + "step": 2055 + }, + { + "epoch": 0.5937882827700511, + "grad_norm": 2.2224631309509277, + "learning_rate": 4.276176024279211e-06, + "loss": 0.2485, + "step": 2060 + }, + { + "epoch": 0.595229516466095, + "grad_norm": 2.693962574005127, + "learning_rate": 4.261001517450683e-06, + "loss": 0.2388, + "step": 2065 + }, + { + "epoch": 0.5966707501621388, + "grad_norm": 2.4235777854919434, + "learning_rate": 4.245827010622155e-06, + "loss": 0.2399, + "step": 2070 + }, + { + "epoch": 0.5981119838581826, + "grad_norm": 1.874377965927124, + "learning_rate": 4.230652503793627e-06, + "loss": 0.2326, + "step": 2075 + }, + { + "epoch": 0.5995532175542264, + "grad_norm": 3.6788363456726074, + "learning_rate": 4.215477996965099e-06, + "loss": 0.2338, + "step": 2080 + }, + { + "epoch": 0.6009944512502702, + "grad_norm": 2.06693172454834, + "learning_rate": 4.2003034901365705e-06, + "loss": 0.2347, + "step": 2085 + }, + { + "epoch": 0.602435684946314, + "grad_norm": 4.380660533905029, + "learning_rate": 4.185128983308043e-06, + "loss": 0.2569, + "step": 2090 + }, + { + "epoch": 0.6038769186423578, + "grad_norm": 2.032864809036255, + "learning_rate": 4.169954476479515e-06, + "loss": 0.2471, + "step": 2095 + }, + { + "epoch": 0.6053181523384017, + "grad_norm": 4.096874237060547, + "learning_rate": 4.154779969650986e-06, + "loss": 0.2424, + "step": 2100 + }, + { + "epoch": 0.6053181523384017, + "eval_loss": 0.24177902936935425, + "eval_mse": 0.24177903914719717, + "eval_runtime": 3.4534, + "eval_samples_per_second": 289.57, + "eval_steps_per_second": 18.243, + "step": 2100 + }, + { + "epoch": 0.6067593860344455, + "grad_norm": 2.5046980381011963, + "learning_rate": 4.139605462822459e-06, + "loss": 0.234, + "step": 2105 + }, + { + "epoch": 0.6082006197304893, + "grad_norm": 4.800332069396973, + "learning_rate": 4.124430955993931e-06, + "loss": 0.2742, + "step": 2110 + }, + { + "epoch": 0.6096418534265331, + "grad_norm": 2.0443146228790283, + "learning_rate": 4.109256449165402e-06, + "loss": 0.2408, + "step": 2115 + }, + { + "epoch": 0.611083087122577, + "grad_norm": 5.306743144989014, + "learning_rate": 4.0940819423368746e-06, + "loss": 0.2447, + "step": 2120 + }, + { + "epoch": 0.6125243208186207, + "grad_norm": 2.0920376777648926, + "learning_rate": 4.078907435508346e-06, + "loss": 0.2598, + "step": 2125 + }, + { + "epoch": 0.6139655545146645, + "grad_norm": 2.3251876831054688, + "learning_rate": 4.063732928679818e-06, + "loss": 0.2343, + "step": 2130 + }, + { + "epoch": 0.6154067882107084, + "grad_norm": 1.6690804958343506, + "learning_rate": 4.0485584218512905e-06, + "loss": 0.2488, + "step": 2135 + }, + { + "epoch": 0.6168480219067521, + "grad_norm": 4.557926177978516, + "learning_rate": 4.033383915022762e-06, + "loss": 0.2371, + "step": 2140 + }, + { + "epoch": 0.618289255602796, + "grad_norm": 1.826122760772705, + "learning_rate": 4.018209408194234e-06, + "loss": 0.2562, + "step": 2145 + }, + { + "epoch": 0.6197304892988398, + "grad_norm": 2.284559488296509, + "learning_rate": 4.003034901365706e-06, + "loss": 0.2647, + "step": 2150 + }, + { + "epoch": 0.6211717229948837, + "grad_norm": 1.968127727508545, + "learning_rate": 3.987860394537178e-06, + "loss": 0.24, + "step": 2155 + }, + { + "epoch": 0.6226129566909274, + "grad_norm": 1.8883932828903198, + "learning_rate": 3.97268588770865e-06, + "loss": 0.2508, + "step": 2160 + }, + { + "epoch": 0.6240541903869713, + "grad_norm": 4.677165985107422, + "learning_rate": 3.957511380880122e-06, + "loss": 0.2698, + "step": 2165 + }, + { + "epoch": 0.6254954240830151, + "grad_norm": 3.8323287963867188, + "learning_rate": 3.942336874051594e-06, + "loss": 0.2505, + "step": 2170 + }, + { + "epoch": 0.6269366577790588, + "grad_norm": 2.6321306228637695, + "learning_rate": 3.927162367223066e-06, + "loss": 0.2578, + "step": 2175 + }, + { + "epoch": 0.6283778914751027, + "grad_norm": 4.790838241577148, + "learning_rate": 3.911987860394537e-06, + "loss": 0.2682, + "step": 2180 + }, + { + "epoch": 0.6298191251711465, + "grad_norm": 1.60104501247406, + "learning_rate": 3.89681335356601e-06, + "loss": 0.2172, + "step": 2185 + }, + { + "epoch": 0.6312603588671903, + "grad_norm": 2.6829323768615723, + "learning_rate": 3.881638846737482e-06, + "loss": 0.319, + "step": 2190 + }, + { + "epoch": 0.6327015925632341, + "grad_norm": 5.924467086791992, + "learning_rate": 3.866464339908953e-06, + "loss": 0.2753, + "step": 2195 + }, + { + "epoch": 0.634142826259278, + "grad_norm": 2.6062183380126953, + "learning_rate": 3.8512898330804255e-06, + "loss": 0.2798, + "step": 2200 + }, + { + "epoch": 0.634142826259278, + "eval_loss": 0.2461671233177185, + "eval_mse": 0.24616713661520043, + "eval_runtime": 3.5245, + "eval_samples_per_second": 283.725, + "eval_steps_per_second": 17.875, + "step": 2200 + }, + { + "epoch": 0.6355840599553217, + "grad_norm": 5.319560527801514, + "learning_rate": 3.836115326251897e-06, + "loss": 0.2797, + "step": 2205 + }, + { + "epoch": 0.6370252936513656, + "grad_norm": 2.300398826599121, + "learning_rate": 3.820940819423369e-06, + "loss": 0.2638, + "step": 2210 + }, + { + "epoch": 0.6384665273474094, + "grad_norm": 1.8542537689208984, + "learning_rate": 3.8057663125948406e-06, + "loss": 0.2269, + "step": 2215 + }, + { + "epoch": 0.6399077610434531, + "grad_norm": 2.6789774894714355, + "learning_rate": 3.790591805766313e-06, + "loss": 0.2631, + "step": 2220 + }, + { + "epoch": 0.641348994739497, + "grad_norm": 3.5065345764160156, + "learning_rate": 3.7754172989377846e-06, + "loss": 0.2393, + "step": 2225 + }, + { + "epoch": 0.6427902284355408, + "grad_norm": 5.016166687011719, + "learning_rate": 3.7602427921092565e-06, + "loss": 0.2488, + "step": 2230 + }, + { + "epoch": 0.6442314621315847, + "grad_norm": 2.594754457473755, + "learning_rate": 3.7450682852807287e-06, + "loss": 0.2344, + "step": 2235 + }, + { + "epoch": 0.6456726958276284, + "grad_norm": 2.8543953895568848, + "learning_rate": 3.7298937784522006e-06, + "loss": 0.2477, + "step": 2240 + }, + { + "epoch": 0.6471139295236723, + "grad_norm": 1.9123929738998413, + "learning_rate": 3.7147192716236724e-06, + "loss": 0.274, + "step": 2245 + }, + { + "epoch": 0.6485551632197161, + "grad_norm": 2.7275772094726562, + "learning_rate": 3.699544764795144e-06, + "loss": 0.253, + "step": 2250 + }, + { + "epoch": 0.6499963969157599, + "grad_norm": 2.763127088546753, + "learning_rate": 3.6843702579666165e-06, + "loss": 0.2365, + "step": 2255 + }, + { + "epoch": 0.6514376306118037, + "grad_norm": 2.082650899887085, + "learning_rate": 3.6691957511380883e-06, + "loss": 0.2015, + "step": 2260 + }, + { + "epoch": 0.6528788643078475, + "grad_norm": 2.4569077491760254, + "learning_rate": 3.65402124430956e-06, + "loss": 0.2796, + "step": 2265 + }, + { + "epoch": 0.6543200980038913, + "grad_norm": 1.6337858438491821, + "learning_rate": 3.638846737481032e-06, + "loss": 0.2276, + "step": 2270 + }, + { + "epoch": 0.6557613316999351, + "grad_norm": 3.221034526824951, + "learning_rate": 3.623672230652504e-06, + "loss": 0.2193, + "step": 2275 + }, + { + "epoch": 0.657202565395979, + "grad_norm": 1.661058783531189, + "learning_rate": 3.608497723823976e-06, + "loss": 0.2468, + "step": 2280 + }, + { + "epoch": 0.6586437990920228, + "grad_norm": 3.846362590789795, + "learning_rate": 3.593323216995448e-06, + "loss": 0.2891, + "step": 2285 + }, + { + "epoch": 0.6600850327880666, + "grad_norm": 2.7255539894104004, + "learning_rate": 3.57814871016692e-06, + "loss": 0.2961, + "step": 2290 + }, + { + "epoch": 0.6615262664841104, + "grad_norm": 2.3028368949890137, + "learning_rate": 3.562974203338392e-06, + "loss": 0.2125, + "step": 2295 + }, + { + "epoch": 0.6629675001801542, + "grad_norm": 2.312697649002075, + "learning_rate": 3.5477996965098638e-06, + "loss": 0.2523, + "step": 2300 + }, + { + "epoch": 0.6629675001801542, + "eval_loss": 0.23224954307079315, + "eval_mse": 0.23224954986944796, + "eval_runtime": 3.5059, + "eval_samples_per_second": 285.236, + "eval_steps_per_second": 17.97, + "step": 2300 + }, + { + "epoch": 0.664408733876198, + "grad_norm": 1.403989315032959, + "learning_rate": 3.5326251896813356e-06, + "loss": 0.2127, + "step": 2305 + }, + { + "epoch": 0.6658499675722418, + "grad_norm": 2.4680073261260986, + "learning_rate": 3.517450682852808e-06, + "loss": 0.2631, + "step": 2310 + }, + { + "epoch": 0.6672912012682857, + "grad_norm": 2.680781841278076, + "learning_rate": 3.5022761760242797e-06, + "loss": 0.2498, + "step": 2315 + }, + { + "epoch": 0.6687324349643294, + "grad_norm": 2.0593039989471436, + "learning_rate": 3.4871016691957515e-06, + "loss": 0.2674, + "step": 2320 + }, + { + "epoch": 0.6701736686603733, + "grad_norm": 2.249380350112915, + "learning_rate": 3.4719271623672233e-06, + "loss": 0.2632, + "step": 2325 + }, + { + "epoch": 0.6716149023564171, + "grad_norm": 4.473768711090088, + "learning_rate": 3.4567526555386956e-06, + "loss": 0.274, + "step": 2330 + }, + { + "epoch": 0.673056136052461, + "grad_norm": 2.8323299884796143, + "learning_rate": 3.4415781487101674e-06, + "loss": 0.23, + "step": 2335 + }, + { + "epoch": 0.6744973697485047, + "grad_norm": 1.9655417203903198, + "learning_rate": 3.4264036418816392e-06, + "loss": 0.2741, + "step": 2340 + }, + { + "epoch": 0.6759386034445485, + "grad_norm": 2.6639366149902344, + "learning_rate": 3.4112291350531115e-06, + "loss": 0.2512, + "step": 2345 + }, + { + "epoch": 0.6773798371405924, + "grad_norm": 1.6781830787658691, + "learning_rate": 3.3960546282245833e-06, + "loss": 0.2326, + "step": 2350 + }, + { + "epoch": 0.6788210708366361, + "grad_norm": 2.3150107860565186, + "learning_rate": 3.380880121396055e-06, + "loss": 0.2317, + "step": 2355 + }, + { + "epoch": 0.68026230453268, + "grad_norm": 3.279113292694092, + "learning_rate": 3.365705614567527e-06, + "loss": 0.2542, + "step": 2360 + }, + { + "epoch": 0.6817035382287238, + "grad_norm": 1.7288917303085327, + "learning_rate": 3.350531107738999e-06, + "loss": 0.2458, + "step": 2365 + }, + { + "epoch": 0.6831447719247676, + "grad_norm": 1.6974340677261353, + "learning_rate": 3.3353566009104706e-06, + "loss": 0.2632, + "step": 2370 + }, + { + "epoch": 0.6845860056208114, + "grad_norm": 3.335235595703125, + "learning_rate": 3.3201820940819424e-06, + "loss": 0.2753, + "step": 2375 + }, + { + "epoch": 0.6860272393168553, + "grad_norm": 2.5615906715393066, + "learning_rate": 3.3050075872534143e-06, + "loss": 0.2435, + "step": 2380 + }, + { + "epoch": 0.687468473012899, + "grad_norm": 1.9486761093139648, + "learning_rate": 3.289833080424886e-06, + "loss": 0.2571, + "step": 2385 + }, + { + "epoch": 0.6889097067089428, + "grad_norm": 3.594895124435425, + "learning_rate": 3.2746585735963583e-06, + "loss": 0.2338, + "step": 2390 + }, + { + "epoch": 0.6903509404049867, + "grad_norm": 2.6904447078704834, + "learning_rate": 3.25948406676783e-06, + "loss": 0.2471, + "step": 2395 + }, + { + "epoch": 0.6917921741010304, + "grad_norm": 2.230098009109497, + "learning_rate": 3.244309559939302e-06, + "loss": 0.286, + "step": 2400 + }, + { + "epoch": 0.6917921741010304, + "eval_loss": 0.2431608885526657, + "eval_mse": 0.2431609066054225, + "eval_runtime": 3.4991, + "eval_samples_per_second": 285.789, + "eval_steps_per_second": 18.005, + "step": 2400 + }, + { + "epoch": 0.6932334077970743, + "grad_norm": 2.067410945892334, + "learning_rate": 3.229135053110774e-06, + "loss": 0.2474, + "step": 2405 + }, + { + "epoch": 0.6946746414931181, + "grad_norm": 2.398256540298462, + "learning_rate": 3.213960546282246e-06, + "loss": 0.2429, + "step": 2410 + }, + { + "epoch": 0.696115875189162, + "grad_norm": 3.5365965366363525, + "learning_rate": 3.198786039453718e-06, + "loss": 0.2478, + "step": 2415 + }, + { + "epoch": 0.6975571088852057, + "grad_norm": 2.981490135192871, + "learning_rate": 3.1836115326251897e-06, + "loss": 0.237, + "step": 2420 + }, + { + "epoch": 0.6989983425812496, + "grad_norm": 2.0100955963134766, + "learning_rate": 3.168437025796662e-06, + "loss": 0.2653, + "step": 2425 + }, + { + "epoch": 0.7004395762772934, + "grad_norm": 3.208611488342285, + "learning_rate": 3.153262518968134e-06, + "loss": 0.2621, + "step": 2430 + }, + { + "epoch": 0.7018808099733371, + "grad_norm": 3.3685500621795654, + "learning_rate": 3.1380880121396056e-06, + "loss": 0.2764, + "step": 2435 + }, + { + "epoch": 0.703322043669381, + "grad_norm": 1.9647629261016846, + "learning_rate": 3.1229135053110775e-06, + "loss": 0.2493, + "step": 2440 + }, + { + "epoch": 0.7047632773654248, + "grad_norm": 1.7301753759384155, + "learning_rate": 3.1077389984825497e-06, + "loss": 0.2773, + "step": 2445 + }, + { + "epoch": 0.7062045110614686, + "grad_norm": 2.260758638381958, + "learning_rate": 3.0925644916540215e-06, + "loss": 0.2487, + "step": 2450 + }, + { + "epoch": 0.7076457447575124, + "grad_norm": 1.7644176483154297, + "learning_rate": 3.0773899848254934e-06, + "loss": 0.27, + "step": 2455 + }, + { + "epoch": 0.7090869784535563, + "grad_norm": 2.1906681060791016, + "learning_rate": 3.062215477996965e-06, + "loss": 0.2653, + "step": 2460 + }, + { + "epoch": 0.7105282121496, + "grad_norm": 1.6636836528778076, + "learning_rate": 3.0470409711684375e-06, + "loss": 0.22, + "step": 2465 + }, + { + "epoch": 0.7119694458456439, + "grad_norm": 1.9646042585372925, + "learning_rate": 3.0318664643399093e-06, + "loss": 0.2279, + "step": 2470 + }, + { + "epoch": 0.7134106795416877, + "grad_norm": 3.8436663150787354, + "learning_rate": 3.016691957511381e-06, + "loss": 0.2675, + "step": 2475 + }, + { + "epoch": 0.7148519132377315, + "grad_norm": 1.7162784337997437, + "learning_rate": 3.0015174506828534e-06, + "loss": 0.2424, + "step": 2480 + }, + { + "epoch": 0.7162931469337753, + "grad_norm": 7.108310699462891, + "learning_rate": 2.986342943854325e-06, + "loss": 0.2585, + "step": 2485 + }, + { + "epoch": 0.7177343806298191, + "grad_norm": 1.4513427019119263, + "learning_rate": 2.971168437025797e-06, + "loss": 0.2531, + "step": 2490 + }, + { + "epoch": 0.719175614325863, + "grad_norm": 3.262995719909668, + "learning_rate": 2.955993930197269e-06, + "loss": 0.2407, + "step": 2495 + }, + { + "epoch": 0.7206168480219067, + "grad_norm": 2.73363995552063, + "learning_rate": 2.940819423368741e-06, + "loss": 0.247, + "step": 2500 + }, + { + "epoch": 0.7206168480219067, + "eval_loss": 0.23825927078723907, + "eval_mse": 0.23825928315892816, + "eval_runtime": 3.4721, + "eval_samples_per_second": 288.01, + "eval_steps_per_second": 18.145, + "step": 2500 + }, + { + "epoch": 0.7220580817179506, + "grad_norm": 2.238675594329834, + "learning_rate": 2.925644916540213e-06, + "loss": 0.2472, + "step": 2505 + }, + { + "epoch": 0.7234993154139944, + "grad_norm": 2.1989986896514893, + "learning_rate": 2.9104704097116847e-06, + "loss": 0.251, + "step": 2510 + }, + { + "epoch": 0.7249405491100382, + "grad_norm": 1.8326098918914795, + "learning_rate": 2.8952959028831566e-06, + "loss": 0.2885, + "step": 2515 + }, + { + "epoch": 0.726381782806082, + "grad_norm": 4.506540775299072, + "learning_rate": 2.880121396054629e-06, + "loss": 0.2509, + "step": 2520 + }, + { + "epoch": 0.7278230165021258, + "grad_norm": 2.287397623062134, + "learning_rate": 2.8649468892261007e-06, + "loss": 0.2502, + "step": 2525 + }, + { + "epoch": 0.7292642501981697, + "grad_norm": 1.5754716396331787, + "learning_rate": 2.849772382397572e-06, + "loss": 0.2345, + "step": 2530 + }, + { + "epoch": 0.7307054838942134, + "grad_norm": 2.309974193572998, + "learning_rate": 2.834597875569044e-06, + "loss": 0.2249, + "step": 2535 + }, + { + "epoch": 0.7321467175902573, + "grad_norm": 2.3784449100494385, + "learning_rate": 2.8194233687405157e-06, + "loss": 0.2505, + "step": 2540 + }, + { + "epoch": 0.733587951286301, + "grad_norm": 1.9537062644958496, + "learning_rate": 2.804248861911988e-06, + "loss": 0.2277, + "step": 2545 + }, + { + "epoch": 0.7350291849823449, + "grad_norm": 2.5582821369171143, + "learning_rate": 2.78907435508346e-06, + "loss": 0.2349, + "step": 2550 + }, + { + "epoch": 0.7364704186783887, + "grad_norm": 3.8102619647979736, + "learning_rate": 2.7738998482549316e-06, + "loss": 0.223, + "step": 2555 + }, + { + "epoch": 0.7379116523744326, + "grad_norm": 2.270439624786377, + "learning_rate": 2.758725341426404e-06, + "loss": 0.2737, + "step": 2560 + }, + { + "epoch": 0.7393528860704763, + "grad_norm": 1.8109246492385864, + "learning_rate": 2.7435508345978757e-06, + "loss": 0.2288, + "step": 2565 + }, + { + "epoch": 0.7407941197665201, + "grad_norm": 3.8969645500183105, + "learning_rate": 2.7283763277693475e-06, + "loss": 0.2474, + "step": 2570 + }, + { + "epoch": 0.742235353462564, + "grad_norm": 3.1700432300567627, + "learning_rate": 2.7132018209408194e-06, + "loss": 0.2739, + "step": 2575 + }, + { + "epoch": 0.7436765871586077, + "grad_norm": 1.7024991512298584, + "learning_rate": 2.6980273141122916e-06, + "loss": 0.2735, + "step": 2580 + }, + { + "epoch": 0.7451178208546516, + "grad_norm": 3.1429338455200195, + "learning_rate": 2.6828528072837634e-06, + "loss": 0.2599, + "step": 2585 + }, + { + "epoch": 0.7465590545506954, + "grad_norm": 2.875809907913208, + "learning_rate": 2.6676783004552353e-06, + "loss": 0.2446, + "step": 2590 + }, + { + "epoch": 0.7480002882467393, + "grad_norm": 2.0145938396453857, + "learning_rate": 2.652503793626707e-06, + "loss": 0.2458, + "step": 2595 + }, + { + "epoch": 0.749441521942783, + "grad_norm": 2.050328493118286, + "learning_rate": 2.6373292867981793e-06, + "loss": 0.2856, + "step": 2600 + }, + { + "epoch": 0.749441521942783, + "eval_loss": 0.23749777674674988, + "eval_mse": 0.2374977778196335, + "eval_runtime": 3.4587, + "eval_samples_per_second": 289.127, + "eval_steps_per_second": 18.215, + "step": 2600 + }, + { + "epoch": 0.7508827556388268, + "grad_norm": 2.5174169540405273, + "learning_rate": 2.622154779969651e-06, + "loss": 0.2519, + "step": 2605 + }, + { + "epoch": 0.7523239893348707, + "grad_norm": 4.114897727966309, + "learning_rate": 2.606980273141123e-06, + "loss": 0.2592, + "step": 2610 + }, + { + "epoch": 0.7537652230309144, + "grad_norm": 1.5561412572860718, + "learning_rate": 2.5918057663125952e-06, + "loss": 0.2474, + "step": 2615 + }, + { + "epoch": 0.7552064567269583, + "grad_norm": 2.4157848358154297, + "learning_rate": 2.576631259484067e-06, + "loss": 0.2643, + "step": 2620 + }, + { + "epoch": 0.7566476904230021, + "grad_norm": 2.7704153060913086, + "learning_rate": 2.561456752655539e-06, + "loss": 0.2305, + "step": 2625 + }, + { + "epoch": 0.7580889241190459, + "grad_norm": 3.1055073738098145, + "learning_rate": 2.5462822458270107e-06, + "loss": 0.2451, + "step": 2630 + }, + { + "epoch": 0.7595301578150897, + "grad_norm": 1.9698563814163208, + "learning_rate": 2.531107738998483e-06, + "loss": 0.2292, + "step": 2635 + }, + { + "epoch": 0.7609713915111336, + "grad_norm": 1.9055688381195068, + "learning_rate": 2.515933232169955e-06, + "loss": 0.2681, + "step": 2640 + }, + { + "epoch": 0.7624126252071773, + "grad_norm": 3.620293617248535, + "learning_rate": 2.5007587253414266e-06, + "loss": 0.2469, + "step": 2645 + }, + { + "epoch": 0.7638538589032211, + "grad_norm": 2.351473569869995, + "learning_rate": 2.4855842185128985e-06, + "loss": 0.2612, + "step": 2650 + }, + { + "epoch": 0.765295092599265, + "grad_norm": 2.991499185562134, + "learning_rate": 2.4704097116843703e-06, + "loss": 0.2513, + "step": 2655 + }, + { + "epoch": 0.7667363262953087, + "grad_norm": 3.2980542182922363, + "learning_rate": 2.455235204855842e-06, + "loss": 0.2249, + "step": 2660 + }, + { + "epoch": 0.7681775599913526, + "grad_norm": 1.7889645099639893, + "learning_rate": 2.4400606980273144e-06, + "loss": 0.2566, + "step": 2665 + }, + { + "epoch": 0.7696187936873964, + "grad_norm": 2.3199574947357178, + "learning_rate": 2.424886191198786e-06, + "loss": 0.2419, + "step": 2670 + }, + { + "epoch": 0.7710600273834403, + "grad_norm": 1.5038098096847534, + "learning_rate": 2.409711684370258e-06, + "loss": 0.2308, + "step": 2675 + }, + { + "epoch": 0.772501261079484, + "grad_norm": 4.429576396942139, + "learning_rate": 2.39453717754173e-06, + "loss": 0.2702, + "step": 2680 + }, + { + "epoch": 0.7739424947755279, + "grad_norm": 3.6350767612457275, + "learning_rate": 2.379362670713202e-06, + "loss": 0.2343, + "step": 2685 + }, + { + "epoch": 0.7753837284715717, + "grad_norm": 2.1402158737182617, + "learning_rate": 2.364188163884674e-06, + "loss": 0.265, + "step": 2690 + }, + { + "epoch": 0.7768249621676154, + "grad_norm": 3.15134334564209, + "learning_rate": 2.3490136570561458e-06, + "loss": 0.2433, + "step": 2695 + }, + { + "epoch": 0.7782661958636593, + "grad_norm": 2.184305429458618, + "learning_rate": 2.333839150227618e-06, + "loss": 0.2216, + "step": 2700 + }, + { + "epoch": 0.7782661958636593, + "eval_loss": 0.2383294701576233, + "eval_mse": 0.23832946814969183, + "eval_runtime": 3.7811, + "eval_samples_per_second": 264.471, + "eval_steps_per_second": 16.662, + "step": 2700 + }, + { + "epoch": 0.7797074295597031, + "grad_norm": 3.330735206604004, + "learning_rate": 2.31866464339909e-06, + "loss": 0.2463, + "step": 2705 + }, + { + "epoch": 0.781148663255747, + "grad_norm": 3.6282222270965576, + "learning_rate": 2.3034901365705617e-06, + "loss": 0.2269, + "step": 2710 + }, + { + "epoch": 0.7825898969517907, + "grad_norm": 3.381441593170166, + "learning_rate": 2.2883156297420335e-06, + "loss": 0.2783, + "step": 2715 + }, + { + "epoch": 0.7840311306478346, + "grad_norm": 2.2831079959869385, + "learning_rate": 2.2731411229135057e-06, + "loss": 0.2633, + "step": 2720 + }, + { + "epoch": 0.7854723643438783, + "grad_norm": 3.28764271736145, + "learning_rate": 2.2579666160849776e-06, + "loss": 0.2424, + "step": 2725 + }, + { + "epoch": 0.7869135980399222, + "grad_norm": 5.361327648162842, + "learning_rate": 2.2427921092564494e-06, + "loss": 0.2379, + "step": 2730 + }, + { + "epoch": 0.788354831735966, + "grad_norm": 2.6280875205993652, + "learning_rate": 2.2276176024279212e-06, + "loss": 0.2405, + "step": 2735 + }, + { + "epoch": 0.7897960654320098, + "grad_norm": 2.9547958374023438, + "learning_rate": 2.212443095599393e-06, + "loss": 0.2531, + "step": 2740 + }, + { + "epoch": 0.7912372991280536, + "grad_norm": 2.026799201965332, + "learning_rate": 2.197268588770865e-06, + "loss": 0.2267, + "step": 2745 + }, + { + "epoch": 0.7926785328240974, + "grad_norm": 4.866410732269287, + "learning_rate": 2.182094081942337e-06, + "loss": 0.2465, + "step": 2750 + }, + { + "epoch": 0.7941197665201413, + "grad_norm": 1.9076712131500244, + "learning_rate": 2.166919575113809e-06, + "loss": 0.2622, + "step": 2755 + }, + { + "epoch": 0.795561000216185, + "grad_norm": 3.61765193939209, + "learning_rate": 2.1517450682852808e-06, + "loss": 0.2348, + "step": 2760 + }, + { + "epoch": 0.7970022339122289, + "grad_norm": 2.6565895080566406, + "learning_rate": 2.1365705614567526e-06, + "loss": 0.262, + "step": 2765 + }, + { + "epoch": 0.7984434676082727, + "grad_norm": 2.1399002075195312, + "learning_rate": 2.121396054628225e-06, + "loss": 0.2507, + "step": 2770 + }, + { + "epoch": 0.7998847013043165, + "grad_norm": 2.7440185546875, + "learning_rate": 2.1062215477996967e-06, + "loss": 0.2274, + "step": 2775 + }, + { + "epoch": 0.8013259350003603, + "grad_norm": 1.979787826538086, + "learning_rate": 2.0910470409711685e-06, + "loss": 0.2485, + "step": 2780 + }, + { + "epoch": 0.8027671686964041, + "grad_norm": 1.6057958602905273, + "learning_rate": 2.0758725341426408e-06, + "loss": 0.244, + "step": 2785 + }, + { + "epoch": 0.804208402392448, + "grad_norm": 1.937915563583374, + "learning_rate": 2.0606980273141126e-06, + "loss": 0.2574, + "step": 2790 + }, + { + "epoch": 0.8056496360884917, + "grad_norm": 2.035689115524292, + "learning_rate": 2.0455235204855844e-06, + "loss": 0.2744, + "step": 2795 + }, + { + "epoch": 0.8070908697845356, + "grad_norm": 2.3180646896362305, + "learning_rate": 2.0303490136570563e-06, + "loss": 0.255, + "step": 2800 + }, + { + "epoch": 0.8070908697845356, + "eval_loss": 0.2366907298564911, + "eval_mse": 0.23669074228499085, + "eval_runtime": 3.4073, + "eval_samples_per_second": 293.49, + "eval_steps_per_second": 18.49, + "step": 2800 + }, + { + "epoch": 0.8085321034805794, + "grad_norm": 1.9396241903305054, + "learning_rate": 2.0151745068285285e-06, + "loss": 0.2309, + "step": 2805 + }, + { + "epoch": 0.8099733371766232, + "grad_norm": 4.935720443725586, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.2523, + "step": 2810 + }, + { + "epoch": 0.811414570872667, + "grad_norm": 2.1301615238189697, + "learning_rate": 1.984825493171472e-06, + "loss": 0.2448, + "step": 2815 + }, + { + "epoch": 0.8128558045687109, + "grad_norm": 2.9700865745544434, + "learning_rate": 1.969650986342944e-06, + "loss": 0.239, + "step": 2820 + }, + { + "epoch": 0.8142970382647546, + "grad_norm": 2.1401546001434326, + "learning_rate": 1.954476479514416e-06, + "loss": 0.2349, + "step": 2825 + }, + { + "epoch": 0.8157382719607984, + "grad_norm": 3.954643726348877, + "learning_rate": 1.9393019726858876e-06, + "loss": 0.2609, + "step": 2830 + }, + { + "epoch": 0.8171795056568423, + "grad_norm": 1.8797838687896729, + "learning_rate": 1.92412746585736e-06, + "loss": 0.2492, + "step": 2835 + }, + { + "epoch": 0.818620739352886, + "grad_norm": 2.2840428352355957, + "learning_rate": 1.9089529590288317e-06, + "loss": 0.2382, + "step": 2840 + }, + { + "epoch": 0.8200619730489299, + "grad_norm": 1.9538135528564453, + "learning_rate": 1.8937784522003035e-06, + "loss": 0.2307, + "step": 2845 + }, + { + "epoch": 0.8215032067449737, + "grad_norm": 3.3001744747161865, + "learning_rate": 1.8786039453717756e-06, + "loss": 0.2311, + "step": 2850 + }, + { + "epoch": 0.8229444404410176, + "grad_norm": 2.6055421829223633, + "learning_rate": 1.8634294385432474e-06, + "loss": 0.2732, + "step": 2855 + }, + { + "epoch": 0.8243856741370613, + "grad_norm": 4.642452716827393, + "learning_rate": 1.8482549317147195e-06, + "loss": 0.2642, + "step": 2860 + }, + { + "epoch": 0.8258269078331051, + "grad_norm": 1.3938387632369995, + "learning_rate": 1.8330804248861913e-06, + "loss": 0.2478, + "step": 2865 + }, + { + "epoch": 0.827268141529149, + "grad_norm": 2.2278647422790527, + "learning_rate": 1.8179059180576633e-06, + "loss": 0.2533, + "step": 2870 + }, + { + "epoch": 0.8287093752251927, + "grad_norm": 4.208284378051758, + "learning_rate": 1.8027314112291352e-06, + "loss": 0.2449, + "step": 2875 + }, + { + "epoch": 0.8301506089212366, + "grad_norm": 1.990187644958496, + "learning_rate": 1.7875569044006072e-06, + "loss": 0.2511, + "step": 2880 + }, + { + "epoch": 0.8315918426172804, + "grad_norm": 1.8299331665039062, + "learning_rate": 1.7723823975720792e-06, + "loss": 0.2484, + "step": 2885 + }, + { + "epoch": 0.8330330763133242, + "grad_norm": 2.172791004180908, + "learning_rate": 1.757207890743551e-06, + "loss": 0.2365, + "step": 2890 + }, + { + "epoch": 0.834474310009368, + "grad_norm": 1.920158863067627, + "learning_rate": 1.742033383915023e-06, + "loss": 0.2335, + "step": 2895 + }, + { + "epoch": 0.8359155437054119, + "grad_norm": 1.844710111618042, + "learning_rate": 1.726858877086495e-06, + "loss": 0.2406, + "step": 2900 + }, + { + "epoch": 0.8359155437054119, + "eval_loss": 0.2344696968793869, + "eval_mse": 0.2344697018004954, + "eval_runtime": 3.4794, + "eval_samples_per_second": 287.41, + "eval_steps_per_second": 18.107, + "step": 2900 + }, + { + "epoch": 0.8373567774014556, + "grad_norm": 1.9122151136398315, + "learning_rate": 1.7116843702579665e-06, + "loss": 0.2462, + "step": 2905 + }, + { + "epoch": 0.8387980110974994, + "grad_norm": 1.5618327856063843, + "learning_rate": 1.6965098634294386e-06, + "loss": 0.2413, + "step": 2910 + }, + { + "epoch": 0.8402392447935433, + "grad_norm": 3.7296929359436035, + "learning_rate": 1.6813353566009106e-06, + "loss": 0.2405, + "step": 2915 + }, + { + "epoch": 0.841680478489587, + "grad_norm": 3.897550106048584, + "learning_rate": 1.6661608497723824e-06, + "loss": 0.2833, + "step": 2920 + }, + { + "epoch": 0.8431217121856309, + "grad_norm": 2.775019645690918, + "learning_rate": 1.6509863429438545e-06, + "loss": 0.2083, + "step": 2925 + }, + { + "epoch": 0.8445629458816747, + "grad_norm": 1.737260341644287, + "learning_rate": 1.6358118361153263e-06, + "loss": 0.2535, + "step": 2930 + }, + { + "epoch": 0.8460041795777186, + "grad_norm": 2.2079930305480957, + "learning_rate": 1.6206373292867984e-06, + "loss": 0.2649, + "step": 2935 + }, + { + "epoch": 0.8474454132737623, + "grad_norm": 2.6033072471618652, + "learning_rate": 1.6054628224582702e-06, + "loss": 0.2324, + "step": 2940 + }, + { + "epoch": 0.8488866469698062, + "grad_norm": 2.3140974044799805, + "learning_rate": 1.5902883156297422e-06, + "loss": 0.2314, + "step": 2945 + }, + { + "epoch": 0.85032788066585, + "grad_norm": 1.9832013845443726, + "learning_rate": 1.575113808801214e-06, + "loss": 0.2709, + "step": 2950 + }, + { + "epoch": 0.8517691143618937, + "grad_norm": 2.839756727218628, + "learning_rate": 1.559939301972686e-06, + "loss": 0.2349, + "step": 2955 + }, + { + "epoch": 0.8532103480579376, + "grad_norm": 2.0620322227478027, + "learning_rate": 1.544764795144158e-06, + "loss": 0.2697, + "step": 2960 + }, + { + "epoch": 0.8546515817539814, + "grad_norm": 2.1879138946533203, + "learning_rate": 1.52959028831563e-06, + "loss": 0.2415, + "step": 2965 + }, + { + "epoch": 0.8560928154500252, + "grad_norm": 2.201324224472046, + "learning_rate": 1.514415781487102e-06, + "loss": 0.2469, + "step": 2970 + }, + { + "epoch": 0.857534049146069, + "grad_norm": 1.4632495641708374, + "learning_rate": 1.4992412746585738e-06, + "loss": 0.2194, + "step": 2975 + }, + { + "epoch": 0.8589752828421129, + "grad_norm": 1.9901318550109863, + "learning_rate": 1.4840667678300459e-06, + "loss": 0.2597, + "step": 2980 + }, + { + "epoch": 0.8604165165381567, + "grad_norm": 1.8942216634750366, + "learning_rate": 1.4688922610015175e-06, + "loss": 0.2481, + "step": 2985 + }, + { + "epoch": 0.8618577502342005, + "grad_norm": 1.9384193420410156, + "learning_rate": 1.4537177541729893e-06, + "loss": 0.2399, + "step": 2990 + }, + { + "epoch": 0.8632989839302443, + "grad_norm": 1.9823272228240967, + "learning_rate": 1.4385432473444613e-06, + "loss": 0.2001, + "step": 2995 + }, + { + "epoch": 0.864740217626288, + "grad_norm": 4.13219690322876, + "learning_rate": 1.4233687405159332e-06, + "loss": 0.2388, + "step": 3000 + }, + { + "epoch": 0.864740217626288, + "eval_loss": 0.22815744578838348, + "eval_mse": 0.22815744203701616, + "eval_runtime": 3.6856, + "eval_samples_per_second": 271.329, + "eval_steps_per_second": 17.094, + "step": 3000 + }, + { + "epoch": 0.8661814513223319, + "grad_norm": 2.3792169094085693, + "learning_rate": 1.4081942336874052e-06, + "loss": 0.2651, + "step": 3005 + }, + { + "epoch": 0.8676226850183757, + "grad_norm": 1.8244715929031372, + "learning_rate": 1.3930197268588772e-06, + "loss": 0.2437, + "step": 3010 + }, + { + "epoch": 0.8690639187144196, + "grad_norm": 2.443810224533081, + "learning_rate": 1.377845220030349e-06, + "loss": 0.2527, + "step": 3015 + }, + { + "epoch": 0.8705051524104633, + "grad_norm": 2.1576616764068604, + "learning_rate": 1.3626707132018211e-06, + "loss": 0.2713, + "step": 3020 + }, + { + "epoch": 0.8719463861065072, + "grad_norm": 2.210902452468872, + "learning_rate": 1.347496206373293e-06, + "loss": 0.238, + "step": 3025 + }, + { + "epoch": 0.873387619802551, + "grad_norm": 2.990591049194336, + "learning_rate": 1.332321699544765e-06, + "loss": 0.2217, + "step": 3030 + }, + { + "epoch": 0.8748288534985948, + "grad_norm": 1.5626643896102905, + "learning_rate": 1.3171471927162368e-06, + "loss": 0.2387, + "step": 3035 + }, + { + "epoch": 0.8762700871946386, + "grad_norm": 1.5288246870040894, + "learning_rate": 1.3019726858877088e-06, + "loss": 0.2163, + "step": 3040 + }, + { + "epoch": 0.8777113208906824, + "grad_norm": 3.954557180404663, + "learning_rate": 1.2867981790591807e-06, + "loss": 0.252, + "step": 3045 + }, + { + "epoch": 0.8791525545867263, + "grad_norm": 5.214181900024414, + "learning_rate": 1.2716236722306527e-06, + "loss": 0.2403, + "step": 3050 + }, + { + "epoch": 0.88059378828277, + "grad_norm": 2.6781516075134277, + "learning_rate": 1.2564491654021245e-06, + "loss": 0.2493, + "step": 3055 + }, + { + "epoch": 0.8820350219788139, + "grad_norm": 4.418315410614014, + "learning_rate": 1.2412746585735964e-06, + "loss": 0.2694, + "step": 3060 + }, + { + "epoch": 0.8834762556748577, + "grad_norm": 2.1344754695892334, + "learning_rate": 1.2261001517450684e-06, + "loss": 0.2676, + "step": 3065 + }, + { + "epoch": 0.8849174893709015, + "grad_norm": 2.3457322120666504, + "learning_rate": 1.2109256449165402e-06, + "loss": 0.2516, + "step": 3070 + }, + { + "epoch": 0.8863587230669453, + "grad_norm": 1.9455342292785645, + "learning_rate": 1.1957511380880123e-06, + "loss": 0.2513, + "step": 3075 + }, + { + "epoch": 0.8877999567629892, + "grad_norm": 3.034874439239502, + "learning_rate": 1.1805766312594843e-06, + "loss": 0.2651, + "step": 3080 + }, + { + "epoch": 0.8892411904590329, + "grad_norm": 3.1415164470672607, + "learning_rate": 1.1654021244309561e-06, + "loss": 0.2494, + "step": 3085 + }, + { + "epoch": 0.8906824241550767, + "grad_norm": 2.136166572570801, + "learning_rate": 1.150227617602428e-06, + "loss": 0.2577, + "step": 3090 + }, + { + "epoch": 0.8921236578511206, + "grad_norm": 2.6537928581237793, + "learning_rate": 1.1350531107738998e-06, + "loss": 0.2432, + "step": 3095 + }, + { + "epoch": 0.8935648915471643, + "grad_norm": 1.8011763095855713, + "learning_rate": 1.1198786039453718e-06, + "loss": 0.2571, + "step": 3100 + }, + { + "epoch": 0.8935648915471643, + "eval_loss": 0.23310115933418274, + "eval_mse": 0.23310116421058774, + "eval_runtime": 3.4187, + "eval_samples_per_second": 292.511, + "eval_steps_per_second": 18.428, + "step": 3100 + }, + { + "epoch": 0.8950061252432082, + "grad_norm": 1.5741316080093384, + "learning_rate": 1.1047040971168439e-06, + "loss": 0.223, + "step": 3105 + }, + { + "epoch": 0.896447358939252, + "grad_norm": 1.5244059562683105, + "learning_rate": 1.0895295902883157e-06, + "loss": 0.2428, + "step": 3110 + }, + { + "epoch": 0.8978885926352959, + "grad_norm": 1.8226120471954346, + "learning_rate": 1.0743550834597877e-06, + "loss": 0.2548, + "step": 3115 + }, + { + "epoch": 0.8993298263313396, + "grad_norm": 2.6525444984436035, + "learning_rate": 1.0591805766312596e-06, + "loss": 0.2354, + "step": 3120 + }, + { + "epoch": 0.9007710600273834, + "grad_norm": 3.4096426963806152, + "learning_rate": 1.0440060698027316e-06, + "loss": 0.2535, + "step": 3125 + }, + { + "epoch": 0.9022122937234273, + "grad_norm": 1.6631042957305908, + "learning_rate": 1.0288315629742034e-06, + "loss": 0.2549, + "step": 3130 + }, + { + "epoch": 0.903653527419471, + "grad_norm": 4.044405460357666, + "learning_rate": 1.0136570561456753e-06, + "loss": 0.2438, + "step": 3135 + }, + { + "epoch": 0.9050947611155149, + "grad_norm": 2.1548964977264404, + "learning_rate": 9.984825493171473e-07, + "loss": 0.2171, + "step": 3140 + }, + { + "epoch": 0.9065359948115587, + "grad_norm": 2.5267910957336426, + "learning_rate": 9.833080424886191e-07, + "loss": 0.2341, + "step": 3145 + }, + { + "epoch": 0.9079772285076025, + "grad_norm": 1.7925020456314087, + "learning_rate": 9.681335356600912e-07, + "loss": 0.2653, + "step": 3150 + }, + { + "epoch": 0.9094184622036463, + "grad_norm": 2.5686533451080322, + "learning_rate": 9.529590288315631e-07, + "loss": 0.2671, + "step": 3155 + }, + { + "epoch": 0.9108596958996902, + "grad_norm": 1.508566975593567, + "learning_rate": 9.37784522003035e-07, + "loss": 0.2554, + "step": 3160 + }, + { + "epoch": 0.912300929595734, + "grad_norm": 2.414174795150757, + "learning_rate": 9.22610015174507e-07, + "loss": 0.242, + "step": 3165 + }, + { + "epoch": 0.9137421632917777, + "grad_norm": 2.007922887802124, + "learning_rate": 9.074355083459788e-07, + "loss": 0.2442, + "step": 3170 + }, + { + "epoch": 0.9151833969878216, + "grad_norm": 2.534485340118408, + "learning_rate": 8.922610015174507e-07, + "loss": 0.2478, + "step": 3175 + }, + { + "epoch": 0.9166246306838653, + "grad_norm": 2.9684510231018066, + "learning_rate": 8.770864946889227e-07, + "loss": 0.2607, + "step": 3180 + }, + { + "epoch": 0.9180658643799092, + "grad_norm": 2.6235086917877197, + "learning_rate": 8.619119878603946e-07, + "loss": 0.2442, + "step": 3185 + }, + { + "epoch": 0.919507098075953, + "grad_norm": 4.0555644035339355, + "learning_rate": 8.467374810318665e-07, + "loss": 0.2722, + "step": 3190 + }, + { + "epoch": 0.9209483317719969, + "grad_norm": 3.3704187870025635, + "learning_rate": 8.315629742033385e-07, + "loss": 0.2483, + "step": 3195 + }, + { + "epoch": 0.9223895654680406, + "grad_norm": 3.073582649230957, + "learning_rate": 8.163884673748104e-07, + "loss": 0.2672, + "step": 3200 + }, + { + "epoch": 0.9223895654680406, + "eval_loss": 0.2335718721151352, + "eval_mse": 0.23357188405096532, + "eval_runtime": 3.4739, + "eval_samples_per_second": 287.861, + "eval_steps_per_second": 18.135, + "step": 3200 + }, + { + "epoch": 0.9238307991640845, + "grad_norm": 2.310436248779297, + "learning_rate": 8.012139605462823e-07, + "loss": 0.2489, + "step": 3205 + }, + { + "epoch": 0.9252720328601283, + "grad_norm": 1.7982414960861206, + "learning_rate": 7.860394537177542e-07, + "loss": 0.2421, + "step": 3210 + }, + { + "epoch": 0.926713266556172, + "grad_norm": 1.9083003997802734, + "learning_rate": 7.708649468892261e-07, + "loss": 0.2398, + "step": 3215 + }, + { + "epoch": 0.9281545002522159, + "grad_norm": 3.1134302616119385, + "learning_rate": 7.55690440060698e-07, + "loss": 0.255, + "step": 3220 + }, + { + "epoch": 0.9295957339482597, + "grad_norm": 2.5554730892181396, + "learning_rate": 7.4051593323217e-07, + "loss": 0.255, + "step": 3225 + }, + { + "epoch": 0.9310369676443035, + "grad_norm": 2.0975911617279053, + "learning_rate": 7.253414264036419e-07, + "loss": 0.2405, + "step": 3230 + }, + { + "epoch": 0.9324782013403473, + "grad_norm": 2.0389764308929443, + "learning_rate": 7.101669195751138e-07, + "loss": 0.2298, + "step": 3235 + }, + { + "epoch": 0.9339194350363912, + "grad_norm": 2.1120331287384033, + "learning_rate": 6.949924127465859e-07, + "loss": 0.2455, + "step": 3240 + }, + { + "epoch": 0.935360668732435, + "grad_norm": 1.8716710805892944, + "learning_rate": 6.798179059180578e-07, + "loss": 0.2502, + "step": 3245 + }, + { + "epoch": 0.9368019024284788, + "grad_norm": 3.4095449447631836, + "learning_rate": 6.646433990895297e-07, + "loss": 0.2431, + "step": 3250 + }, + { + "epoch": 0.9382431361245226, + "grad_norm": 1.740820288658142, + "learning_rate": 6.494688922610016e-07, + "loss": 0.2493, + "step": 3255 + }, + { + "epoch": 0.9396843698205664, + "grad_norm": 1.8503363132476807, + "learning_rate": 6.342943854324735e-07, + "loss": 0.2537, + "step": 3260 + }, + { + "epoch": 0.9411256035166102, + "grad_norm": 2.1327052116394043, + "learning_rate": 6.191198786039454e-07, + "loss": 0.2346, + "step": 3265 + }, + { + "epoch": 0.942566837212654, + "grad_norm": 2.9109222888946533, + "learning_rate": 6.039453717754174e-07, + "loss": 0.2415, + "step": 3270 + }, + { + "epoch": 0.9440080709086979, + "grad_norm": 1.9122915267944336, + "learning_rate": 5.887708649468893e-07, + "loss": 0.2401, + "step": 3275 + }, + { + "epoch": 0.9454493046047416, + "grad_norm": 2.2561800479888916, + "learning_rate": 5.735963581183612e-07, + "loss": 0.2331, + "step": 3280 + }, + { + "epoch": 0.9468905383007855, + "grad_norm": 2.547837972640991, + "learning_rate": 5.584218512898331e-07, + "loss": 0.2373, + "step": 3285 + }, + { + "epoch": 0.9483317719968293, + "grad_norm": 2.524578332901001, + "learning_rate": 5.43247344461305e-07, + "loss": 0.2225, + "step": 3290 + }, + { + "epoch": 0.9497730056928732, + "grad_norm": 1.7476263046264648, + "learning_rate": 5.28072837632777e-07, + "loss": 0.2339, + "step": 3295 + }, + { + "epoch": 0.9512142393889169, + "grad_norm": 1.938265323638916, + "learning_rate": 5.12898330804249e-07, + "loss": 0.2375, + "step": 3300 + }, + { + "epoch": 0.9512142393889169, + "eval_loss": 0.23372192680835724, + "eval_mse": 0.23372193356230855, + "eval_runtime": 3.4568, + "eval_samples_per_second": 289.284, + "eval_steps_per_second": 18.225, + "step": 3300 + }, + { + "epoch": 0.9526554730849607, + "grad_norm": 2.301506519317627, + "learning_rate": 4.977238239757208e-07, + "loss": 0.2699, + "step": 3305 + }, + { + "epoch": 0.9540967067810046, + "grad_norm": 1.916329026222229, + "learning_rate": 4.825493171471927e-07, + "loss": 0.2101, + "step": 3310 + }, + { + "epoch": 0.9555379404770483, + "grad_norm": 2.0248494148254395, + "learning_rate": 4.673748103186647e-07, + "loss": 0.2202, + "step": 3315 + }, + { + "epoch": 0.9569791741730922, + "grad_norm": 3.3901896476745605, + "learning_rate": 4.5220030349013665e-07, + "loss": 0.2336, + "step": 3320 + }, + { + "epoch": 0.958420407869136, + "grad_norm": 1.915697455406189, + "learning_rate": 4.3702579666160853e-07, + "loss": 0.2444, + "step": 3325 + }, + { + "epoch": 0.9598616415651798, + "grad_norm": 3.5947072505950928, + "learning_rate": 4.2185128983308046e-07, + "loss": 0.2515, + "step": 3330 + }, + { + "epoch": 0.9613028752612236, + "grad_norm": 3.344456911087036, + "learning_rate": 4.066767830045524e-07, + "loss": 0.2347, + "step": 3335 + }, + { + "epoch": 0.9627441089572675, + "grad_norm": 2.0030887126922607, + "learning_rate": 3.9150227617602433e-07, + "loss": 0.2323, + "step": 3340 + }, + { + "epoch": 0.9641853426533112, + "grad_norm": 1.546491265296936, + "learning_rate": 3.763277693474962e-07, + "loss": 0.247, + "step": 3345 + }, + { + "epoch": 0.965626576349355, + "grad_norm": 1.7898391485214233, + "learning_rate": 3.6115326251896814e-07, + "loss": 0.2441, + "step": 3350 + }, + { + "epoch": 0.9670678100453989, + "grad_norm": 1.4800883531570435, + "learning_rate": 3.459787556904401e-07, + "loss": 0.2069, + "step": 3355 + }, + { + "epoch": 0.9685090437414426, + "grad_norm": 5.112166881561279, + "learning_rate": 3.3080424886191206e-07, + "loss": 0.2405, + "step": 3360 + }, + { + "epoch": 0.9699502774374865, + "grad_norm": 2.2341766357421875, + "learning_rate": 3.156297420333839e-07, + "loss": 0.2217, + "step": 3365 + }, + { + "epoch": 0.9713915111335303, + "grad_norm": 2.1282143592834473, + "learning_rate": 3.004552352048559e-07, + "loss": 0.2317, + "step": 3370 + }, + { + "epoch": 0.9728327448295742, + "grad_norm": 4.700996398925781, + "learning_rate": 2.852807283763278e-07, + "loss": 0.2458, + "step": 3375 + }, + { + "epoch": 0.9742739785256179, + "grad_norm": 2.893071413040161, + "learning_rate": 2.701062215477997e-07, + "loss": 0.2373, + "step": 3380 + }, + { + "epoch": 0.9757152122216617, + "grad_norm": 2.4132773876190186, + "learning_rate": 2.549317147192716e-07, + "loss": 0.2587, + "step": 3385 + }, + { + "epoch": 0.9771564459177056, + "grad_norm": 3.645531415939331, + "learning_rate": 2.3975720789074356e-07, + "loss": 0.2523, + "step": 3390 + }, + { + "epoch": 0.9785976796137493, + "grad_norm": 3.378278970718384, + "learning_rate": 2.245827010622155e-07, + "loss": 0.2336, + "step": 3395 + }, + { + "epoch": 0.9800389133097932, + "grad_norm": 2.213209867477417, + "learning_rate": 2.0940819423368745e-07, + "loss": 0.2423, + "step": 3400 + }, + { + "epoch": 0.9800389133097932, + "eval_loss": 0.23236581683158875, + "eval_mse": 0.2323658305592835, + "eval_runtime": 3.516, + "eval_samples_per_second": 284.418, + "eval_steps_per_second": 17.918, + "step": 3400 + }, + { + "epoch": 0.981480147005837, + "grad_norm": 1.9079331159591675, + "learning_rate": 1.9423368740515936e-07, + "loss": 0.2418, + "step": 3405 + }, + { + "epoch": 0.9829213807018808, + "grad_norm": 1.9052362442016602, + "learning_rate": 1.790591805766313e-07, + "loss": 0.2642, + "step": 3410 + }, + { + "epoch": 0.9843626143979246, + "grad_norm": 1.942834496498108, + "learning_rate": 1.638846737481032e-07, + "loss": 0.245, + "step": 3415 + }, + { + "epoch": 0.9858038480939685, + "grad_norm": 3.9796624183654785, + "learning_rate": 1.4871016691957513e-07, + "loss": 0.2705, + "step": 3420 + }, + { + "epoch": 0.9872450817900122, + "grad_norm": 1.7400703430175781, + "learning_rate": 1.3353566009104704e-07, + "loss": 0.2316, + "step": 3425 + }, + { + "epoch": 0.988686315486056, + "grad_norm": 2.3475584983825684, + "learning_rate": 1.1836115326251897e-07, + "loss": 0.2353, + "step": 3430 + }, + { + "epoch": 0.9901275491820999, + "grad_norm": 1.7061249017715454, + "learning_rate": 1.031866464339909e-07, + "loss": 0.2384, + "step": 3435 + }, + { + "epoch": 0.9915687828781437, + "grad_norm": 1.9734495878219604, + "learning_rate": 8.801213960546283e-08, + "loss": 0.2717, + "step": 3440 + }, + { + "epoch": 0.9930100165741875, + "grad_norm": 1.485177993774414, + "learning_rate": 7.283763277693476e-08, + "loss": 0.2242, + "step": 3445 + }, + { + "epoch": 0.9944512502702313, + "grad_norm": 2.0481748580932617, + "learning_rate": 5.7663125948406686e-08, + "loss": 0.2768, + "step": 3450 + }, + { + "epoch": 0.9958924839662752, + "grad_norm": 2.793379545211792, + "learning_rate": 4.248861911987861e-08, + "loss": 0.225, + "step": 3455 + }, + { + "epoch": 0.9973337176623189, + "grad_norm": 2.208509922027588, + "learning_rate": 2.7314112291350533e-08, + "loss": 0.2576, + "step": 3460 + }, + { + "epoch": 0.9987749513583628, + "grad_norm": 2.3073906898498535, + "learning_rate": 1.213960546282246e-08, + "loss": 0.2368, + "step": 3465 + }, + { + "epoch": 0.9999279383151978, + "step": 3469, + "total_flos": 5.881871499303322e+16, + "train_loss": 0.29927895559842593, + "train_runtime": 2787.8171, + "train_samples_per_second": 159.283, + "train_steps_per_second": 1.244 + } + ], + "logging_steps": 5, + "max_steps": 3469, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.881871499303322e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}