{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09999063758075087, "global_step": 4272, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.1682242990654204e-06, "loss": 10.9364, "theoretical_loss": 20.81281780154715, "tokens_seen": 65536 }, { "epoch": 0.0, "learning_rate": 5.841121495327103e-05, "loss": 8.9947, "theoretical_loss": 8.563482664611069, "tokens_seen": 3276800 }, { "epoch": 0.0, "learning_rate": 0.00011682242990654206, "loss": 7.0133, "theoretical_loss": 7.4777587180480305, "tokens_seen": 6553600 }, { "epoch": 0.0, "learning_rate": 0.00017523364485981307, "loss": 6.2244, "theoretical_loss": 6.9337544888949, "tokens_seen": 9830400 }, { "epoch": 0.0, "learning_rate": 0.00023364485981308412, "loss": 5.822, "theoretical_loss": 6.583566228426414, "tokens_seen": 13107200 }, { "epoch": 0.01, "learning_rate": 0.00029205607476635517, "loss": 5.5642, "theoretical_loss": 6.330713565116083, "tokens_seen": 16384000 }, { "epoch": 0.01, "learning_rate": 0.00035046728971962614, "loss": 5.3688, "theoretical_loss": 6.135529231940326, "tokens_seen": 19660800 }, { "epoch": 0.01, "learning_rate": 0.0004088785046728972, "loss": 5.1987, "theoretical_loss": 5.978101583869607, "tokens_seen": 22937600 }, { "epoch": 0.01, "learning_rate": 0.00046728971962616824, "loss": 5.1026, "theoretical_loss": 5.8471173262659235, "tokens_seen": 26214400 }, { "epoch": 0.01, "learning_rate": 0.000499739928125591, "loss": 4.9312, "theoretical_loss": 5.7355768158821245, "tokens_seen": 29491200 }, { "epoch": 0.01, "learning_rate": 0.0004991488556837526, "loss": 4.8598, "theoretical_loss": 5.638870144071353, "tokens_seen": 32768000 }, { "epoch": 0.01, "learning_rate": 0.0004985577832419141, "loss": 4.7297, "theoretical_loss": 5.553812381844907, "tokens_seen": 36044800 }, { "epoch": 0.01, "learning_rate": 0.0004979667108000757, "loss": 4.6466, "theoretical_loss": 5.478118080556438, "tokens_seen": 39321600 }, { "epoch": 0.02, "learning_rate": 0.0004973756383582371, "loss": 4.5646, "theoretical_loss": 5.410095959579362, "tokens_seen": 42598400 }, { "epoch": 0.02, "learning_rate": 0.0004967845659163987, "loss": 4.5162, "theoretical_loss": 5.348462083735834, "tokens_seen": 45875200 }, { "epoch": 0.02, "learning_rate": 0.0004961934934745603, "loss": 4.3809, "theoretical_loss": 5.292220566937567, "tokens_seen": 49152000 }, { "epoch": 0.02, "learning_rate": 0.0004956024210327218, "loss": 4.336, "theoretical_loss": 5.240584625769978, "tokens_seen": 52428800 }, { "epoch": 0.02, "learning_rate": 0.0004950113485908833, "loss": 4.2829, "theoretical_loss": 5.192922724525789, "tokens_seen": 55705600 }, { "epoch": 0.02, "learning_rate": 0.0004944202761490448, "loss": 4.209, "theoretical_loss": 5.1487208633564405, "tokens_seen": 58982400 }, { "epoch": 0.02, "learning_rate": 0.0004938292037072064, "loss": 4.0751, "theoretical_loss": 5.107555562405102, "tokens_seen": 62259200 }, { "epoch": 0.02, "learning_rate": 0.0004932381312653678, "loss": 3.9696, "theoretical_loss": 5.069074117143246, "tokens_seen": 65536000 }, { "epoch": 0.02, "learning_rate": 0.0004926470588235294, "loss": 3.9197, "theoretical_loss": 5.032979909838007, "tokens_seen": 68812800 }, { "epoch": 0.03, "learning_rate": 0.000492055986381691, "loss": 3.8378, "theoretical_loss": 4.999021308224664, "tokens_seen": 72089600 }, { "epoch": 0.03, "learning_rate": 0.0004914649139398525, "loss": 3.7856, "theoretical_loss": 4.966983155351962, "tokens_seen": 75366400 }, { "epoch": 0.03, "learning_rate": 0.000490873841498014, "loss": 3.6988, "theoretical_loss": 4.9366801616251355, "tokens_seen": 78643200 }, { "epoch": 0.03, "learning_rate": 0.0004902827690561755, "loss": 3.6826, "theoretical_loss": 4.907951713830082, "tokens_seen": 81920000 }, { "epoch": 0.03, "learning_rate": 0.0004896916966143371, "loss": 3.6271, "theoretical_loss": 4.880657753812926, "tokens_seen": 85196800 }, { "epoch": 0.03, "learning_rate": 0.0004891006241724985, "loss": 3.617, "theoretical_loss": 4.854675474481779, "tokens_seen": 88473600 }, { "epoch": 0.03, "learning_rate": 0.0004885095517306601, "loss": 3.528, "theoretical_loss": 4.8298966473088125, "tokens_seen": 91750400 }, { "epoch": 0.03, "learning_rate": 0.0004879184792888217, "loss": 3.5513, "theoretical_loss": 4.8062254427779205, "tokens_seen": 95027200 }, { "epoch": 0.04, "learning_rate": 0.0004873274068469832, "loss": 3.4962, "theoretical_loss": 4.783576639276257, "tokens_seen": 98304000 }, { "epoch": 0.04, "learning_rate": 0.00048673633440514467, "loss": 3.5334, "theoretical_loss": 4.761874140772408, "tokens_seen": 101580800 }, { "epoch": 0.04, "learning_rate": 0.0004861452619633062, "loss": 3.5329, "theoretical_loss": 4.741049741962473, "tokens_seen": 104857600 }, { "epoch": 0.04, "learning_rate": 0.0004855541895214677, "loss": 3.5329, "theoretical_loss": 4.721042093249051, "tokens_seen": 108134400 }, { "epoch": 0.04, "learning_rate": 0.0004849631170796293, "loss": 3.4926, "theoretical_loss": 4.701795828231866, "tokens_seen": 111411200 }, { "epoch": 0.04, "learning_rate": 0.0004843720446377908, "loss": 3.4835, "theoretical_loss": 4.68326082423593, "tokens_seen": 114688000 }, { "epoch": 0.04, "learning_rate": 0.00048378097219595233, "loss": 3.4587, "theoretical_loss": 4.665391572426282, "tokens_seen": 117964800 }, { "epoch": 0.04, "learning_rate": 0.00048318989975411385, "loss": 3.4307, "theoretical_loss": 4.648146638719739, "tokens_seen": 121241600 }, { "epoch": 0.04, "learning_rate": 0.00048259882731227537, "loss": 3.443, "theoretical_loss": 4.631488200339643, "tokens_seen": 124518400 }, { "epoch": 0.05, "learning_rate": 0.0004820077548704369, "loss": 3.45, "theoretical_loss": 4.615381645715717, "tokens_seen": 127795200 }, { "epoch": 0.05, "learning_rate": 0.00048141668242859847, "loss": 3.3857, "theoretical_loss": 4.599795227690505, "tokens_seen": 131072000 }, { "epoch": 0.05, "learning_rate": 0.00048082560998676, "loss": 3.4065, "theoretical_loss": 4.584699761792674, "tokens_seen": 134348800 }, { "epoch": 0.05, "learning_rate": 0.0004802345375449215, "loss": 3.3659, "theoretical_loss": 4.570068362778516, "tokens_seen": 137625600 }, { "epoch": 0.05, "learning_rate": 0.00047964346510308303, "loss": 3.3781, "theoretical_loss": 4.555876213804037, "tokens_seen": 140902400 }, { "epoch": 0.05, "learning_rate": 0.00047905239266124455, "loss": 3.3612, "theoretical_loss": 4.542100363530799, "tokens_seen": 144179200 }, { "epoch": 0.05, "learning_rate": 0.00047846132021940607, "loss": 3.3402, "theoretical_loss": 4.528719547234816, "tokens_seen": 147456000 }, { "epoch": 0.05, "learning_rate": 0.0004778702477775676, "loss": 3.2899, "theoretical_loss": 4.515714028614996, "tokens_seen": 150732800 }, { "epoch": 0.06, "learning_rate": 0.00047727917533572917, "loss": 3.3099, "theoretical_loss": 4.503065459513339, "tokens_seen": 154009600 }, { "epoch": 0.06, "learning_rate": 0.0004766881028938907, "loss": 3.3162, "theoretical_loss": 4.4907567551852665, "tokens_seen": 157286400 }, { "epoch": 0.06, "learning_rate": 0.0004760970304520522, "loss": 3.3036, "theoretical_loss": 4.478771983111967, "tokens_seen": 160563200 }, { "epoch": 0.06, "objective/train/docs_used": 104000, "objective/train/instantaneous_batch_size": 32, "objective/train/instantaneous_microbatch_size": 32768, "objective/train/original_loss": 3.2747654914855957, "objective/train/theoretical_loss": 4.467096263641219, "objective/train/tokens_used": 184300000, "theoretical_loss": 4.467096263641219, "tokens_seen": 163840000 }, { "epoch": 0.06, "learning_rate": 0.00047550595801021373, "loss": 3.2837, "theoretical_loss": 4.467096263641219, "tokens_seen": 163840000 }, { "epoch": 0.06, "learning_rate": 0.00047491488556837525, "loss": 3.2955, "theoretical_loss": 4.455715680989545, "tokens_seen": 167116800 }, { "epoch": 0.06, "learning_rate": 0.00047432381312653677, "loss": 3.2608, "theoretical_loss": 4.44461720334543, "tokens_seen": 170393600 }, { "epoch": 0.06, "learning_rate": 0.00047373274068469835, "loss": 3.2026, "theoretical_loss": 4.433788610987646, "tokens_seen": 173670400 }, { "epoch": 0.06, "learning_rate": 0.00047314166824285987, "loss": 3.2621, "theoretical_loss": 4.42321843148016, "tokens_seen": 176947200 }, { "epoch": 0.06, "learning_rate": 0.0004725505958010214, "loss": 3.1999, "theoretical_loss": 4.412895881130142, "tokens_seen": 180224000 }, { "epoch": 0.07, "learning_rate": 0.0004719595233591829, "loss": 3.2384, "theoretical_loss": 4.4028108120020795, "tokens_seen": 183500800 }, { "epoch": 0.07, "learning_rate": 0.00047136845091734443, "loss": 3.244, "theoretical_loss": 4.392953663871862, "tokens_seen": 186777600 }, { "epoch": 0.07, "learning_rate": 0.00047077737847550595, "loss": 3.2329, "theoretical_loss": 4.383315420582533, "tokens_seen": 190054400 }, { "epoch": 0.07, "learning_rate": 0.0004701863060336675, "loss": 3.2634, "theoretical_loss": 4.373887570330275, "tokens_seen": 193331200 }, { "epoch": 0.07, "learning_rate": 0.00046959523359182905, "loss": 3.2143, "theoretical_loss": 4.364662069466704, "tokens_seen": 196608000 }, { "epoch": 0.07, "learning_rate": 0.00046900416114999057, "loss": 3.2128, "theoretical_loss": 4.355631309453283, "tokens_seen": 199884800 }, { "epoch": 0.07, "learning_rate": 0.0004684130887081521, "loss": 3.1675, "theoretical_loss": 4.346788086646671, "tokens_seen": 203161600 }, { "epoch": 0.07, "learning_rate": 0.0004678220162663136, "loss": 3.1967, "theoretical_loss": 4.33812557463116, "tokens_seen": 206438400 }, { "epoch": 0.07, "learning_rate": 0.00046723094382447513, "loss": 3.2042, "theoretical_loss": 4.329637298846812, "tokens_seen": 209715200 }, { "epoch": 0.08, "learning_rate": 0.00046663987138263665, "loss": 3.1574, "theoretical_loss": 4.321317113290252, "tokens_seen": 212992000 }, { "epoch": 0.08, "learning_rate": 0.0004660487989407982, "loss": 3.1317, "theoretical_loss": 4.3131591790897925, "tokens_seen": 216268800 }, { "epoch": 0.08, "learning_rate": 0.00046545772649895975, "loss": 3.1829, "theoretical_loss": 4.305157944778228, "tokens_seen": 219545600 }, { "epoch": 0.08, "learning_rate": 0.00046486665405712127, "loss": 3.2073, "theoretical_loss": 4.297308128105687, "tokens_seen": 222822400 }, { "epoch": 0.08, "learning_rate": 0.0004642755816152828, "loss": 3.1994, "theoretical_loss": 4.2896046992515995, "tokens_seen": 226099200 }, { "epoch": 0.08, "learning_rate": 0.0004636845091734443, "loss": 3.2312, "theoretical_loss": 4.282042865309616, "tokens_seen": 229376000 }, { "epoch": 0.08, "learning_rate": 0.00046309343673160583, "loss": 3.2006, "theoretical_loss": 4.274618055932298, "tokens_seen": 232652800 }, { "epoch": 0.08, "learning_rate": 0.0004625023642897674, "loss": 3.1756, "theoretical_loss": 4.267325910033897, "tokens_seen": 235929600 }, { "epoch": 0.09, "learning_rate": 0.0004619112918479289, "loss": 3.1158, "theoretical_loss": 4.260162263459744, "tokens_seen": 239206400 }, { "epoch": 0.09, "learning_rate": 0.00046132021940609044, "loss": 3.1448, "theoretical_loss": 4.253123137539814, "tokens_seen": 242483200 }, { "epoch": 0.09, "learning_rate": 0.00046072914696425197, "loss": 3.166, "theoretical_loss": 4.246204728452055, "tokens_seen": 245760000 }, { "epoch": 0.09, "learning_rate": 0.0004601380745224135, "loss": 3.1762, "theoretical_loss": 4.239403397328261, "tokens_seen": 249036800 }, { "epoch": 0.09, "learning_rate": 0.000459547002080575, "loss": 3.1442, "theoretical_loss": 4.232715661041632, "tokens_seen": 252313600 }, { "epoch": 0.09, "learning_rate": 0.0004589559296387366, "loss": 3.1504, "theoretical_loss": 4.226138183620867, "tokens_seen": 255590400 }, { "epoch": 0.09, "learning_rate": 0.0004583648571968981, "loss": 3.1099, "theoretical_loss": 4.219667768240775, "tokens_seen": 258867200 }, { "epoch": 0.09, "learning_rate": 0.0004577737847550596, "loss": 3.114, "theoretical_loss": 4.213301349743924, "tokens_seen": 262144000 }, { "epoch": 0.09, "learning_rate": 0.0004571827123132211, "loss": 3.0703, "theoretical_loss": 4.20703598765197, "tokens_seen": 265420800 }, { "epoch": 0.1, "learning_rate": 0.0004565916398713826, "loss": 3.0383, "theoretical_loss": 4.2008688596290025, "tokens_seen": 268697600 }, { "epoch": 0.1, "learning_rate": 0.00045600056742954413, "loss": 2.9881, "theoretical_loss": 4.194797255362549, "tokens_seen": 271974400 }, { "epoch": 0.1, "learning_rate": 0.0004554094949877057, "loss": 3.068, "theoretical_loss": 4.188818570830883, "tokens_seen": 275251200 }, { "epoch": 0.1, "learning_rate": 0.00045481842254586723, "loss": 3.0823, "theoretical_loss": 4.182930302927963, "tokens_seen": 278528000 } ], "max_steps": 42724, "num_train_epochs": 9223372036854775807, "total_flos": 1.42878663770112e+17, "trial_name": null, "trial_params": null }