|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.09999063758075087, |
|
"global_step": 4272, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.1682242990654204e-06, |
|
"loss": 10.9364, |
|
"theoretical_loss": 20.81281780154715, |
|
"tokens_seen": 65536 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 5.841121495327103e-05, |
|
"loss": 8.9947, |
|
"theoretical_loss": 8.563482664611069, |
|
"tokens_seen": 3276800 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00011682242990654206, |
|
"loss": 7.0133, |
|
"theoretical_loss": 7.4777587180480305, |
|
"tokens_seen": 6553600 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00017523364485981307, |
|
"loss": 6.2244, |
|
"theoretical_loss": 6.9337544888949, |
|
"tokens_seen": 9830400 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00023364485981308412, |
|
"loss": 5.822, |
|
"theoretical_loss": 6.583566228426414, |
|
"tokens_seen": 13107200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00029205607476635517, |
|
"loss": 5.5642, |
|
"theoretical_loss": 6.330713565116083, |
|
"tokens_seen": 16384000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00035046728971962614, |
|
"loss": 5.3688, |
|
"theoretical_loss": 6.135529231940326, |
|
"tokens_seen": 19660800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004088785046728972, |
|
"loss": 5.1987, |
|
"theoretical_loss": 5.978101583869607, |
|
"tokens_seen": 22937600 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00046728971962616824, |
|
"loss": 5.1026, |
|
"theoretical_loss": 5.8471173262659235, |
|
"tokens_seen": 26214400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.000499739928125591, |
|
"loss": 4.9312, |
|
"theoretical_loss": 5.7355768158821245, |
|
"tokens_seen": 29491200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004991488556837526, |
|
"loss": 4.8598, |
|
"theoretical_loss": 5.638870144071353, |
|
"tokens_seen": 32768000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004985577832419141, |
|
"loss": 4.7297, |
|
"theoretical_loss": 5.553812381844907, |
|
"tokens_seen": 36044800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004979667108000757, |
|
"loss": 4.6466, |
|
"theoretical_loss": 5.478118080556438, |
|
"tokens_seen": 39321600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004973756383582371, |
|
"loss": 4.5646, |
|
"theoretical_loss": 5.410095959579362, |
|
"tokens_seen": 42598400 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004967845659163987, |
|
"loss": 4.5162, |
|
"theoretical_loss": 5.348462083735834, |
|
"tokens_seen": 45875200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004961934934745603, |
|
"loss": 4.3809, |
|
"theoretical_loss": 5.292220566937567, |
|
"tokens_seen": 49152000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004956024210327218, |
|
"loss": 4.336, |
|
"theoretical_loss": 5.240584625769978, |
|
"tokens_seen": 52428800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004950113485908833, |
|
"loss": 4.2829, |
|
"theoretical_loss": 5.192922724525789, |
|
"tokens_seen": 55705600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004944202761490448, |
|
"loss": 4.209, |
|
"theoretical_loss": 5.1487208633564405, |
|
"tokens_seen": 58982400 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004938292037072064, |
|
"loss": 4.0751, |
|
"theoretical_loss": 5.107555562405102, |
|
"tokens_seen": 62259200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004932381312653678, |
|
"loss": 3.9696, |
|
"theoretical_loss": 5.069074117143246, |
|
"tokens_seen": 65536000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004926470588235294, |
|
"loss": 3.9197, |
|
"theoretical_loss": 5.032979909838007, |
|
"tokens_seen": 68812800 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000492055986381691, |
|
"loss": 3.8378, |
|
"theoretical_loss": 4.999021308224664, |
|
"tokens_seen": 72089600 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004914649139398525, |
|
"loss": 3.7856, |
|
"theoretical_loss": 4.966983155351962, |
|
"tokens_seen": 75366400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000490873841498014, |
|
"loss": 3.6988, |
|
"theoretical_loss": 4.9366801616251355, |
|
"tokens_seen": 78643200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004902827690561755, |
|
"loss": 3.6826, |
|
"theoretical_loss": 4.907951713830082, |
|
"tokens_seen": 81920000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004896916966143371, |
|
"loss": 3.6271, |
|
"theoretical_loss": 4.880657753812926, |
|
"tokens_seen": 85196800 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004891006241724985, |
|
"loss": 3.617, |
|
"theoretical_loss": 4.854675474481779, |
|
"tokens_seen": 88473600 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004885095517306601, |
|
"loss": 3.528, |
|
"theoretical_loss": 4.8298966473088125, |
|
"tokens_seen": 91750400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004879184792888217, |
|
"loss": 3.5513, |
|
"theoretical_loss": 4.8062254427779205, |
|
"tokens_seen": 95027200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004873274068469832, |
|
"loss": 3.4962, |
|
"theoretical_loss": 4.783576639276257, |
|
"tokens_seen": 98304000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048673633440514467, |
|
"loss": 3.5334, |
|
"theoretical_loss": 4.761874140772408, |
|
"tokens_seen": 101580800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004861452619633062, |
|
"loss": 3.5329, |
|
"theoretical_loss": 4.741049741962473, |
|
"tokens_seen": 104857600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004855541895214677, |
|
"loss": 3.5329, |
|
"theoretical_loss": 4.721042093249051, |
|
"tokens_seen": 108134400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004849631170796293, |
|
"loss": 3.4926, |
|
"theoretical_loss": 4.701795828231866, |
|
"tokens_seen": 111411200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004843720446377908, |
|
"loss": 3.4835, |
|
"theoretical_loss": 4.68326082423593, |
|
"tokens_seen": 114688000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048378097219595233, |
|
"loss": 3.4587, |
|
"theoretical_loss": 4.665391572426282, |
|
"tokens_seen": 117964800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048318989975411385, |
|
"loss": 3.4307, |
|
"theoretical_loss": 4.648146638719739, |
|
"tokens_seen": 121241600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048259882731227537, |
|
"loss": 3.443, |
|
"theoretical_loss": 4.631488200339643, |
|
"tokens_seen": 124518400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004820077548704369, |
|
"loss": 3.45, |
|
"theoretical_loss": 4.615381645715717, |
|
"tokens_seen": 127795200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048141668242859847, |
|
"loss": 3.3857, |
|
"theoretical_loss": 4.599795227690505, |
|
"tokens_seen": 131072000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048082560998676, |
|
"loss": 3.4065, |
|
"theoretical_loss": 4.584699761792674, |
|
"tokens_seen": 134348800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004802345375449215, |
|
"loss": 3.3659, |
|
"theoretical_loss": 4.570068362778516, |
|
"tokens_seen": 137625600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047964346510308303, |
|
"loss": 3.3781, |
|
"theoretical_loss": 4.555876213804037, |
|
"tokens_seen": 140902400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047905239266124455, |
|
"loss": 3.3612, |
|
"theoretical_loss": 4.542100363530799, |
|
"tokens_seen": 144179200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047846132021940607, |
|
"loss": 3.3402, |
|
"theoretical_loss": 4.528719547234816, |
|
"tokens_seen": 147456000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004778702477775676, |
|
"loss": 3.2899, |
|
"theoretical_loss": 4.515714028614996, |
|
"tokens_seen": 150732800 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047727917533572917, |
|
"loss": 3.3099, |
|
"theoretical_loss": 4.503065459513339, |
|
"tokens_seen": 154009600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004766881028938907, |
|
"loss": 3.3162, |
|
"theoretical_loss": 4.4907567551852665, |
|
"tokens_seen": 157286400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004760970304520522, |
|
"loss": 3.3036, |
|
"theoretical_loss": 4.478771983111967, |
|
"tokens_seen": 160563200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"objective/train/docs_used": 104000, |
|
"objective/train/instantaneous_batch_size": 32, |
|
"objective/train/instantaneous_microbatch_size": 32768, |
|
"objective/train/original_loss": 3.2747654914855957, |
|
"objective/train/theoretical_loss": 4.467096263641219, |
|
"objective/train/tokens_used": 184300000, |
|
"theoretical_loss": 4.467096263641219, |
|
"tokens_seen": 163840000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047550595801021373, |
|
"loss": 3.2837, |
|
"theoretical_loss": 4.467096263641219, |
|
"tokens_seen": 163840000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047491488556837525, |
|
"loss": 3.2955, |
|
"theoretical_loss": 4.455715680989545, |
|
"tokens_seen": 167116800 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047432381312653677, |
|
"loss": 3.2608, |
|
"theoretical_loss": 4.44461720334543, |
|
"tokens_seen": 170393600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047373274068469835, |
|
"loss": 3.2026, |
|
"theoretical_loss": 4.433788610987646, |
|
"tokens_seen": 173670400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047314166824285987, |
|
"loss": 3.2621, |
|
"theoretical_loss": 4.42321843148016, |
|
"tokens_seen": 176947200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004725505958010214, |
|
"loss": 3.1999, |
|
"theoretical_loss": 4.412895881130142, |
|
"tokens_seen": 180224000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004719595233591829, |
|
"loss": 3.2384, |
|
"theoretical_loss": 4.4028108120020795, |
|
"tokens_seen": 183500800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047136845091734443, |
|
"loss": 3.244, |
|
"theoretical_loss": 4.392953663871862, |
|
"tokens_seen": 186777600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047077737847550595, |
|
"loss": 3.2329, |
|
"theoretical_loss": 4.383315420582533, |
|
"tokens_seen": 190054400 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004701863060336675, |
|
"loss": 3.2634, |
|
"theoretical_loss": 4.373887570330275, |
|
"tokens_seen": 193331200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046959523359182905, |
|
"loss": 3.2143, |
|
"theoretical_loss": 4.364662069466704, |
|
"tokens_seen": 196608000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046900416114999057, |
|
"loss": 3.2128, |
|
"theoretical_loss": 4.355631309453283, |
|
"tokens_seen": 199884800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004684130887081521, |
|
"loss": 3.1675, |
|
"theoretical_loss": 4.346788086646671, |
|
"tokens_seen": 203161600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004678220162663136, |
|
"loss": 3.1967, |
|
"theoretical_loss": 4.33812557463116, |
|
"tokens_seen": 206438400 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046723094382447513, |
|
"loss": 3.2042, |
|
"theoretical_loss": 4.329637298846812, |
|
"tokens_seen": 209715200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046663987138263665, |
|
"loss": 3.1574, |
|
"theoretical_loss": 4.321317113290252, |
|
"tokens_seen": 212992000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004660487989407982, |
|
"loss": 3.1317, |
|
"theoretical_loss": 4.3131591790897925, |
|
"tokens_seen": 216268800 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046545772649895975, |
|
"loss": 3.1829, |
|
"theoretical_loss": 4.305157944778228, |
|
"tokens_seen": 219545600 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046486665405712127, |
|
"loss": 3.2073, |
|
"theoretical_loss": 4.297308128105687, |
|
"tokens_seen": 222822400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004642755816152828, |
|
"loss": 3.1994, |
|
"theoretical_loss": 4.2896046992515995, |
|
"tokens_seen": 226099200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004636845091734443, |
|
"loss": 3.2312, |
|
"theoretical_loss": 4.282042865309616, |
|
"tokens_seen": 229376000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046309343673160583, |
|
"loss": 3.2006, |
|
"theoretical_loss": 4.274618055932298, |
|
"tokens_seen": 232652800 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004625023642897674, |
|
"loss": 3.1756, |
|
"theoretical_loss": 4.267325910033897, |
|
"tokens_seen": 235929600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004619112918479289, |
|
"loss": 3.1158, |
|
"theoretical_loss": 4.260162263459744, |
|
"tokens_seen": 239206400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046132021940609044, |
|
"loss": 3.1448, |
|
"theoretical_loss": 4.253123137539814, |
|
"tokens_seen": 242483200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046072914696425197, |
|
"loss": 3.166, |
|
"theoretical_loss": 4.246204728452055, |
|
"tokens_seen": 245760000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004601380745224135, |
|
"loss": 3.1762, |
|
"theoretical_loss": 4.239403397328261, |
|
"tokens_seen": 249036800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.000459547002080575, |
|
"loss": 3.1442, |
|
"theoretical_loss": 4.232715661041632, |
|
"tokens_seen": 252313600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004589559296387366, |
|
"loss": 3.1504, |
|
"theoretical_loss": 4.226138183620867, |
|
"tokens_seen": 255590400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004583648571968981, |
|
"loss": 3.1099, |
|
"theoretical_loss": 4.219667768240775, |
|
"tokens_seen": 258867200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004577737847550596, |
|
"loss": 3.114, |
|
"theoretical_loss": 4.213301349743924, |
|
"tokens_seen": 262144000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004571827123132211, |
|
"loss": 3.0703, |
|
"theoretical_loss": 4.20703598765197, |
|
"tokens_seen": 265420800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004565916398713826, |
|
"loss": 3.0383, |
|
"theoretical_loss": 4.2008688596290025, |
|
"tokens_seen": 268697600 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045600056742954413, |
|
"loss": 2.9881, |
|
"theoretical_loss": 4.194797255362549, |
|
"tokens_seen": 271974400 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004554094949877057, |
|
"loss": 3.068, |
|
"theoretical_loss": 4.188818570830883, |
|
"tokens_seen": 275251200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045481842254586723, |
|
"loss": 3.0823, |
|
"theoretical_loss": 4.182930302927963, |
|
"tokens_seen": 278528000 |
|
} |
|
], |
|
"max_steps": 42724, |
|
"num_train_epochs": 9223372036854775807, |
|
"total_flos": 1.42878663770112e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|