cpsc-checkmle / checkpoint-4272 /trainer_state.json
kejian's picture
Training in progress, step 4272
fd87fb9
raw
history blame contribute delete
No virus
16.1 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.09999063758075087,
"global_step": 4272,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.1682242990654204e-06,
"loss": 10.9364,
"theoretical_loss": 20.81281780154715,
"tokens_seen": 65536
},
{
"epoch": 0.0,
"learning_rate": 5.841121495327103e-05,
"loss": 8.9947,
"theoretical_loss": 8.563482664611069,
"tokens_seen": 3276800
},
{
"epoch": 0.0,
"learning_rate": 0.00011682242990654206,
"loss": 7.0133,
"theoretical_loss": 7.4777587180480305,
"tokens_seen": 6553600
},
{
"epoch": 0.0,
"learning_rate": 0.00017523364485981307,
"loss": 6.2244,
"theoretical_loss": 6.9337544888949,
"tokens_seen": 9830400
},
{
"epoch": 0.0,
"learning_rate": 0.00023364485981308412,
"loss": 5.822,
"theoretical_loss": 6.583566228426414,
"tokens_seen": 13107200
},
{
"epoch": 0.01,
"learning_rate": 0.00029205607476635517,
"loss": 5.5642,
"theoretical_loss": 6.330713565116083,
"tokens_seen": 16384000
},
{
"epoch": 0.01,
"learning_rate": 0.00035046728971962614,
"loss": 5.3688,
"theoretical_loss": 6.135529231940326,
"tokens_seen": 19660800
},
{
"epoch": 0.01,
"learning_rate": 0.0004088785046728972,
"loss": 5.1987,
"theoretical_loss": 5.978101583869607,
"tokens_seen": 22937600
},
{
"epoch": 0.01,
"learning_rate": 0.00046728971962616824,
"loss": 5.1026,
"theoretical_loss": 5.8471173262659235,
"tokens_seen": 26214400
},
{
"epoch": 0.01,
"learning_rate": 0.000499739928125591,
"loss": 4.9312,
"theoretical_loss": 5.7355768158821245,
"tokens_seen": 29491200
},
{
"epoch": 0.01,
"learning_rate": 0.0004991488556837526,
"loss": 4.8598,
"theoretical_loss": 5.638870144071353,
"tokens_seen": 32768000
},
{
"epoch": 0.01,
"learning_rate": 0.0004985577832419141,
"loss": 4.7297,
"theoretical_loss": 5.553812381844907,
"tokens_seen": 36044800
},
{
"epoch": 0.01,
"learning_rate": 0.0004979667108000757,
"loss": 4.6466,
"theoretical_loss": 5.478118080556438,
"tokens_seen": 39321600
},
{
"epoch": 0.02,
"learning_rate": 0.0004973756383582371,
"loss": 4.5646,
"theoretical_loss": 5.410095959579362,
"tokens_seen": 42598400
},
{
"epoch": 0.02,
"learning_rate": 0.0004967845659163987,
"loss": 4.5162,
"theoretical_loss": 5.348462083735834,
"tokens_seen": 45875200
},
{
"epoch": 0.02,
"learning_rate": 0.0004961934934745603,
"loss": 4.3809,
"theoretical_loss": 5.292220566937567,
"tokens_seen": 49152000
},
{
"epoch": 0.02,
"learning_rate": 0.0004956024210327218,
"loss": 4.336,
"theoretical_loss": 5.240584625769978,
"tokens_seen": 52428800
},
{
"epoch": 0.02,
"learning_rate": 0.0004950113485908833,
"loss": 4.2829,
"theoretical_loss": 5.192922724525789,
"tokens_seen": 55705600
},
{
"epoch": 0.02,
"learning_rate": 0.0004944202761490448,
"loss": 4.209,
"theoretical_loss": 5.1487208633564405,
"tokens_seen": 58982400
},
{
"epoch": 0.02,
"learning_rate": 0.0004938292037072064,
"loss": 4.0751,
"theoretical_loss": 5.107555562405102,
"tokens_seen": 62259200
},
{
"epoch": 0.02,
"learning_rate": 0.0004932381312653678,
"loss": 3.9696,
"theoretical_loss": 5.069074117143246,
"tokens_seen": 65536000
},
{
"epoch": 0.02,
"learning_rate": 0.0004926470588235294,
"loss": 3.9197,
"theoretical_loss": 5.032979909838007,
"tokens_seen": 68812800
},
{
"epoch": 0.03,
"learning_rate": 0.000492055986381691,
"loss": 3.8378,
"theoretical_loss": 4.999021308224664,
"tokens_seen": 72089600
},
{
"epoch": 0.03,
"learning_rate": 0.0004914649139398525,
"loss": 3.7856,
"theoretical_loss": 4.966983155351962,
"tokens_seen": 75366400
},
{
"epoch": 0.03,
"learning_rate": 0.000490873841498014,
"loss": 3.6988,
"theoretical_loss": 4.9366801616251355,
"tokens_seen": 78643200
},
{
"epoch": 0.03,
"learning_rate": 0.0004902827690561755,
"loss": 3.6826,
"theoretical_loss": 4.907951713830082,
"tokens_seen": 81920000
},
{
"epoch": 0.03,
"learning_rate": 0.0004896916966143371,
"loss": 3.6271,
"theoretical_loss": 4.880657753812926,
"tokens_seen": 85196800
},
{
"epoch": 0.03,
"learning_rate": 0.0004891006241724985,
"loss": 3.617,
"theoretical_loss": 4.854675474481779,
"tokens_seen": 88473600
},
{
"epoch": 0.03,
"learning_rate": 0.0004885095517306601,
"loss": 3.528,
"theoretical_loss": 4.8298966473088125,
"tokens_seen": 91750400
},
{
"epoch": 0.03,
"learning_rate": 0.0004879184792888217,
"loss": 3.5513,
"theoretical_loss": 4.8062254427779205,
"tokens_seen": 95027200
},
{
"epoch": 0.04,
"learning_rate": 0.0004873274068469832,
"loss": 3.4962,
"theoretical_loss": 4.783576639276257,
"tokens_seen": 98304000
},
{
"epoch": 0.04,
"learning_rate": 0.00048673633440514467,
"loss": 3.5334,
"theoretical_loss": 4.761874140772408,
"tokens_seen": 101580800
},
{
"epoch": 0.04,
"learning_rate": 0.0004861452619633062,
"loss": 3.5329,
"theoretical_loss": 4.741049741962473,
"tokens_seen": 104857600
},
{
"epoch": 0.04,
"learning_rate": 0.0004855541895214677,
"loss": 3.5329,
"theoretical_loss": 4.721042093249051,
"tokens_seen": 108134400
},
{
"epoch": 0.04,
"learning_rate": 0.0004849631170796293,
"loss": 3.4926,
"theoretical_loss": 4.701795828231866,
"tokens_seen": 111411200
},
{
"epoch": 0.04,
"learning_rate": 0.0004843720446377908,
"loss": 3.4835,
"theoretical_loss": 4.68326082423593,
"tokens_seen": 114688000
},
{
"epoch": 0.04,
"learning_rate": 0.00048378097219595233,
"loss": 3.4587,
"theoretical_loss": 4.665391572426282,
"tokens_seen": 117964800
},
{
"epoch": 0.04,
"learning_rate": 0.00048318989975411385,
"loss": 3.4307,
"theoretical_loss": 4.648146638719739,
"tokens_seen": 121241600
},
{
"epoch": 0.04,
"learning_rate": 0.00048259882731227537,
"loss": 3.443,
"theoretical_loss": 4.631488200339643,
"tokens_seen": 124518400
},
{
"epoch": 0.05,
"learning_rate": 0.0004820077548704369,
"loss": 3.45,
"theoretical_loss": 4.615381645715717,
"tokens_seen": 127795200
},
{
"epoch": 0.05,
"learning_rate": 0.00048141668242859847,
"loss": 3.3857,
"theoretical_loss": 4.599795227690505,
"tokens_seen": 131072000
},
{
"epoch": 0.05,
"learning_rate": 0.00048082560998676,
"loss": 3.4065,
"theoretical_loss": 4.584699761792674,
"tokens_seen": 134348800
},
{
"epoch": 0.05,
"learning_rate": 0.0004802345375449215,
"loss": 3.3659,
"theoretical_loss": 4.570068362778516,
"tokens_seen": 137625600
},
{
"epoch": 0.05,
"learning_rate": 0.00047964346510308303,
"loss": 3.3781,
"theoretical_loss": 4.555876213804037,
"tokens_seen": 140902400
},
{
"epoch": 0.05,
"learning_rate": 0.00047905239266124455,
"loss": 3.3612,
"theoretical_loss": 4.542100363530799,
"tokens_seen": 144179200
},
{
"epoch": 0.05,
"learning_rate": 0.00047846132021940607,
"loss": 3.3402,
"theoretical_loss": 4.528719547234816,
"tokens_seen": 147456000
},
{
"epoch": 0.05,
"learning_rate": 0.0004778702477775676,
"loss": 3.2899,
"theoretical_loss": 4.515714028614996,
"tokens_seen": 150732800
},
{
"epoch": 0.06,
"learning_rate": 0.00047727917533572917,
"loss": 3.3099,
"theoretical_loss": 4.503065459513339,
"tokens_seen": 154009600
},
{
"epoch": 0.06,
"learning_rate": 0.0004766881028938907,
"loss": 3.3162,
"theoretical_loss": 4.4907567551852665,
"tokens_seen": 157286400
},
{
"epoch": 0.06,
"learning_rate": 0.0004760970304520522,
"loss": 3.3036,
"theoretical_loss": 4.478771983111967,
"tokens_seen": 160563200
},
{
"epoch": 0.06,
"objective/train/docs_used": 104000,
"objective/train/instantaneous_batch_size": 32,
"objective/train/instantaneous_microbatch_size": 32768,
"objective/train/original_loss": 3.2747654914855957,
"objective/train/theoretical_loss": 4.467096263641219,
"objective/train/tokens_used": 184300000,
"theoretical_loss": 4.467096263641219,
"tokens_seen": 163840000
},
{
"epoch": 0.06,
"learning_rate": 0.00047550595801021373,
"loss": 3.2837,
"theoretical_loss": 4.467096263641219,
"tokens_seen": 163840000
},
{
"epoch": 0.06,
"learning_rate": 0.00047491488556837525,
"loss": 3.2955,
"theoretical_loss": 4.455715680989545,
"tokens_seen": 167116800
},
{
"epoch": 0.06,
"learning_rate": 0.00047432381312653677,
"loss": 3.2608,
"theoretical_loss": 4.44461720334543,
"tokens_seen": 170393600
},
{
"epoch": 0.06,
"learning_rate": 0.00047373274068469835,
"loss": 3.2026,
"theoretical_loss": 4.433788610987646,
"tokens_seen": 173670400
},
{
"epoch": 0.06,
"learning_rate": 0.00047314166824285987,
"loss": 3.2621,
"theoretical_loss": 4.42321843148016,
"tokens_seen": 176947200
},
{
"epoch": 0.06,
"learning_rate": 0.0004725505958010214,
"loss": 3.1999,
"theoretical_loss": 4.412895881130142,
"tokens_seen": 180224000
},
{
"epoch": 0.07,
"learning_rate": 0.0004719595233591829,
"loss": 3.2384,
"theoretical_loss": 4.4028108120020795,
"tokens_seen": 183500800
},
{
"epoch": 0.07,
"learning_rate": 0.00047136845091734443,
"loss": 3.244,
"theoretical_loss": 4.392953663871862,
"tokens_seen": 186777600
},
{
"epoch": 0.07,
"learning_rate": 0.00047077737847550595,
"loss": 3.2329,
"theoretical_loss": 4.383315420582533,
"tokens_seen": 190054400
},
{
"epoch": 0.07,
"learning_rate": 0.0004701863060336675,
"loss": 3.2634,
"theoretical_loss": 4.373887570330275,
"tokens_seen": 193331200
},
{
"epoch": 0.07,
"learning_rate": 0.00046959523359182905,
"loss": 3.2143,
"theoretical_loss": 4.364662069466704,
"tokens_seen": 196608000
},
{
"epoch": 0.07,
"learning_rate": 0.00046900416114999057,
"loss": 3.2128,
"theoretical_loss": 4.355631309453283,
"tokens_seen": 199884800
},
{
"epoch": 0.07,
"learning_rate": 0.0004684130887081521,
"loss": 3.1675,
"theoretical_loss": 4.346788086646671,
"tokens_seen": 203161600
},
{
"epoch": 0.07,
"learning_rate": 0.0004678220162663136,
"loss": 3.1967,
"theoretical_loss": 4.33812557463116,
"tokens_seen": 206438400
},
{
"epoch": 0.07,
"learning_rate": 0.00046723094382447513,
"loss": 3.2042,
"theoretical_loss": 4.329637298846812,
"tokens_seen": 209715200
},
{
"epoch": 0.08,
"learning_rate": 0.00046663987138263665,
"loss": 3.1574,
"theoretical_loss": 4.321317113290252,
"tokens_seen": 212992000
},
{
"epoch": 0.08,
"learning_rate": 0.0004660487989407982,
"loss": 3.1317,
"theoretical_loss": 4.3131591790897925,
"tokens_seen": 216268800
},
{
"epoch": 0.08,
"learning_rate": 0.00046545772649895975,
"loss": 3.1829,
"theoretical_loss": 4.305157944778228,
"tokens_seen": 219545600
},
{
"epoch": 0.08,
"learning_rate": 0.00046486665405712127,
"loss": 3.2073,
"theoretical_loss": 4.297308128105687,
"tokens_seen": 222822400
},
{
"epoch": 0.08,
"learning_rate": 0.0004642755816152828,
"loss": 3.1994,
"theoretical_loss": 4.2896046992515995,
"tokens_seen": 226099200
},
{
"epoch": 0.08,
"learning_rate": 0.0004636845091734443,
"loss": 3.2312,
"theoretical_loss": 4.282042865309616,
"tokens_seen": 229376000
},
{
"epoch": 0.08,
"learning_rate": 0.00046309343673160583,
"loss": 3.2006,
"theoretical_loss": 4.274618055932298,
"tokens_seen": 232652800
},
{
"epoch": 0.08,
"learning_rate": 0.0004625023642897674,
"loss": 3.1756,
"theoretical_loss": 4.267325910033897,
"tokens_seen": 235929600
},
{
"epoch": 0.09,
"learning_rate": 0.0004619112918479289,
"loss": 3.1158,
"theoretical_loss": 4.260162263459744,
"tokens_seen": 239206400
},
{
"epoch": 0.09,
"learning_rate": 0.00046132021940609044,
"loss": 3.1448,
"theoretical_loss": 4.253123137539814,
"tokens_seen": 242483200
},
{
"epoch": 0.09,
"learning_rate": 0.00046072914696425197,
"loss": 3.166,
"theoretical_loss": 4.246204728452055,
"tokens_seen": 245760000
},
{
"epoch": 0.09,
"learning_rate": 0.0004601380745224135,
"loss": 3.1762,
"theoretical_loss": 4.239403397328261,
"tokens_seen": 249036800
},
{
"epoch": 0.09,
"learning_rate": 0.000459547002080575,
"loss": 3.1442,
"theoretical_loss": 4.232715661041632,
"tokens_seen": 252313600
},
{
"epoch": 0.09,
"learning_rate": 0.0004589559296387366,
"loss": 3.1504,
"theoretical_loss": 4.226138183620867,
"tokens_seen": 255590400
},
{
"epoch": 0.09,
"learning_rate": 0.0004583648571968981,
"loss": 3.1099,
"theoretical_loss": 4.219667768240775,
"tokens_seen": 258867200
},
{
"epoch": 0.09,
"learning_rate": 0.0004577737847550596,
"loss": 3.114,
"theoretical_loss": 4.213301349743924,
"tokens_seen": 262144000
},
{
"epoch": 0.09,
"learning_rate": 0.0004571827123132211,
"loss": 3.0703,
"theoretical_loss": 4.20703598765197,
"tokens_seen": 265420800
},
{
"epoch": 0.1,
"learning_rate": 0.0004565916398713826,
"loss": 3.0383,
"theoretical_loss": 4.2008688596290025,
"tokens_seen": 268697600
},
{
"epoch": 0.1,
"learning_rate": 0.00045600056742954413,
"loss": 2.9881,
"theoretical_loss": 4.194797255362549,
"tokens_seen": 271974400
},
{
"epoch": 0.1,
"learning_rate": 0.0004554094949877057,
"loss": 3.068,
"theoretical_loss": 4.188818570830883,
"tokens_seen": 275251200
},
{
"epoch": 0.1,
"learning_rate": 0.00045481842254586723,
"loss": 3.0823,
"theoretical_loss": 4.182930302927963,
"tokens_seen": 278528000
}
],
"max_steps": 42724,
"num_train_epochs": 9223372036854775807,
"total_flos": 1.42878663770112e+17,
"trial_name": null,
"trial_params": null
}