{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6310143555765894,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 3.7190074920654297,
      "learning_rate": 4.9995083170283816e-05,
      "loss": 2.9245,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.431870222091675,
      "learning_rate": 4.998033461515242e-05,
      "loss": 2.0053,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.3315682411193848,
      "learning_rate": 4.9955760135896534e-05,
      "loss": 1.888,
      "step": 30
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.2937276363372803,
      "learning_rate": 4.992136939879856e-05,
      "loss": 1.8447,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.7375714778900146,
      "learning_rate": 4.9877175931330346e-05,
      "loss": 1.8212,
      "step": 50
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.15061092376709,
      "learning_rate": 4.982319711683221e-05,
      "loss": 1.793,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.0427424907684326,
      "learning_rate": 4.975945418767529e-05,
      "loss": 1.756,
      "step": 70
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.107785224914551,
      "learning_rate": 4.968597221690986e-05,
      "loss": 1.7285,
      "step": 80
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.100552558898926,
      "learning_rate": 4.96027801084029e-05,
      "loss": 1.7297,
      "step": 90
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.2227377891540527,
      "learning_rate": 4.950991058546893e-05,
      "loss": 1.7602,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.535144567489624,
      "learning_rate": 4.940740017799833e-05,
      "loss": 1.7433,
      "step": 110
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.6522979736328125,
      "learning_rate": 4.929528920808854e-05,
      "loss": 1.7363,
      "step": 120
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.8091869354248047,
      "learning_rate": 4.917362177418342e-05,
      "loss": 1.6872,
      "step": 130
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.1017510890960693,
      "learning_rate": 4.904244573372733e-05,
      "loss": 1.7084,
      "step": 140
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.6424258947372437,
      "learning_rate": 4.8901812684340564e-05,
      "loss": 1.6997,
      "step": 150
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.4547488689422607,
      "learning_rate": 4.8751777943523634e-05,
      "loss": 1.6747,
      "step": 160
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.6251146793365479,
      "learning_rate": 4.8592400526898314e-05,
      "loss": 1.6836,
      "step": 170
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.098386526107788,
      "learning_rate": 4.842374312499405e-05,
      "loss": 1.6552,
      "step": 180
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.2387640476226807,
      "learning_rate": 4.824587207858888e-05,
      "loss": 1.6489,
      "step": 190
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7299611568450928,
      "learning_rate": 4.805885735261454e-05,
      "loss": 1.6576,
      "step": 200
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.5701665878295898,
      "learning_rate": 4.786277250863599e-05,
      "loss": 1.6533,
      "step": 210
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.417296886444092,
      "learning_rate": 4.765769467591625e-05,
      "loss": 1.6356,
      "step": 220
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.2636029720306396,
      "learning_rate": 4.744370452107789e-05,
      "loss": 1.6389,
      "step": 230
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.576324224472046,
      "learning_rate": 4.722088621637309e-05,
      "loss": 1.6546,
      "step": 240
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9720542430877686,
      "learning_rate": 4.698932740657479e-05,
      "loss": 1.6354,
      "step": 250
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.5250279903411865,
      "learning_rate": 4.6749119174501975e-05,
      "loss": 1.6342,
      "step": 260
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.4737966060638428,
      "learning_rate": 4.6500356005192514e-05,
      "loss": 1.6407,
      "step": 270
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.2792372703552246,
      "learning_rate": 4.6243135748737864e-05,
      "loss": 1.6339,
      "step": 280
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.5593037605285645,
      "learning_rate": 4.597755958179406e-05,
      "loss": 1.6095,
      "step": 290
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3141404390335083,
      "learning_rate": 4.570373196778427e-05,
      "loss": 1.6036,
      "step": 300
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.2617065906524658,
      "learning_rate": 4.5421760615808474e-05,
      "loss": 1.6244,
      "step": 310
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.64117431640625,
      "learning_rate": 4.513175643827647e-05,
      "loss": 1.6449,
      "step": 320
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.7132749557495117,
      "learning_rate": 4.4833833507280884e-05,
      "loss": 1.5948,
      "step": 330
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.1323654651641846,
      "learning_rate": 4.4528109009727336e-05,
      "loss": 1.627,
      "step": 340
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.253115653991699,
      "learning_rate": 4.42147032012394e-05,
      "loss": 1.6151,
      "step": 350
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.6143097877502441,
      "learning_rate": 4.389373935885646e-05,
      "loss": 1.5838,
      "step": 360
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3353707790374756,
      "learning_rate": 4.356534373254316e-05,
      "loss": 1.5935,
      "step": 370
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.283742904663086,
      "learning_rate": 4.322964549552943e-05,
      "loss": 1.6015,
      "step": 380
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.437249779701233,
      "learning_rate": 4.288677669350066e-05,
      "loss": 1.577,
      "step": 390
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.5190638303756714,
      "learning_rate": 4.2536872192658036e-05,
      "loss": 1.5843,
      "step": 400
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.1320886611938477,
      "learning_rate": 4.218006962666934e-05,
      "loss": 1.6145,
      "step": 410
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.0696591138839722,
      "learning_rate": 4.181650934253132e-05,
      "loss": 1.5601,
      "step": 420
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.3149545192718506,
      "learning_rate": 4.144633434536467e-05,
      "loss": 1.5664,
      "step": 430
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.3661577701568604,
      "learning_rate": 4.1069690242163484e-05,
      "loss": 1.6002,
      "step": 440
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6984481811523438,
      "learning_rate": 4.06867251845213e-05,
      "loss": 1.576,
      "step": 450
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.2728784084320068,
      "learning_rate": 4.0297589810356165e-05,
      "loss": 1.5448,
      "step": 460
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4147616624832153,
      "learning_rate": 3.9902437184657784e-05,
      "loss": 1.5595,
      "step": 470
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.2289011478424072,
      "learning_rate": 3.9501422739279956e-05,
      "loss": 1.5628,
      "step": 480
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.5690233707427979,
      "learning_rate": 3.909470421180201e-05,
      "loss": 1.5731,
      "step": 490
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.4935098886489868,
      "learning_rate": 3.8682441583483314e-05,
      "loss": 1.545,
      "step": 500
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.2939772605895996,
      "learning_rate": 3.8264797016335205e-05,
      "loss": 1.5793,
      "step": 510
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2150651216506958,
      "learning_rate": 3.7841934789335164e-05,
      "loss": 1.5378,
      "step": 520
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2153139114379883,
      "learning_rate": 3.741402123380828e-05,
      "loss": 1.5345,
      "step": 530
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.290591835975647,
      "learning_rate": 3.6981224668001424e-05,
      "loss": 1.5517,
      "step": 540
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1924967765808105,
      "learning_rate": 3.654371533087586e-05,
      "loss": 1.5472,
      "step": 550
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6345056295394897,
      "learning_rate": 3.610166531514436e-05,
      "loss": 1.5564,
      "step": 560
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.185119867324829,
      "learning_rate": 3.565524849957921e-05,
      "loss": 1.5574,
      "step": 570
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.3646321296691895,
      "learning_rate": 3.520464048061758e-05,
      "loss": 1.5584,
      "step": 580
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.2333228588104248,
      "learning_rate": 3.47500185032913e-05,
      "loss": 1.518,
      "step": 590
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3945318460464478,
      "learning_rate": 3.4291561391508185e-05,
      "loss": 1.5339,
      "step": 600
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.304306149482727,
      "learning_rate": 3.3829449477712324e-05,
      "loss": 1.5339,
      "step": 610
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.6393932104110718,
      "learning_rate": 3.336386453195088e-05,
      "loss": 1.5399,
      "step": 620
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.2000635862350464,
      "learning_rate": 3.2894989690375626e-05,
      "loss": 1.5233,
      "step": 630
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.1479601860046387,
      "learning_rate": 3.2423009383206876e-05,
      "loss": 1.538,
      "step": 640
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.1483389139175415,
      "learning_rate": 3.194810926218861e-05,
      "loss": 1.528,
      "step": 650
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2403253316879272,
      "learning_rate": 3.147047612756302e-05,
      "loss": 1.5307,
      "step": 660
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.3997712135314941,
      "learning_rate": 3.099029785459328e-05,
      "loss": 1.4915,
      "step": 670
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.2010352611541748,
      "learning_rate": 3.0507763319663517e-05,
      "loss": 1.5268,
      "step": 680
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.0670932531356812,
      "learning_rate": 3.002306232598497e-05,
      "loss": 1.5273,
      "step": 690
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.2283655405044556,
      "learning_rate": 2.9536385528937567e-05,
      "loss": 1.5273,
      "step": 700
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1306476593017578,
      "learning_rate": 2.9047924361076345e-05,
      "loss": 1.5072,
      "step": 710
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1699943542480469,
      "learning_rate": 2.8557870956832132e-05,
      "loss": 1.4856,
      "step": 720
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.2550854682922363,
      "learning_rate": 2.8066418076936167e-05,
      "loss": 1.4983,
      "step": 730
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0610970258712769,
      "learning_rate": 2.7573759032598366e-05,
      "loss": 1.5518,
      "step": 740
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.1754754781723022,
      "learning_rate": 2.7080087609469062e-05,
      "loss": 1.4998,
      "step": 750
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.1955766677856445,
      "learning_rate": 2.6585597991414114e-05,
      "loss": 1.5109,
      "step": 760
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0891656875610352,
      "learning_rate": 2.6090484684133404e-05,
      "loss": 1.5007,
      "step": 770
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0880335569381714,
      "learning_rate": 2.5594942438652688e-05,
      "loss": 1.5049,
      "step": 780
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.345954418182373,
      "learning_rate": 2.509916617471903e-05,
      "loss": 1.5154,
      "step": 790
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.1668224334716797,
      "learning_rate": 2.46033509041298e-05,
      "loss": 1.4883,
      "step": 800
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.055127501487732,
      "learning_rate": 2.410769165402549e-05,
      "loss": 1.5053,
      "step": 810
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0528500080108643,
      "learning_rate": 2.3612383390176503e-05,
      "loss": 1.4871,
      "step": 820
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.328258991241455,
      "learning_rate": 2.3117620940294048e-05,
      "loss": 1.5037,
      "step": 830
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.0326772928237915,
      "learning_rate": 2.2623598917395438e-05,
      "loss": 1.4525,
      "step": 840
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.057058811187744,
      "learning_rate": 2.213051164325366e-05,
      "loss": 1.4898,
      "step": 850
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1190940141677856,
      "learning_rate": 2.1638553071961708e-05,
      "loss": 1.488,
      "step": 860
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.1501041650772095,
      "learning_rate": 2.1147916713641367e-05,
      "loss": 1.4711,
      "step": 870
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.090022325515747,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 1.488,
      "step": 880
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.0642565488815308,
      "learning_rate": 2.017138200005236e-05,
      "loss": 1.4791,
      "step": 890
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.3562296628952026,
      "learning_rate": 1.9685867761175584e-05,
      "loss": 1.4956,
      "step": 900
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.2069261074066162,
      "learning_rate": 1.9202443816963425e-05,
      "loss": 1.4918,
      "step": 910
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3227437734603882,
      "learning_rate": 1.872130032047302e-05,
      "loss": 1.4577,
      "step": 920
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0784181356430054,
      "learning_rate": 1.824262652775568e-05,
      "loss": 1.4888,
      "step": 930
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.000135898590088,
      "learning_rate": 1.7766610723413684e-05,
      "loss": 1.4673,
      "step": 940
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.136026382446289,
      "learning_rate": 1.7293440146539196e-05,
      "loss": 1.4779,
      "step": 950
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.123252272605896,
      "learning_rate": 1.682330091706446e-05,
      "loss": 1.4583,
      "step": 960
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.0559343099594116,
      "learning_rate": 1.6356377962552238e-05,
      "loss": 1.4471,
      "step": 970
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0266658067703247,
      "learning_rate": 1.589285494545514e-05,
      "loss": 1.4632,
      "step": 980
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.1371444463729858,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 1.4732,
      "step": 990
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.1203784942626953,
      "learning_rate": 1.4976736614834664e-05,
      "loss": 1.452,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1584,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "total_flos": 7.003073868034212e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}