{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6310143555765894, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.7190074920654297, "learning_rate": 4.9995083170283816e-05, "loss": 2.9245, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.431870222091675, "learning_rate": 4.998033461515242e-05, "loss": 2.0053, "step": 20 }, { "epoch": 0.02, "grad_norm": 2.3315682411193848, "learning_rate": 4.9955760135896534e-05, "loss": 1.888, "step": 30 }, { "epoch": 0.03, "grad_norm": 3.2937276363372803, "learning_rate": 4.992136939879856e-05, "loss": 1.8447, "step": 40 }, { "epoch": 0.03, "grad_norm": 2.7375714778900146, "learning_rate": 4.9877175931330346e-05, "loss": 1.8212, "step": 50 }, { "epoch": 0.04, "grad_norm": 2.15061092376709, "learning_rate": 4.982319711683221e-05, "loss": 1.793, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.0427424907684326, "learning_rate": 4.975945418767529e-05, "loss": 1.756, "step": 70 }, { "epoch": 0.05, "grad_norm": 2.107785224914551, "learning_rate": 4.968597221690986e-05, "loss": 1.7285, "step": 80 }, { "epoch": 0.06, "grad_norm": 2.100552558898926, "learning_rate": 4.96027801084029e-05, "loss": 1.7297, "step": 90 }, { "epoch": 0.06, "grad_norm": 2.2227377891540527, "learning_rate": 4.950991058546893e-05, "loss": 1.7602, "step": 100 }, { "epoch": 0.07, "grad_norm": 1.535144567489624, "learning_rate": 4.940740017799833e-05, "loss": 1.7433, "step": 110 }, { "epoch": 0.08, "grad_norm": 1.6522979736328125, "learning_rate": 4.929528920808854e-05, "loss": 1.7363, "step": 120 }, { "epoch": 0.08, "grad_norm": 2.8091869354248047, "learning_rate": 4.917362177418342e-05, "loss": 1.6872, "step": 130 }, { "epoch": 0.09, "grad_norm": 2.1017510890960693, "learning_rate": 4.904244573372733e-05, "loss": 1.7084, "step": 140 }, { "epoch": 0.09, "grad_norm": 1.6424258947372437, "learning_rate": 4.8901812684340564e-05, "loss": 1.6997, "step": 150 }, { "epoch": 0.1, "grad_norm": 1.4547488689422607, "learning_rate": 4.8751777943523634e-05, "loss": 1.6747, "step": 160 }, { "epoch": 0.11, "grad_norm": 1.6251146793365479, "learning_rate": 4.8592400526898314e-05, "loss": 1.6836, "step": 170 }, { "epoch": 0.11, "grad_norm": 2.098386526107788, "learning_rate": 4.842374312499405e-05, "loss": 1.6552, "step": 180 }, { "epoch": 0.12, "grad_norm": 2.2387640476226807, "learning_rate": 4.824587207858888e-05, "loss": 1.6489, "step": 190 }, { "epoch": 0.13, "grad_norm": 1.7299611568450928, "learning_rate": 4.805885735261454e-05, "loss": 1.6576, "step": 200 }, { "epoch": 0.13, "grad_norm": 1.5701665878295898, "learning_rate": 4.786277250863599e-05, "loss": 1.6533, "step": 210 }, { "epoch": 0.14, "grad_norm": 2.417296886444092, "learning_rate": 4.765769467591625e-05, "loss": 1.6356, "step": 220 }, { "epoch": 0.15, "grad_norm": 1.2636029720306396, "learning_rate": 4.744370452107789e-05, "loss": 1.6389, "step": 230 }, { "epoch": 0.15, "grad_norm": 1.576324224472046, "learning_rate": 4.722088621637309e-05, "loss": 1.6546, "step": 240 }, { "epoch": 0.16, "grad_norm": 1.9720542430877686, "learning_rate": 4.698932740657479e-05, "loss": 1.6354, "step": 250 }, { "epoch": 0.16, "grad_norm": 1.5250279903411865, "learning_rate": 4.6749119174501975e-05, "loss": 1.6342, "step": 260 }, { "epoch": 0.17, "grad_norm": 2.4737966060638428, "learning_rate": 4.6500356005192514e-05, "loss": 1.6407, "step": 270 }, { "epoch": 0.18, "grad_norm": 1.2792372703552246, "learning_rate": 
4.6243135748737864e-05, "loss": 1.6339, "step": 280 }, { "epoch": 0.18, "grad_norm": 1.5593037605285645, "learning_rate": 4.597755958179406e-05, "loss": 1.6095, "step": 290 }, { "epoch": 0.19, "grad_norm": 1.3141404390335083, "learning_rate": 4.570373196778427e-05, "loss": 1.6036, "step": 300 }, { "epoch": 0.2, "grad_norm": 1.2617065906524658, "learning_rate": 4.5421760615808474e-05, "loss": 1.6244, "step": 310 }, { "epoch": 0.2, "grad_norm": 1.64117431640625, "learning_rate": 4.513175643827647e-05, "loss": 1.6449, "step": 320 }, { "epoch": 0.21, "grad_norm": 1.7132749557495117, "learning_rate": 4.4833833507280884e-05, "loss": 1.5948, "step": 330 }, { "epoch": 0.21, "grad_norm": 2.1323654651641846, "learning_rate": 4.4528109009727336e-05, "loss": 1.627, "step": 340 }, { "epoch": 0.22, "grad_norm": 2.253115653991699, "learning_rate": 4.42147032012394e-05, "loss": 1.6151, "step": 350 }, { "epoch": 0.23, "grad_norm": 1.6143097877502441, "learning_rate": 4.389373935885646e-05, "loss": 1.5838, "step": 360 }, { "epoch": 0.23, "grad_norm": 1.3353707790374756, "learning_rate": 4.356534373254316e-05, "loss": 1.5935, "step": 370 }, { "epoch": 0.24, "grad_norm": 1.283742904663086, "learning_rate": 4.322964549552943e-05, "loss": 1.6015, "step": 380 }, { "epoch": 0.25, "grad_norm": 1.437249779701233, "learning_rate": 4.288677669350066e-05, "loss": 1.577, "step": 390 }, { "epoch": 0.25, "grad_norm": 1.5190638303756714, "learning_rate": 4.2536872192658036e-05, "loss": 1.5843, "step": 400 }, { "epoch": 0.26, "grad_norm": 2.1320886611938477, "learning_rate": 4.218006962666934e-05, "loss": 1.6145, "step": 410 }, { "epoch": 0.27, "grad_norm": 1.0696591138839722, "learning_rate": 4.181650934253132e-05, "loss": 1.5601, "step": 420 }, { "epoch": 0.27, "grad_norm": 1.3149545192718506, "learning_rate": 4.144633434536467e-05, "loss": 1.5664, "step": 430 }, { "epoch": 0.28, "grad_norm": 1.3661577701568604, "learning_rate": 4.1069690242163484e-05, "loss": 1.6002, "step": 440 }, { "epoch": 0.28, "grad_norm": 1.6984481811523438, "learning_rate": 4.06867251845213e-05, "loss": 1.576, "step": 450 }, { "epoch": 0.29, "grad_norm": 1.2728784084320068, "learning_rate": 4.0297589810356165e-05, "loss": 1.5448, "step": 460 }, { "epoch": 0.3, "grad_norm": 1.4147616624832153, "learning_rate": 3.9902437184657784e-05, "loss": 1.5595, "step": 470 }, { "epoch": 0.3, "grad_norm": 1.2289011478424072, "learning_rate": 3.9501422739279956e-05, "loss": 1.5628, "step": 480 }, { "epoch": 0.31, "grad_norm": 1.5690233707427979, "learning_rate": 3.909470421180201e-05, "loss": 1.5731, "step": 490 }, { "epoch": 0.32, "grad_norm": 1.4935098886489868, "learning_rate": 3.8682441583483314e-05, "loss": 1.545, "step": 500 }, { "epoch": 0.32, "grad_norm": 1.2939772605895996, "learning_rate": 3.8264797016335205e-05, "loss": 1.5793, "step": 510 }, { "epoch": 0.33, "grad_norm": 1.2150651216506958, "learning_rate": 3.7841934789335164e-05, "loss": 1.5378, "step": 520 }, { "epoch": 0.33, "grad_norm": 1.2153139114379883, "learning_rate": 3.741402123380828e-05, "loss": 1.5345, "step": 530 }, { "epoch": 0.34, "grad_norm": 1.290591835975647, "learning_rate": 3.6981224668001424e-05, "loss": 1.5517, "step": 540 }, { "epoch": 0.35, "grad_norm": 1.1924967765808105, "learning_rate": 3.654371533087586e-05, "loss": 1.5472, "step": 550 }, { "epoch": 0.35, "grad_norm": 1.6345056295394897, "learning_rate": 3.610166531514436e-05, "loss": 1.5564, "step": 560 }, { "epoch": 0.36, "grad_norm": 2.185119867324829, "learning_rate": 3.565524849957921e-05, "loss": 1.5574, "step": 
570 }, { "epoch": 0.37, "grad_norm": 1.3646321296691895, "learning_rate": 3.520464048061758e-05, "loss": 1.5584, "step": 580 }, { "epoch": 0.37, "grad_norm": 1.2333228588104248, "learning_rate": 3.47500185032913e-05, "loss": 1.518, "step": 590 }, { "epoch": 0.38, "grad_norm": 1.3945318460464478, "learning_rate": 3.4291561391508185e-05, "loss": 1.5339, "step": 600 }, { "epoch": 0.38, "grad_norm": 1.304306149482727, "learning_rate": 3.3829449477712324e-05, "loss": 1.5339, "step": 610 }, { "epoch": 0.39, "grad_norm": 1.6393932104110718, "learning_rate": 3.336386453195088e-05, "loss": 1.5399, "step": 620 }, { "epoch": 0.4, "grad_norm": 1.2000635862350464, "learning_rate": 3.2894989690375626e-05, "loss": 1.5233, "step": 630 }, { "epoch": 0.4, "grad_norm": 1.1479601860046387, "learning_rate": 3.2423009383206876e-05, "loss": 1.538, "step": 640 }, { "epoch": 0.41, "grad_norm": 1.1483389139175415, "learning_rate": 3.194810926218861e-05, "loss": 1.528, "step": 650 }, { "epoch": 0.42, "grad_norm": 1.2403253316879272, "learning_rate": 3.147047612756302e-05, "loss": 1.5307, "step": 660 }, { "epoch": 0.42, "grad_norm": 1.3997712135314941, "learning_rate": 3.099029785459328e-05, "loss": 1.4915, "step": 670 }, { "epoch": 0.43, "grad_norm": 1.2010352611541748, "learning_rate": 3.0507763319663517e-05, "loss": 1.5268, "step": 680 }, { "epoch": 0.44, "grad_norm": 1.0670932531356812, "learning_rate": 3.002306232598497e-05, "loss": 1.5273, "step": 690 }, { "epoch": 0.44, "grad_norm": 1.2283655405044556, "learning_rate": 2.9536385528937567e-05, "loss": 1.5273, "step": 700 }, { "epoch": 0.45, "grad_norm": 1.1306476593017578, "learning_rate": 2.9047924361076345e-05, "loss": 1.5072, "step": 710 }, { "epoch": 0.45, "grad_norm": 1.1699943542480469, "learning_rate": 2.8557870956832132e-05, "loss": 1.4856, "step": 720 }, { "epoch": 0.46, "grad_norm": 1.2550854682922363, "learning_rate": 2.8066418076936167e-05, "loss": 1.4983, "step": 730 }, { "epoch": 0.47, "grad_norm": 1.0610970258712769, "learning_rate": 2.7573759032598366e-05, "loss": 1.5518, "step": 740 }, { "epoch": 0.47, "grad_norm": 1.1754754781723022, "learning_rate": 2.7080087609469062e-05, "loss": 1.4998, "step": 750 }, { "epoch": 0.48, "grad_norm": 1.1955766677856445, "learning_rate": 2.6585597991414114e-05, "loss": 1.5109, "step": 760 }, { "epoch": 0.49, "grad_norm": 1.0891656875610352, "learning_rate": 2.6090484684133404e-05, "loss": 1.5007, "step": 770 }, { "epoch": 0.49, "grad_norm": 1.0880335569381714, "learning_rate": 2.5594942438652688e-05, "loss": 1.5049, "step": 780 }, { "epoch": 0.5, "grad_norm": 1.345954418182373, "learning_rate": 2.509916617471903e-05, "loss": 1.5154, "step": 790 }, { "epoch": 0.5, "grad_norm": 1.1668224334716797, "learning_rate": 2.46033509041298e-05, "loss": 1.4883, "step": 800 }, { "epoch": 0.51, "grad_norm": 1.055127501487732, "learning_rate": 2.410769165402549e-05, "loss": 1.5053, "step": 810 }, { "epoch": 0.52, "grad_norm": 1.0528500080108643, "learning_rate": 2.3612383390176503e-05, "loss": 1.4871, "step": 820 }, { "epoch": 0.52, "grad_norm": 1.328258991241455, "learning_rate": 2.3117620940294048e-05, "loss": 1.5037, "step": 830 }, { "epoch": 0.53, "grad_norm": 1.0326772928237915, "learning_rate": 2.2623598917395438e-05, "loss": 1.4525, "step": 840 }, { "epoch": 0.54, "grad_norm": 3.057058811187744, "learning_rate": 2.213051164325366e-05, "loss": 1.4898, "step": 850 }, { "epoch": 0.54, "grad_norm": 1.1190940141677856, "learning_rate": 2.1638553071961708e-05, "loss": 1.488, "step": 860 }, { "epoch": 0.55, "grad_norm": 
1.1501041650772095, "learning_rate": 2.1147916713641367e-05, "loss": 1.4711, "step": 870 }, { "epoch": 0.56, "grad_norm": 1.090022325515747, "learning_rate": 2.0658795558326743e-05, "loss": 1.488, "step": 880 }, { "epoch": 0.56, "grad_norm": 1.0642565488815308, "learning_rate": 2.017138200005236e-05, "loss": 1.4791, "step": 890 }, { "epoch": 0.57, "grad_norm": 1.3562296628952026, "learning_rate": 1.9685867761175584e-05, "loss": 1.4956, "step": 900 }, { "epoch": 0.57, "grad_norm": 1.2069261074066162, "learning_rate": 1.9202443816963425e-05, "loss": 1.4918, "step": 910 }, { "epoch": 0.58, "grad_norm": 1.3227437734603882, "learning_rate": 1.872130032047302e-05, "loss": 1.4577, "step": 920 }, { "epoch": 0.59, "grad_norm": 1.0784181356430054, "learning_rate": 1.824262652775568e-05, "loss": 1.4888, "step": 930 }, { "epoch": 0.59, "grad_norm": 1.000135898590088, "learning_rate": 1.7766610723413684e-05, "loss": 1.4673, "step": 940 }, { "epoch": 0.6, "grad_norm": 1.136026382446289, "learning_rate": 1.7293440146539196e-05, "loss": 1.4779, "step": 950 }, { "epoch": 0.61, "grad_norm": 1.123252272605896, "learning_rate": 1.682330091706446e-05, "loss": 1.4583, "step": 960 }, { "epoch": 0.61, "grad_norm": 1.0559343099594116, "learning_rate": 1.6356377962552238e-05, "loss": 1.4471, "step": 970 }, { "epoch": 0.62, "grad_norm": 1.0266658067703247, "learning_rate": 1.589285494545514e-05, "loss": 1.4632, "step": 980 }, { "epoch": 0.62, "grad_norm": 1.1371444463729858, "learning_rate": 1.5432914190872757e-05, "loss": 1.4732, "step": 990 }, { "epoch": 0.63, "grad_norm": 1.1203784942626953, "learning_rate": 1.4976736614834664e-05, "loss": 1.452, "step": 1000 } ], "logging_steps": 10, "max_steps": 1584, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 7.003073868034212e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }
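The fields above match what the Hugging Face Transformers Trainer writes to trainer_state.json alongside a checkpoint: log_history records loss, learning_rate, and grad_norm every logging_steps (10) optimizer steps, and this state was saved at global_step 1000 of max_steps 1584. Below is a minimal sketch, assuming the file lives at checkpoint-1000/trainer_state.json under your output directory (the path is illustrative), that loads the log and plots the loss and learning-rate curves:

import json

import matplotlib.pyplot as plt

# Path is an assumption; point it at your own checkpoint directory.
with open("checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

# Keep only training-log entries (eval entries, if any, lack a "loss" key).
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()

Plotting both makes it easy to confirm that the loss is still trending down at the save point (from 2.9245 at step 10 to 1.452 at step 1000) and that the schedule has decayed the learning rate to roughly 1.5e-05.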