{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995267392333176, "eval_steps": 500, "global_step": 1584, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.7190074920654297, "learning_rate": 4.9995083170283816e-05, "loss": 2.9245, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.431870222091675, "learning_rate": 4.998033461515242e-05, "loss": 2.0053, "step": 20 }, { "epoch": 0.02, "grad_norm": 2.3315682411193848, "learning_rate": 4.9955760135896534e-05, "loss": 1.888, "step": 30 }, { "epoch": 0.03, "grad_norm": 3.2937276363372803, "learning_rate": 4.992136939879856e-05, "loss": 1.8447, "step": 40 }, { "epoch": 0.03, "grad_norm": 2.7375714778900146, "learning_rate": 4.9877175931330346e-05, "loss": 1.8212, "step": 50 }, { "epoch": 0.04, "grad_norm": 2.15061092376709, "learning_rate": 4.982319711683221e-05, "loss": 1.793, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.0427424907684326, "learning_rate": 4.975945418767529e-05, "loss": 1.756, "step": 70 }, { "epoch": 0.05, "grad_norm": 2.107785224914551, "learning_rate": 4.968597221690986e-05, "loss": 1.7285, "step": 80 }, { "epoch": 0.06, "grad_norm": 2.100552558898926, "learning_rate": 4.96027801084029e-05, "loss": 1.7297, "step": 90 }, { "epoch": 0.06, "grad_norm": 2.2227377891540527, "learning_rate": 4.950991058546893e-05, "loss": 1.7602, "step": 100 }, { "epoch": 0.07, "grad_norm": 1.535144567489624, "learning_rate": 4.940740017799833e-05, "loss": 1.7433, "step": 110 }, { "epoch": 0.08, "grad_norm": 1.6522979736328125, "learning_rate": 4.929528920808854e-05, "loss": 1.7363, "step": 120 }, { "epoch": 0.08, "grad_norm": 2.8091869354248047, "learning_rate": 4.917362177418342e-05, "loss": 1.6872, "step": 130 }, { "epoch": 0.09, "grad_norm": 2.1017510890960693, "learning_rate": 4.904244573372733e-05, "loss": 1.7084, "step": 140 }, { "epoch": 0.09, "grad_norm": 1.6424258947372437, "learning_rate": 4.8901812684340564e-05, "loss": 1.6997, "step": 150 }, { "epoch": 0.1, "grad_norm": 1.4547488689422607, "learning_rate": 4.8751777943523634e-05, "loss": 1.6747, "step": 160 }, { "epoch": 0.11, "grad_norm": 1.6251146793365479, "learning_rate": 4.8592400526898314e-05, "loss": 1.6836, "step": 170 }, { "epoch": 0.11, "grad_norm": 2.098386526107788, "learning_rate": 4.842374312499405e-05, "loss": 1.6552, "step": 180 }, { "epoch": 0.12, "grad_norm": 2.2387640476226807, "learning_rate": 4.824587207858888e-05, "loss": 1.6489, "step": 190 }, { "epoch": 0.13, "grad_norm": 1.7299611568450928, "learning_rate": 4.805885735261454e-05, "loss": 1.6576, "step": 200 }, { "epoch": 0.13, "grad_norm": 1.5701665878295898, "learning_rate": 4.786277250863599e-05, "loss": 1.6533, "step": 210 }, { "epoch": 0.14, "grad_norm": 2.417296886444092, "learning_rate": 4.765769467591625e-05, "loss": 1.6356, "step": 220 }, { "epoch": 0.15, "grad_norm": 1.2636029720306396, "learning_rate": 4.744370452107789e-05, "loss": 1.6389, "step": 230 }, { "epoch": 0.15, "grad_norm": 1.576324224472046, "learning_rate": 4.722088621637309e-05, "loss": 1.6546, "step": 240 }, { "epoch": 0.16, "grad_norm": 1.9720542430877686, "learning_rate": 4.698932740657479e-05, "loss": 1.6354, "step": 250 }, { "epoch": 0.16, "grad_norm": 1.5250279903411865, "learning_rate": 4.6749119174501975e-05, "loss": 1.6342, "step": 260 }, { "epoch": 0.17, "grad_norm": 2.4737966060638428, "learning_rate": 4.6500356005192514e-05, "loss": 1.6407, "step": 270 }, { "epoch": 0.18, "grad_norm": 1.2792372703552246, "learning_rate": 4.6243135748737864e-05, "loss": 1.6339, "step": 280 }, { "epoch": 0.18, "grad_norm": 1.5593037605285645, "learning_rate": 4.597755958179406e-05, "loss": 1.6095, "step": 290 }, { "epoch": 0.19, "grad_norm": 1.3141404390335083, "learning_rate": 4.570373196778427e-05, "loss": 1.6036, "step": 300 }, { "epoch": 0.2, "grad_norm": 1.2617065906524658, "learning_rate": 4.5421760615808474e-05, "loss": 1.6244, "step": 310 }, { "epoch": 0.2, "grad_norm": 1.64117431640625, "learning_rate": 4.513175643827647e-05, "loss": 1.6449, "step": 320 }, { "epoch": 0.21, "grad_norm": 1.7132749557495117, "learning_rate": 4.4833833507280884e-05, "loss": 1.5948, "step": 330 }, { "epoch": 0.21, "grad_norm": 2.1323654651641846, "learning_rate": 4.4528109009727336e-05, "loss": 1.627, "step": 340 }, { "epoch": 0.22, "grad_norm": 2.253115653991699, "learning_rate": 4.42147032012394e-05, "loss": 1.6151, "step": 350 }, { "epoch": 0.23, "grad_norm": 1.6143097877502441, "learning_rate": 4.389373935885646e-05, "loss": 1.5838, "step": 360 }, { "epoch": 0.23, "grad_norm": 1.3353707790374756, "learning_rate": 4.356534373254316e-05, "loss": 1.5935, "step": 370 }, { "epoch": 0.24, "grad_norm": 1.283742904663086, "learning_rate": 4.322964549552943e-05, "loss": 1.6015, "step": 380 }, { "epoch": 0.25, "grad_norm": 1.437249779701233, "learning_rate": 4.288677669350066e-05, "loss": 1.577, "step": 390 }, { "epoch": 0.25, "grad_norm": 1.5190638303756714, "learning_rate": 4.2536872192658036e-05, "loss": 1.5843, "step": 400 }, { "epoch": 0.26, "grad_norm": 2.1320886611938477, "learning_rate": 4.218006962666934e-05, "loss": 1.6145, "step": 410 }, { "epoch": 0.27, "grad_norm": 1.0696591138839722, "learning_rate": 4.181650934253132e-05, "loss": 1.5601, "step": 420 }, { "epoch": 0.27, "grad_norm": 1.3149545192718506, "learning_rate": 4.144633434536467e-05, "loss": 1.5664, "step": 430 }, { "epoch": 0.28, "grad_norm": 1.3661577701568604, "learning_rate": 4.1069690242163484e-05, "loss": 1.6002, "step": 440 }, { "epoch": 0.28, "grad_norm": 1.6984481811523438, "learning_rate": 4.06867251845213e-05, "loss": 1.576, "step": 450 }, { "epoch": 0.29, "grad_norm": 1.2728784084320068, "learning_rate": 4.0297589810356165e-05, "loss": 1.5448, "step": 460 }, { "epoch": 0.3, "grad_norm": 1.4147616624832153, "learning_rate": 3.9902437184657784e-05, "loss": 1.5595, "step": 470 }, { "epoch": 0.3, "grad_norm": 1.2289011478424072, "learning_rate": 3.9501422739279956e-05, "loss": 1.5628, "step": 480 }, { "epoch": 0.31, "grad_norm": 1.5690233707427979, "learning_rate": 3.909470421180201e-05, "loss": 1.5731, "step": 490 }, { "epoch": 0.32, "grad_norm": 1.4935098886489868, "learning_rate": 3.8682441583483314e-05, "loss": 1.545, "step": 500 }, { "epoch": 0.32, "grad_norm": 1.2939772605895996, "learning_rate": 3.8264797016335205e-05, "loss": 1.5793, "step": 510 }, { "epoch": 0.33, "grad_norm": 1.2150651216506958, "learning_rate": 3.7841934789335164e-05, "loss": 1.5378, "step": 520 }, { "epoch": 0.33, "grad_norm": 1.2153139114379883, "learning_rate": 3.741402123380828e-05, "loss": 1.5345, "step": 530 }, { "epoch": 0.34, "grad_norm": 1.290591835975647, "learning_rate": 3.6981224668001424e-05, "loss": 1.5517, "step": 540 }, { "epoch": 0.35, "grad_norm": 1.1924967765808105, "learning_rate": 3.654371533087586e-05, "loss": 1.5472, "step": 550 }, { "epoch": 0.35, "grad_norm": 1.6345056295394897, "learning_rate": 3.610166531514436e-05, "loss": 1.5564, "step": 560 }, { "epoch": 0.36, "grad_norm": 2.185119867324829, "learning_rate": 3.565524849957921e-05, "loss": 1.5574, "step": 570 }, { "epoch": 0.37, "grad_norm": 1.3646321296691895, "learning_rate": 3.520464048061758e-05, "loss": 1.5584, "step": 580 }, { "epoch": 0.37, "grad_norm": 1.2333228588104248, "learning_rate": 3.47500185032913e-05, "loss": 1.518, "step": 590 }, { "epoch": 0.38, "grad_norm": 1.3945318460464478, "learning_rate": 3.4291561391508185e-05, "loss": 1.5339, "step": 600 }, { "epoch": 0.38, "grad_norm": 1.304306149482727, "learning_rate": 3.3829449477712324e-05, "loss": 1.5339, "step": 610 }, { "epoch": 0.39, "grad_norm": 1.6393932104110718, "learning_rate": 3.336386453195088e-05, "loss": 1.5399, "step": 620 }, { "epoch": 0.4, "grad_norm": 1.2000635862350464, "learning_rate": 3.2894989690375626e-05, "loss": 1.5233, "step": 630 }, { "epoch": 0.4, "grad_norm": 1.1479601860046387, "learning_rate": 3.2423009383206876e-05, "loss": 1.538, "step": 640 }, { "epoch": 0.41, "grad_norm": 1.1483389139175415, "learning_rate": 3.194810926218861e-05, "loss": 1.528, "step": 650 }, { "epoch": 0.42, "grad_norm": 1.2403253316879272, "learning_rate": 3.147047612756302e-05, "loss": 1.5307, "step": 660 }, { "epoch": 0.42, "grad_norm": 1.3997712135314941, "learning_rate": 3.099029785459328e-05, "loss": 1.4915, "step": 670 }, { "epoch": 0.43, "grad_norm": 1.2010352611541748, "learning_rate": 3.0507763319663517e-05, "loss": 1.5268, "step": 680 }, { "epoch": 0.44, "grad_norm": 1.0670932531356812, "learning_rate": 3.002306232598497e-05, "loss": 1.5273, "step": 690 }, { "epoch": 0.44, "grad_norm": 1.2283655405044556, "learning_rate": 2.9536385528937567e-05, "loss": 1.5273, "step": 700 }, { "epoch": 0.45, "grad_norm": 1.1306476593017578, "learning_rate": 2.9047924361076345e-05, "loss": 1.5072, "step": 710 }, { "epoch": 0.45, "grad_norm": 1.1699943542480469, "learning_rate": 2.8557870956832132e-05, "loss": 1.4856, "step": 720 }, { "epoch": 0.46, "grad_norm": 1.2550854682922363, "learning_rate": 2.8066418076936167e-05, "loss": 1.4983, "step": 730 }, { "epoch": 0.47, "grad_norm": 1.0610970258712769, "learning_rate": 2.7573759032598366e-05, "loss": 1.5518, "step": 740 }, { "epoch": 0.47, "grad_norm": 1.1754754781723022, "learning_rate": 2.7080087609469062e-05, "loss": 1.4998, "step": 750 }, { "epoch": 0.48, "grad_norm": 1.1955766677856445, "learning_rate": 2.6585597991414114e-05, "loss": 1.5109, "step": 760 }, { "epoch": 0.49, "grad_norm": 1.0891656875610352, "learning_rate": 2.6090484684133404e-05, "loss": 1.5007, "step": 770 }, { "epoch": 0.49, "grad_norm": 1.0880335569381714, "learning_rate": 2.5594942438652688e-05, "loss": 1.5049, "step": 780 }, { "epoch": 0.5, "grad_norm": 1.345954418182373, "learning_rate": 2.509916617471903e-05, "loss": 1.5154, "step": 790 }, { "epoch": 0.5, "grad_norm": 1.1668224334716797, "learning_rate": 2.46033509041298e-05, "loss": 1.4883, "step": 800 }, { "epoch": 0.51, "grad_norm": 1.055127501487732, "learning_rate": 2.410769165402549e-05, "loss": 1.5053, "step": 810 }, { "epoch": 0.52, "grad_norm": 1.0528500080108643, "learning_rate": 2.3612383390176503e-05, "loss": 1.4871, "step": 820 }, { "epoch": 0.52, "grad_norm": 1.328258991241455, "learning_rate": 2.3117620940294048e-05, "loss": 1.5037, "step": 830 }, { "epoch": 0.53, "grad_norm": 1.0326772928237915, "learning_rate": 2.2623598917395438e-05, "loss": 1.4525, "step": 840 }, { "epoch": 0.54, "grad_norm": 3.057058811187744, "learning_rate": 2.213051164325366e-05, "loss": 1.4898, "step": 850 }, { "epoch": 0.54, "grad_norm": 1.1190940141677856, "learning_rate": 2.1638553071961708e-05, "loss": 1.488, "step": 860 }, { "epoch": 0.55, "grad_norm": 1.1501041650772095, "learning_rate": 2.1147916713641367e-05, "loss": 1.4711, "step": 870 }, { "epoch": 0.56, "grad_norm": 1.090022325515747, "learning_rate": 2.0658795558326743e-05, "loss": 1.488, "step": 880 }, { "epoch": 0.56, "grad_norm": 1.0642565488815308, "learning_rate": 2.017138200005236e-05, "loss": 1.4791, "step": 890 }, { "epoch": 0.57, "grad_norm": 1.3562296628952026, "learning_rate": 1.9685867761175584e-05, "loss": 1.4956, "step": 900 }, { "epoch": 0.57, "grad_norm": 1.2069261074066162, "learning_rate": 1.9202443816963425e-05, "loss": 1.4918, "step": 910 }, { "epoch": 0.58, "grad_norm": 1.3227437734603882, "learning_rate": 1.872130032047302e-05, "loss": 1.4577, "step": 920 }, { "epoch": 0.59, "grad_norm": 1.0784181356430054, "learning_rate": 1.824262652775568e-05, "loss": 1.4888, "step": 930 }, { "epoch": 0.59, "grad_norm": 1.000135898590088, "learning_rate": 1.7766610723413684e-05, "loss": 1.4673, "step": 940 }, { "epoch": 0.6, "grad_norm": 1.136026382446289, "learning_rate": 1.7293440146539196e-05, "loss": 1.4779, "step": 950 }, { "epoch": 0.61, "grad_norm": 1.123252272605896, "learning_rate": 1.682330091706446e-05, "loss": 1.4583, "step": 960 }, { "epoch": 0.61, "grad_norm": 1.0559343099594116, "learning_rate": 1.6356377962552238e-05, "loss": 1.4471, "step": 970 }, { "epoch": 0.62, "grad_norm": 1.0266658067703247, "learning_rate": 1.589285494545514e-05, "loss": 1.4632, "step": 980 }, { "epoch": 0.62, "grad_norm": 1.1371444463729858, "learning_rate": 1.5432914190872757e-05, "loss": 1.4732, "step": 990 }, { "epoch": 0.63, "grad_norm": 1.1203784942626953, "learning_rate": 1.4976736614834664e-05, "loss": 1.452, "step": 1000 }, { "epoch": 0.64, "grad_norm": 1.0037944316864014, "learning_rate": 1.4524501653137787e-05, "loss": 1.461, "step": 1010 }, { "epoch": 0.64, "grad_norm": 1.1353282928466797, "learning_rate": 1.4076387190766017e-05, "loss": 1.4538, "step": 1020 }, { "epoch": 0.65, "grad_norm": 1.1203887462615967, "learning_rate": 1.363256949191972e-05, "loss": 1.4681, "step": 1030 }, { "epoch": 0.66, "grad_norm": 1.0686651468276978, "learning_rate": 1.3193223130682936e-05, "loss": 1.4548, "step": 1040 }, { "epoch": 0.66, "grad_norm": 1.0339988470077515, "learning_rate": 1.2758520922355226e-05, "loss": 1.4535, "step": 1050 }, { "epoch": 0.67, "grad_norm": 1.4555269479751587, "learning_rate": 1.2328633855475429e-05, "loss": 1.4621, "step": 1060 }, { "epoch": 0.68, "grad_norm": 1.0318940877914429, "learning_rate": 1.1903731024563966e-05, "loss": 1.4621, "step": 1070 }, { "epoch": 0.68, "grad_norm": 1.084612488746643, "learning_rate": 1.148397956361007e-05, "loss": 1.4636, "step": 1080 }, { "epoch": 0.69, "grad_norm": 1.0705621242523193, "learning_rate": 1.106954458033026e-05, "loss": 1.4495, "step": 1090 }, { "epoch": 0.69, "grad_norm": 1.050857424736023, "learning_rate": 1.0660589091223855e-05, "loss": 1.4395, "step": 1100 }, { "epoch": 0.7, "grad_norm": 1.0744839906692505, "learning_rate": 1.025727395745095e-05, "loss": 1.4583, "step": 1110 }, { "epoch": 0.71, "grad_norm": 1.0446105003356934, "learning_rate": 9.859757821558337e-06, "loss": 1.4606, "step": 1120 }, { "epoch": 0.71, "grad_norm": 1.1479051113128662, "learning_rate": 9.468197045077976e-06, "loss": 1.454, "step": 1130 }, { "epoch": 0.72, "grad_norm": 0.985953152179718, "learning_rate": 9.082745647022797e-06, "loss": 1.4654, "step": 1140 }, { "epoch": 0.73, "grad_norm": 1.1085201501846313, "learning_rate": 8.703555243303835e-06, "loss": 1.4526, "step": 1150 }, { "epoch": 0.73, "grad_norm": 1.2304482460021973, "learning_rate": 8.330774987092712e-06, "loss": 1.448, "step": 1160 }, { "epoch": 0.74, "grad_norm": 1.0740071535110474, "learning_rate": 7.96455151015272e-06, "loss": 1.4606, "step": 1170 }, { "epoch": 0.74, "grad_norm": 1.0380760431289673, "learning_rate": 7.605028865161809e-06, "loss": 1.4661, "step": 1180 }, { "epoch": 0.75, "grad_norm": 1.1115810871124268, "learning_rate": 7.25234846904993e-06, "loss": 1.4567, "step": 1190 }, { "epoch": 0.76, "grad_norm": 0.9248858094215393, "learning_rate": 6.906649047373246e-06, "loss": 1.4372, "step": 1200 }, { "epoch": 0.76, "grad_norm": 1.0288389921188354, "learning_rate": 6.568066579746901e-06, "loss": 1.4542, "step": 1210 }, { "epoch": 0.77, "grad_norm": 1.0125763416290283, "learning_rate": 6.2367342463579475e-06, "loss": 1.4426, "step": 1220 }, { "epoch": 0.78, "grad_norm": 0.9536031484603882, "learning_rate": 5.912782375579412e-06, "loss": 1.4292, "step": 1230 }, { "epoch": 0.78, "grad_norm": 0.993061363697052, "learning_rate": 5.596338392706077e-06, "loss": 1.432, "step": 1240 }, { "epoch": 0.79, "grad_norm": 0.9642956852912903, "learning_rate": 5.2875267698322325e-06, "loss": 1.4427, "step": 1250 }, { "epoch": 0.8, "grad_norm": 0.9925894737243652, "learning_rate": 4.986468976890993e-06, "loss": 1.4199, "step": 1260 }, { "epoch": 0.8, "grad_norm": 1.0030889511108398, "learning_rate": 4.693283433874565e-06, "loss": 1.4253, "step": 1270 }, { "epoch": 0.81, "grad_norm": 0.986602783203125, "learning_rate": 4.408085464254183e-06, "loss": 1.4382, "step": 1280 }, { "epoch": 0.81, "grad_norm": 0.9463419318199158, "learning_rate": 4.130987249617993e-06, "loss": 1.439, "step": 1290 }, { "epoch": 0.82, "grad_norm": 0.9418216347694397, "learning_rate": 3.8620977855448935e-06, "loss": 1.4322, "step": 1300 }, { "epoch": 0.83, "grad_norm": 1.067226529121399, "learning_rate": 3.601522838731461e-06, "loss": 1.4305, "step": 1310 }, { "epoch": 0.83, "grad_norm": 0.9662885665893555, "learning_rate": 3.3493649053890326e-06, "loss": 1.4188, "step": 1320 }, { "epoch": 0.84, "grad_norm": 1.1397868394851685, "learning_rate": 3.1057231709272077e-06, "loss": 1.4426, "step": 1330 }, { "epoch": 0.85, "grad_norm": 1.0030759572982788, "learning_rate": 2.8706934709395892e-06, "loss": 1.4185, "step": 1340 }, { "epoch": 0.85, "grad_norm": 0.9549908638000488, "learning_rate": 2.6443682535072177e-06, "loss": 1.4276, "step": 1350 }, { "epoch": 0.86, "grad_norm": 0.9839365482330322, "learning_rate": 2.4268365428344736e-06, "loss": 1.4174, "step": 1360 }, { "epoch": 0.86, "grad_norm": 0.954189121723175, "learning_rate": 2.21818390423168e-06, "loss": 1.441, "step": 1370 }, { "epoch": 0.87, "grad_norm": 0.9914742708206177, "learning_rate": 2.0184924104583613e-06, "loss": 1.4322, "step": 1380 }, { "epoch": 0.88, "grad_norm": 0.9965653419494629, "learning_rate": 1.8278406094401623e-06, "loss": 1.411, "step": 1390 }, { "epoch": 0.88, "grad_norm": 1.0744175910949707, "learning_rate": 1.6463034933723337e-06, "loss": 1.4368, "step": 1400 }, { "epoch": 0.89, "grad_norm": 0.9871243238449097, "learning_rate": 1.4739524692218314e-06, "loss": 1.396, "step": 1410 }, { "epoch": 0.9, "grad_norm": 0.9976981282234192, "learning_rate": 1.3108553306396265e-06, "loss": 1.439, "step": 1420 }, { "epoch": 0.9, "grad_norm": 0.9817109704017639, "learning_rate": 1.1570762312943295e-06, "loss": 1.4113, "step": 1430 }, { "epoch": 0.91, "grad_norm": 0.9741029143333435, "learning_rate": 1.0126756596375686e-06, "loss": 1.4438, "step": 1440 }, { "epoch": 0.91, "grad_norm": 1.0171328783035278, "learning_rate": 8.777104151110826e-07, "loss": 1.4365, "step": 1450 }, { "epoch": 0.92, "grad_norm": 0.980021595954895, "learning_rate": 7.522335858048707e-07, "loss": 1.4355, "step": 1460 }, { "epoch": 0.93, "grad_norm": 0.9966154098510742, "learning_rate": 6.362945275751736e-07, "loss": 1.431, "step": 1470 }, { "epoch": 0.93, "grad_norm": 0.9687898755073547, "learning_rate": 5.299388446305343e-07, "loss": 1.4057, "step": 1480 }, { "epoch": 0.94, "grad_norm": 0.9906119704246521, "learning_rate": 4.3320837159353813e-07, "loss": 1.421, "step": 1490 }, { "epoch": 0.95, "grad_norm": 1.0227527618408203, "learning_rate": 3.4614115704533767e-07, "loss": 1.4319, "step": 1500 }, { "epoch": 0.95, "grad_norm": 1.0115277767181396, "learning_rate": 2.687714485593462e-07, "loss": 1.4295, "step": 1510 }, { "epoch": 0.96, "grad_norm": 0.993654727935791, "learning_rate": 2.011296792301165e-07, "loss": 1.4294, "step": 1520 }, { "epoch": 0.97, "grad_norm": 0.8775748014450073, "learning_rate": 1.4324245570256633e-07, "loss": 1.4562, "step": 1530 }, { "epoch": 0.97, "grad_norm": 0.9754842519760132, "learning_rate": 9.513254770636137e-08, "loss": 1.4447, "step": 1540 }, { "epoch": 0.98, "grad_norm": 0.9996697902679443, "learning_rate": 5.681887909952388e-08, "loss": 1.4229, "step": 1550 }, { "epoch": 0.98, "grad_norm": 0.9914098381996155, "learning_rate": 2.831652042480093e-08, "loss": 1.4458, "step": 1560 }, { "epoch": 0.99, "grad_norm": 0.9639108777046204, "learning_rate": 9.636682981720158e-09, "loss": 1.4267, "step": 1570 }, { "epoch": 1.0, "grad_norm": 0.9515108466148376, "learning_rate": 7.867144166728846e-10, "loss": 1.4373, "step": 1580 }, { "epoch": 1.0, "step": 1584, "total_flos": 1.1098698583858217e+18, "train_loss": 1.5383612829627413, "train_runtime": 4681.1872, "train_samples_per_second": 21.666, "train_steps_per_second": 0.338 } ], "logging_steps": 10, "max_steps": 1584, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.1098698583858217e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }